From c098bbb00cd1986cbb58ed1712643f80ed00fcc3 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:48:28 -0800 Subject: [PATCH 01/23] IB/hfi1: Build TID RDMA WRITE request This patch adds the functions to build TID RDMA WRITE request. The work request opcode, packet opcode, and packet formats for TID RDMA WRITE protocol are also defined in this patch. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.h | 2 + drivers/infiniband/hw/hfi1/tid_rdma.c | 38 ++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 3 ++ include/rdma/ib_hdrs.h | 5 +++ include/rdma/tid_rdma_defs.h | 56 +++++++++++++++++++++++++++ 5 files changed, 104 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index ce25a27aa4a1..f74e2509e8b9 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -64,12 +64,14 @@ extern const struct rvt_operation_params hfi1_post_parms[]; * HFI1_S_AHG_CLEAR - have send engine clear ahg state * HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource + * HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response * HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1 */ #define HFI1_S_AHG_VALID 0x80000000 #define HFI1_S_AHG_CLEAR 0x40000000 #define HFI1_S_WAIT_PIO_DRAIN 0x20000000 #define HFI1_S_WAIT_TID_SPACE 0x10000000 +#define HFI1_S_WAIT_TID_RESP 0x08000000 #define HFI1_S_MIN_BIT_MASK 0x01000000 /* diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 0ee79403acaf..089e301d9bcd 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2975,3 +2975,41 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) exit: rcu_read_unlock(); } + +/* TID RDMA WRITE functions */ + +u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_params *remote; + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + /* + * Set the number of flow to be used based on negotiated + * parameters. + */ + req->n_flows = remote->max_write; + req->state = TID_REQUEST_ACTIVE; + + KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1); + KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey); + ohdr->u.tid_rdma.w_req.reth.vaddr = + cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len)); + ohdr->u.tid_rdma.w_req.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len); + ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 &= ~RVT_QPN_MASK; + *bth1 |= remote->qp; + qp->s_state = TID_OP(WRITE_REQ); + qp->s_flags |= HFI1_S_WAIT_TID_RESP; + *bth2 |= IB_BTH_REQ_ACK; + *len = 0; + + rcu_read_unlock(); + return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index a53598ce45b2..baba539b2b80 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -233,4 +233,7 @@ static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp, setup_tid_rdma_wqe(qp, wqe); } +u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len); #endif /* HFI1_TID_RDMA_H */ diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h index 58a0a0f99e7f..9a90bd031e8c 100644 --- a/include/rdma/ib_hdrs.h +++ b/include/rdma/ib_hdrs.h @@ -123,6 +123,11 @@ union ib_ehdrs { union { struct tid_rdma_read_req r_req; struct tid_rdma_read_resp r_rsp; + struct tid_rdma_write_req w_req; + struct tid_rdma_write_resp w_rsp; + struct tid_rdma_write_data w_data; + struct tid_rdma_resync resync; + struct tid_rdma_ack ack; } tid_rdma; } __packed; diff --git a/include/rdma/tid_rdma_defs.h b/include/rdma/tid_rdma_defs.h index 1c431ea32b52..08fe47c7ad2c 100644 --- a/include/rdma/tid_rdma_defs.h +++ b/include/rdma/tid_rdma_defs.h @@ -27,16 +27,71 @@ struct tid_rdma_read_resp { __be32 verbs_qp; }; +struct tid_rdma_write_req { + __le32 kdeth0; + __le32 kdeth1; + struct ib_reth reth; + __be32 reserved[2]; + __be32 verbs_qp; +}; + +struct tid_rdma_write_resp { + __le32 kdeth0; + __le32 kdeth1; + __be32 aeth; + __be32 reserved[3]; + __be32 tid_flow_psn; + __be32 tid_flow_qp; + __be32 verbs_qp; +}; + +struct tid_rdma_write_data { + __le32 kdeth0; + __le32 kdeth1; + __be32 reserved[6]; + __be32 verbs_qp; +}; + +struct tid_rdma_resync { + __le32 kdeth0; + __le32 kdeth1; + __be32 reserved[6]; + __be32 verbs_qp; +}; + +struct tid_rdma_ack { + __le32 kdeth0; + __le32 kdeth1; + __be32 aeth; + __be32 reserved[2]; + __be32 tid_flow_psn; + __be32 verbs_psn; + __be32 tid_flow_qp; + __be32 verbs_qp; +}; + /* * TID RDMA Opcodes */ #define IB_OPCODE_TID_RDMA 0xe0 enum { + IB_OPCODE_WRITE_REQ = 0x0, + IB_OPCODE_WRITE_RESP = 0x1, + IB_OPCODE_WRITE_DATA = 0x2, + IB_OPCODE_WRITE_DATA_LAST = 0x3, IB_OPCODE_READ_REQ = 0x4, IB_OPCODE_READ_RESP = 0x5, + IB_OPCODE_RESYNC = 0x6, + IB_OPCODE_ACK = 0x7, + IB_OPCODE(TID_RDMA, WRITE_REQ), + IB_OPCODE(TID_RDMA, WRITE_RESP), + IB_OPCODE(TID_RDMA, WRITE_DATA), + IB_OPCODE(TID_RDMA, WRITE_DATA_LAST), IB_OPCODE(TID_RDMA, READ_REQ), IB_OPCODE(TID_RDMA, READ_RESP), + IB_OPCODE(TID_RDMA, RESYNC), + IB_OPCODE(TID_RDMA, ACK), }; #define TID_OP(x) IB_OPCODE_TID_RDMA_##x @@ -47,6 +102,7 @@ enum { * low level drivers. Two of those are used but renamed * to be more descriptive. */ +#define IB_WR_TID_RDMA_WRITE IB_WR_RESERVED1 #define IB_WR_TID_RDMA_READ IB_WR_RESERVED2 #endif /* TID_RDMA_DEFS_H */ From f5a4a95f4dd8a09d28936c2e1e357e4c8dcca6c1 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:48:38 -0800 Subject: [PATCH 02/23] IB/hfi1: Allow for extra entries in QP's s_ack_queue The TID RDMA WRITE protocol differs from normal IB RDMA WRITE in that TID RDMA WRITE requests do require responses, not just ACKs. Therefore, TID RDMA WRITE requests need to be treated as RDMA READ requests from the point of view of the QPs' s_ack_queue. In other words, the QPs' need to allow for TID RDMA WRITE requests to be stored in their s_ack_queue. However, because the user does not know anything about the TID RDMA capability and/or protocols, these extra entries in the queue cannot be advertized to the user. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.h | 11 +++++++++++ drivers/infiniband/hw/hfi1/verbs.c | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index baba539b2b80..9b952351f072 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -28,6 +28,17 @@ */ #define HFI1_S_TID_WAIT_INTERLCK BIT(5) +/* + * Unlike regular IB RDMA VERBS, which do not require an entry + * in the s_ack_queue, TID RDMA WRITE requests do because they + * generate responses. + * Therefore, the s_ack_queue needs to be extended by a certain + * amount. The key point is that the queue needs to be extended + * without letting the "user" know so they user doesn't end up + * using these extra entries. + */ +#define HFI1_TID_RDMA_WRITE_CNT 8 + struct tid_rdma_params { struct rcu_head rcu_head; u32 qp; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 88676ca79fda..7b87b77582bd 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1888,7 +1888,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; dd->verbs_dev.rdi.dparms.reserved_operations = 1; - dd->verbs_dev.rdi.dparms.extra_rdma_atomic = 1; + dd->verbs_dev.rdi.dparms.extra_rdma_atomic = HFI1_TID_RDMA_WRITE_CNT; /* post send table */ dd->verbs_dev.rdi.post_parms = hfi1_post_parms; From 4f9264d156dc6c154a8a6cfae780730bad45c6f8 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:48:48 -0800 Subject: [PATCH 03/23] IB/hfi1: Add an s_acked_ack_queue pointer The s_ack_queue is managed by two pointers into the ring: r_head_ack_queue and s_tail_ack_queue. r_head_ack_queue is the index of where the next received request is going to be placed and s_tail_ack_queue is the entry of the request currently being processed. This works perfectly fine for normal Verbs as the requests are processed one at a time and the s_tail_ack_queue is not moved until the request that it points to is fully completed. In this fashion, s_tail_ack_queue constantly chases r_head_ack_queue and the two pointers can easily be used to determine "queue full" and "queue empty" conditions. The detection of these two conditions are imported in determining when an old entry can safely be overwritten with a new received request and the resources associated with the old request be safely released. When pipelined TID RDMA WRITE is introduced into this mix, things look very different. r_head_ack_queue is still the point at which a newly received request will be inserted, s_tail_ack_queue is still the currently processed request. However, with pipelined TID RDMA WRITE requests, s_tail_ack_queue moves to the next request once all TID RDMA WRITE responses for that request have been sent. The rest of the protocol for a particular request is managed by other pointers specific to TID RDMA - r_tid_tail and r_tid_ack - which point to the entries for which the next TID RDMA DATA packets are going to arrive and the request for which the next TID RDMA ACK packets are to be generated, respectively. What this means is that entries in the ring, which are "behind" s_tail_ack_queue (entries which s_tail_ack_queue has gone past) are no longer considered complete. This is where the problem is - a newly received request could potentially overwrite a still active TID RDMA WRITE request. The reason why the TID RDMA pointers trail s_tail_ack_queue is that the normal Verbs send engine uses s_tail_ack_queue as the pointer for the next response. Since TID RDMA WRITE responses are processed by the normal Verbs send engine, s_tail_ack_queue had to be moved to the next entry once all TID RDMA WRITE response packets were sent to get the desired pipelining between requests. Doing otherwise would mean that the normal Verbs send engine would not be able to send the TID RDMA WRITE responses for the next TID RDMA request until the current one is fully completed. This patch introduces the s_acked_ack_queue index to point to the next request to complete on the responder side. For requests other than TID RDMA WRITE, s_acked_ack_queue should always be kept in sync with s_tail_ack_queue. For TID RDMA WRITE request, it may fall behind s_tail_ack_queue. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 33 ++++++++++++++++++++++---- drivers/infiniband/hw/hfi1/rc.h | 1 + drivers/infiniband/hw/hfi1/tid_rdma.c | 2 ++ drivers/infiniband/hw/hfi1/trace_tid.h | 10 ++++++-- drivers/infiniband/sw/rdmavt/qp.c | 1 + include/rdma/rdmavt_qp.h | 1 + 6 files changed, 41 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 6c9ef572fc69..9dc8e524510e 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -120,6 +120,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct hfi1_qp_priv *priv = qp->priv; bool last_pkt; u32 delta; + u8 next = qp->s_tail_ack_queue; trace_hfi1_rsp_make_rc_ack(qp, 0); lockdep_assert_held(&qp->s_lock); @@ -149,9 +150,17 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, * response has been sent instead of only being * constructed. */ - if (++qp->s_tail_ack_queue > - rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) - qp->s_tail_ack_queue = 0; + if (++next > rvt_size_atomic(&dev->rdi)) + next = 0; + /* + * Only advance the s_acked_ack_queue pointer if there + * have been no TID RDMA requests. + */ + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode != TID_OP(WRITE_REQ) && + qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = next; + qp->s_tail_ack_queue = next; /* FALLTHROUGH */ case OP(SEND_ONLY): case OP(ACKNOWLEDGE): @@ -172,6 +181,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, */ len = e->rdma_sge.sge_length; if (len && !e->rdma_sge.mr) { + if (qp->s_acked_ack_queue == + qp->s_tail_ack_queue) + qp->s_acked_ack_queue = + qp->r_head_ack_queue; qp->s_tail_ack_queue = qp->r_head_ack_queue; goto bail; } @@ -202,6 +215,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, */ len = e->rdma_sge.sge_length; if (len && !e->rdma_sge.mr) { + if (qp->s_acked_ack_queue == + qp->s_tail_ack_queue) + qp->s_acked_ack_queue = + qp->r_head_ack_queue; qp->s_tail_ack_queue = qp->r_head_ack_queue; goto bail; } @@ -2235,6 +2252,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, e->psn = psn; if (old_req) goto unlock_done; + if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = prev; qp->s_tail_ack_queue = prev; break; } @@ -2248,6 +2267,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, */ if (!e || e->opcode != (u8)opcode || old_req) goto unlock_done; + if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) + qp->s_acked_ack_queue = prev; qp->s_tail_ack_queue = prev; break; } @@ -2274,6 +2295,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, * Resend the RDMA read or atomic op which * ACKs this duplicate request. */ + if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) + qp->s_acked_ack_queue = mra; qp->s_tail_ack_queue = mra; break; } @@ -2646,7 +2669,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); - if (unlikely(next == qp->s_tail_ack_queue)) { + if (unlikely(next == qp->s_acked_ack_queue)) { if (!qp->s_ack_queue[next].sent) goto nack_inv_unlck; update_ack_queue(qp, next); @@ -2723,7 +2746,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); - if (unlikely(next == qp->s_tail_ack_queue)) { + if (unlikely(next == qp->s_acked_ack_queue)) { if (!qp->s_ack_queue[next].sent) goto nack_inv_unlck; update_ack_queue(qp, next); diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h index 4329eadcb3df..8e0935b9bf2a 100644 --- a/drivers/infiniband/hw/hfi1/rc.h +++ b/drivers/infiniband/hw/hfi1/rc.h @@ -18,6 +18,7 @@ static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n) if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; qp->s_tail_ack_queue = next; + qp->s_acked_ack_queue = next; qp->s_ack_state = OP(ACKNOWLEDGE); } diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 089e301d9bcd..c320a99afb35 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2044,6 +2044,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, goto unlock; } /* Re-process old requests.*/ + if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = prev; qp->s_tail_ack_queue = prev; /* * Since the qp->s_tail_ack_queue is modified, the diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index b71638c22d4b..51f5b0e8da71 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -40,7 +40,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent); #define RSP_INFO_PRN "[%s] qpn 0x%x state 0x%x s_state 0x%x psn 0x%x " \ "r_psn 0x%x r_state 0x%x r_flags 0x%x " \ "r_head_ack_queue %u s_tail_ack_queue %u " \ - "s_ack_state 0x%x " \ + "s_acked_ack_queue %u s_ack_state 0x%x " \ "s_nak_state 0x%x s_flags 0x%x ps_flags 0x%x " \ "iow_flags 0x%lx" @@ -62,7 +62,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent); "s_next_psn 0x%x" #define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \ - "s_tail_ack_queue %u " \ + "s_acked_ack_queue %u s_tail_ack_queue %u " \ "r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \ " diff %d" @@ -671,6 +671,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */ __field(u8, r_flags) __field(u8, r_head_ack_queue) __field(u8, s_tail_ack_queue) + __field(u8, s_acked_ack_queue) __field(u8, s_ack_state) __field(u8, s_nak_state) __field(u8, r_nak_state) @@ -691,6 +692,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */ __entry->r_flags = qp->r_flags; __entry->r_head_ack_queue = qp->r_head_ack_queue; __entry->s_tail_ack_queue = qp->s_tail_ack_queue; + __entry->s_acked_ack_queue = qp->s_acked_ack_queue; __entry->s_ack_state = qp->s_ack_state; __entry->s_nak_state = qp->s_nak_state; __entry->s_flags = qp->s_flags; @@ -709,6 +711,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */ __entry->r_flags, __entry->r_head_ack_queue, __entry->s_tail_ack_queue, + __entry->s_acked_ack_queue, __entry->s_ack_state, __entry->s_nak_state, __entry->s_flags, @@ -1007,6 +1010,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */ __field(u32, qpn) __field(u32, s_flags) __field(u8, state) + __field(u8, s_acked_ack_queue) __field(u8, s_tail_ack_queue) __field(u8, r_head_ack_queue) __field(u32, opcode) @@ -1019,6 +1023,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */ __entry->qpn = qp->ibqp.qp_num; __entry->s_flags = qp->s_flags; __entry->state = qp->state; + __entry->s_acked_ack_queue = qp->s_acked_ack_queue; __entry->s_tail_ack_queue = qp->s_tail_ack_queue; __entry->r_head_ack_queue = qp->r_head_ack_queue; __entry->opcode = opcode; @@ -1032,6 +1037,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */ __entry->qpn, __entry->s_flags, __entry->state, + __entry->s_acked_ack_queue, __entry->s_tail_ack_queue, __entry->r_head_ack_queue, __entry->opcode, diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 2769ebdf89fb..14ec2577bcaa 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -854,6 +854,7 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, qp->s_mig_state = IB_MIG_MIGRATED; qp->r_head_ack_queue = 0; qp->s_tail_ack_queue = 0; + qp->s_acked_ack_queue = 0; qp->s_num_rd_atomic = 0; if (qp->r_rq.wq) { qp->r_rq.wq->head = 0; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index d8d88d023092..4ee612ab6cb4 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -375,6 +375,7 @@ struct rvt_qp { u8 s_rnr_retry; /* requester RNR retry counter */ u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + u8 s_acked_ack_queue; /* index into s_ack_queue[] */ struct rvt_sge_state s_ack_rdma_sge; struct timer_list s_timer; From 07b923701e38f93b4725e64318e6483f890c1c1d Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:48:59 -0800 Subject: [PATCH 04/23] IB/hfi1: Add functions to receive TID RDMA WRITE request This patch adds the functions to receive TID RDMA WRITE request. The request will be stored in the QP's s_ack_queue. This patch also adds code to handle duplicate TID RDMA WRITE request and a function to allocate TID resources for data receiving on the responder side. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/init.c | 1 + drivers/infiniband/hw/hfi1/rc.c | 3 + drivers/infiniband/hw/hfi1/tid_rdma.c | 570 +++++++++++++++++++++++++- drivers/infiniband/hw/hfi1/tid_rdma.h | 16 + drivers/infiniband/hw/hfi1/verbs.h | 12 + 5 files changed, 601 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index d13304f7340d..7841a0ad7cb6 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1512,6 +1512,7 @@ static int __init hfi1_mod_init(void) goto bail_dev; } + hfi1_compute_tid_rdma_flow_wt(); /* * These must be called before the driver is registered with * the PCI subsystem. diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 9dc8e524510e..fcb733ea8dfb 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -2411,6 +2411,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) void *data = packet->payload; u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; struct hfi1_ibport *ibp = rcd_to_iport(rcd); struct ib_other_headers *ohdr = packet->ohdr; u32 opcode = packet->opcode; @@ -2716,6 +2717,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) qp->r_state = opcode; qp->r_nak_state = 0; qp->r_head_ack_queue = next; + qpriv->r_tid_alloc = qp->r_head_ack_queue; /* Schedule the send engine. */ qp->s_flags |= RVT_S_RESP_PENDING; @@ -2789,6 +2791,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) qp->r_state = opcode; qp->r_nak_state = 0; qp->r_head_ack_queue = next; + qpriv->r_tid_alloc = qp->r_head_ack_queue; /* Schedule the send engine. */ qp->s_flags |= RVT_S_RESP_PENDING; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index c320a99afb35..516dca9f497e 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -109,12 +109,15 @@ static u32 mask_generation(u32 a) * C - Capcode */ +static u32 tid_rdma_flow_wt; + static void tid_rdma_trigger_resume(struct work_struct *work); static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req); static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, gfp_t gfp); static void hfi1_init_trdma_req(struct rvt_qp *qp, struct tid_rdma_request *req); +static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx); static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { @@ -313,6 +316,11 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->flow_state.index = RXE_NUM_TID_FLOWS; qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; qpriv->flow_state.generation = KERN_GENERATION_RESERVED; + qpriv->rnr_nak_state = TID_RNR_NAK_INIT; + qpriv->r_tid_head = HFI1_QP_WQE_INVALID; + qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; + qpriv->r_tid_ack = HFI1_QP_WQE_INVALID; + qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; INIT_LIST_HEAD(&qpriv->tid_wait); if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { @@ -1959,6 +1967,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_qp_priv *qpriv = qp->priv; struct rvt_ack_entry *e; struct tid_rdma_request *req; unsigned long flags; @@ -1982,7 +1992,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, spin_lock_irqsave(&qp->s_lock, flags); e = find_prev_entry(qp, psn, &prev, NULL, &old_req); - if (!e || e->opcode != TID_OP(READ_REQ)) + if (!e || (e->opcode != TID_OP(READ_REQ) && + e->opcode != TID_OP(WRITE_REQ))) goto unlock; req = ack_to_tid_req(e); @@ -2042,6 +2053,114 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, */ if (old_req) goto unlock; + } else { + struct flow_state *fstate; + bool schedule = false; + u8 i; + + if (req->state == TID_REQUEST_RESEND) { + req->state = TID_REQUEST_RESEND_ACTIVE; + } else if (req->state == TID_REQUEST_INIT_RESEND) { + req->state = TID_REQUEST_INIT; + schedule = true; + } + + /* + * True if the request is already scheduled (between + * qp->s_tail_ack_queue and qp->r_head_ack_queue). + * Also, don't change requests, which are at the SYNC + * point and haven't generated any responses yet. + * There is nothing to retransmit for them yet. + */ + if (old_req || req->state == TID_REQUEST_INIT || + (req->state == TID_REQUEST_SYNC && !req->cur_seg)) { + for (i = prev + 1; ; i++) { + if (i > rvt_size_atomic(&dev->rdi)) + i = 0; + if (i == qp->r_head_ack_queue) + break; + e = &qp->s_ack_queue[i]; + req = ack_to_tid_req(e); + if (e->opcode == TID_OP(WRITE_REQ) && + req->state == TID_REQUEST_INIT) + req->state = TID_REQUEST_INIT_RESEND; + } + /* + * If the state of the request has been changed, + * the first leg needs to get scheduled in order to + * pick up the change. Otherwise, normal response + * processing should take care of it. + */ + if (!schedule) + goto unlock; + } + + /* + * If there is no more allocated segment, just schedule the qp + * without changing any state. + */ + if (req->clear_tail == req->setup_head) + goto schedule; + /* + * If this request has sent responses for segments, which have + * not received data yet (flow_idx != clear_tail), the flow_idx + * pointer needs to be adjusted so the same responses can be + * re-sent. + */ + if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) { + fstate = &req->flows[req->clear_tail].flow_state; + qpriv->pending_tid_w_segs -= + CIRC_CNT(req->flow_idx, req->clear_tail, + MAX_FLOWS); + req->flow_idx = + CIRC_ADD(req->clear_tail, + delta_psn(psn, fstate->resp_ib_psn), + MAX_FLOWS); + qpriv->pending_tid_w_segs += + delta_psn(psn, fstate->resp_ib_psn); + /* + * When flow_idx == setup_head, we've gotten a duplicate + * request for a segment, which has not been allocated + * yet. In that case, don't adjust this request. + * However, we still want to go through the loop below + * to adjust all subsequent requests. + */ + if (CIRC_CNT(req->setup_head, req->flow_idx, + MAX_FLOWS)) { + req->cur_seg = delta_psn(psn, e->psn); + req->state = TID_REQUEST_RESEND_ACTIVE; + } + } + + for (i = prev + 1; ; i++) { + /* + * Look at everything up to and including + * s_tail_ack_queue + */ + if (i > rvt_size_atomic(&dev->rdi)) + i = 0; + if (i == qp->r_head_ack_queue) + break; + e = &qp->s_ack_queue[i]; + req = ack_to_tid_req(e); + trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, + e->lpsn, req); + if (e->opcode != TID_OP(WRITE_REQ) || + req->cur_seg == req->comp_seg || + req->state == TID_REQUEST_INIT || + req->state == TID_REQUEST_INIT_RESEND) { + if (req->state == TID_REQUEST_INIT) + req->state = TID_REQUEST_INIT_RESEND; + continue; + } + qpriv->pending_tid_w_segs -= + CIRC_CNT(req->flow_idx, + req->clear_tail, + MAX_FLOWS); + req->flow_idx = req->clear_tail; + req->state = TID_REQUEST_RESEND; + req->cur_seg = req->comp_seg; + } } /* Re-process old requests.*/ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) @@ -2054,6 +2173,18 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, * wrong memory region. */ qp->s_ack_state = OP(ACKNOWLEDGE); +schedule: + /* + * It's possible to receive a retry psn that is earlier than an RNRNAK + * psn. In this case, the rnrnak state should be cleared. + */ + if (qpriv->rnr_nak_state) { + qp->s_nak_state = 0; + qpriv->rnr_nak_state = TID_RNR_NAK_INIT; + qp->r_psn = e->lpsn + 1; + hfi1_tid_write_alloc_resources(qp, true); + } + qp->r_state = e->opcode; qp->r_nak_state = 0; qp->s_flags |= RVT_S_RESP_PENDING; @@ -2164,6 +2295,14 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) qp->r_head_ack_queue = next; + /* + * For all requests other than TID WRITE which are added to the ack + * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to + * do this because of interlocks between these and TID WRITE + * requests. The same change has also been made in hfi1_rc_rcv(). + */ + qpriv->r_tid_alloc = qp->r_head_ack_queue; + /* Schedule the send tasklet. */ qp->s_flags |= RVT_S_RESP_PENDING; hfi1_schedule_send(qp); @@ -3015,3 +3154,432 @@ u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, rcu_read_unlock(); return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32); } + +void hfi1_compute_tid_rdma_flow_wt(void) +{ + /* + * Heuristic for computing the RNR timeout when waiting on the flow + * queue. Rather than a computationaly expensive exact estimate of when + * a flow will be available, we assume that if a QP is at position N in + * the flow queue it has to wait approximately (N + 1) * (number of + * segments between two sync points), assuming PMTU of 4K. The rationale + * for this is that flows are released and recycled at each sync point. + */ + tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) / + TID_RDMA_MAX_SEGMENT_SIZE; +} + +static u32 position_in_queue(struct hfi1_qp_priv *qpriv, + struct tid_queue *queue) +{ + return qpriv->tid_enqueue - queue->dequeue; +} + +/* + * @qp: points to rvt_qp context. + * @to_seg: desired RNR timeout in segments. + * Return: index of the next highest timeout in the ib_hfi1_rnr_table[] + */ +static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + u64 timeout; + u32 bytes_per_us; + u8 i; + + bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8; + timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us; + /* + * Find the next highest value in the RNR table to the required + * timeout. This gives the responder some padding. + */ + for (i = 1; i <= IB_AETH_CREDIT_MASK; i++) + if (rvt_rnr_tbl_to_usec(i) >= timeout) + return i; + return 0; +} + +/** + * Central place for resource allocation at TID write responder, + * is called from write_req and write_data interrupt handlers as + * well as the send thread when a queued QP is scheduled for + * resource allocation. + * + * Iterates over (a) segments of a request and then (b) queued requests + * themselves to allocate resources for up to local->max_write + * segments across multiple requests. Stop allocating when we + * hit a sync point, resume allocating after data packets at + * sync point have been received. + * + * Resource allocation and sending of responses is decoupled. The + * request/segment which are being allocated and sent are as follows. + * Resources are allocated for: + * [request: qpriv->r_tid_alloc, segment: req->alloc_seg] + * The send thread sends: + * [request: qp->s_tail_ack_queue, segment:req->cur_seg] + */ +static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) +{ + struct tid_rdma_request *req; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_ctxtdata *rcd = qpriv->rcd; + struct tid_rdma_params *local = &qpriv->tid_rdma.local; + struct rvt_ack_entry *e; + u32 npkts, to_seg; + bool last; + int ret = 0; + + lockdep_assert_held(&qp->s_lock); + + while (1) { + /* + * Don't allocate more segments if a RNR NAK has already been + * scheduled to avoid messing up qp->r_psn: the RNR NAK will + * be sent only when all allocated segments have been sent. + * However, if more segments are allocated before that, TID RDMA + * WRITE RESP packets will be sent out for these new segments + * before the RNR NAK packet. When the requester receives the + * RNR NAK packet, it will restart with qp->s_last_psn + 1, + * which does not match qp->r_psn and will be dropped. + * Consequently, the requester will exhaust its retries and + * put the qp into error state. + */ + if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND) + break; + + /* No requests left to process */ + if (qpriv->r_tid_alloc == qpriv->r_tid_head) { + /* If all data has been received, clear the flow */ + if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS && + !qpriv->alloc_w_segs) + hfi1_kern_clear_hw_flow(rcd, qp); + break; + } + + e = &qp->s_ack_queue[qpriv->r_tid_alloc]; + if (e->opcode != TID_OP(WRITE_REQ)) + goto next_req; + req = ack_to_tid_req(e); + /* Finished allocating for all segments of this request */ + if (req->alloc_seg >= req->total_segs) + goto next_req; + + /* Can allocate only a maximum of local->max_write for a QP */ + if (qpriv->alloc_w_segs >= local->max_write) + break; + + /* Don't allocate at a sync point with data packets pending */ + if (qpriv->sync_pt && qpriv->alloc_w_segs) + break; + + /* All data received at the sync point, continue */ + if (qpriv->sync_pt && !qpriv->alloc_w_segs) { + hfi1_kern_clear_hw_flow(rcd, qp); + qpriv->sync_pt = false; + if (qpriv->s_flags & HFI1_R_TID_SW_PSN) + qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; + } + + /* Allocate flow if we don't have one */ + if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) { + ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp); + if (ret) { + to_seg = tid_rdma_flow_wt * + position_in_queue(qpriv, + &rcd->flow_queue); + break; + } + } + + npkts = rvt_div_round_up_mtu(qp, req->seg_len); + + /* + * We are at a sync point if we run out of KDETH PSN space. + * Last PSN of every generation is reserved for RESYNC. + */ + if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) { + qpriv->sync_pt = true; + break; + } + + /* + * If overtaking req->acked_tail, send an RNR NAK. Because the + * QP is not queued in this case, and the issue can only be + * caused due a delay in scheduling the second leg which we + * cannot estimate, we use a rather arbitrary RNR timeout of + * (MAX_FLOWS / 2) segments + */ + if (!CIRC_SPACE(req->setup_head, req->acked_tail, + MAX_FLOWS)) { + ret = -EAGAIN; + to_seg = MAX_FLOWS >> 1; + qpriv->s_flags |= RVT_S_ACK_PENDING; + break; + } + + /* Try to allocate rcv array / TID entries */ + ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last); + if (ret == -EAGAIN) + to_seg = position_in_queue(qpriv, &rcd->rarr_queue); + if (ret) + break; + + qpriv->alloc_w_segs++; + req->alloc_seg++; + continue; +next_req: + /* Begin processing the next request */ + if (++qpriv->r_tid_alloc > + rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + qpriv->r_tid_alloc = 0; + } + + /* + * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation + * has failed (b) we are called from the rcv handler interrupt context + * (c) an RNR NAK has not already been scheduled + */ + if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state) + goto send_rnr_nak; + + return; + +send_rnr_nak: + lockdep_assert_held(&qp->r_lock); + + /* Set r_nak_state to prevent unrelated events from generating NAK's */ + qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK; + + /* Pull back r_psn to the segment being RNR NAK'd */ + qp->r_psn = e->psn + req->alloc_seg; + qp->r_ack_psn = qp->r_psn; + /* + * Pull back r_head_ack_queue to the ack entry following the request + * being RNR NAK'd. This allows resources to be allocated to the request + * if the queued QP is scheduled. + */ + qp->r_head_ack_queue = qpriv->r_tid_alloc + 1; + if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + qp->r_head_ack_queue = 0; + qpriv->r_tid_head = qp->r_head_ack_queue; + /* + * These send side fields are used in make_rc_ack(). They are set in + * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock + * for consistency + */ + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + /* + * Clear the ACK PENDING flag to prevent unwanted ACK because we + * have modified qp->s_ack_psn here. + */ + qp->s_flags &= ~(RVT_S_ACK_PENDING); + + /* + * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK + * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be + * used for this because qp->s_lock is dropped before calling + * hfi1_send_rc_ack() leading to inconsistency between the receive + * interrupt handlers and the send thread in make_rc_ack() + */ + qpriv->rnr_nak_state = TID_RNR_NAK_SEND; + + /* + * Schedule RNR NAK to be sent. RNR NAK's are scheduled from the receive + * interrupt handlers but will be sent from the send engine behind any + * previous responses that may have been scheduled + */ + rc_defered_ack(rcd, qp); +} + +void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) +{ + /* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side)*/ + + /* + * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST + * (see hfi1_rc_rcv()) + * - Don't allow 0-length requests. + * 2. Put TID RDMA WRITE REQ into the response queueu (s_ack_queue) + * - Setup struct tid_rdma_req with request info + * - Prepare struct tid_rdma_flow array? + * 3. Set the qp->s_ack_state as state diagram in design doc. + * 4. Set RVT_S_RESP_PENDING in s_flags. + * 5. Kick the send engine (hfi1_schedule_send()) + */ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_ack_entry *e; + unsigned long flags; + struct ib_reth *reth; + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req; + u32 bth0, psn, len, rkey, num_segs; + bool is_fecn; + u8 next; + u64 vaddr; + int diff; + + bth0 = be32_to_cpu(ohdr->bth[0]); + if (hfi1_ruc_check_hdr(ibp, packet)) + return; + + is_fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) + rvt_comm_est(qp); + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto nack_inv; + + reth = &ohdr->u.tid_rdma.w_req.reth; + vaddr = be64_to_cpu(reth->vaddr); + len = be32_to_cpu(reth->length); + + num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len); + diff = delta_psn(psn, qp->r_psn); + if (unlikely(diff)) { + if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff)) + return; + goto send_ack; + } + + /* + * The resent request which was previously RNR NAK'd is inserted at the + * location of the original request, which is one entry behind + * r_head_ack_queue + */ + if (qpriv->rnr_nak_state) + qp->r_head_ack_queue = qp->r_head_ack_queue ? + qp->r_head_ack_queue - 1 : + rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); + + /* We've verified the request, insert it into the ack queue. */ + next = qp->r_head_ack_queue + 1; + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_acked_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlock; + update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + req = ack_to_tid_req(e); + + /* Bring previously RNR NAK'd request back to life */ + if (qpriv->rnr_nak_state) { + qp->r_nak_state = 0; + qp->s_nak_state = 0; + qpriv->rnr_nak_state = TID_RNR_NAK_INIT; + qp->r_psn = e->lpsn + 1; + req->state = TID_REQUEST_INIT; + goto update_head; + } + + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + + /* The length needs to be in multiples of PAGE_SIZE */ + if (!len || len & ~PAGE_MASK) + goto nack_inv_unlock; + + rkey = be32_to_cpu(reth->rkey); + qp->r_len = len; + + if (e->opcode == TID_OP(WRITE_REQ) && + (req->setup_head != req->clear_tail || + req->clear_tail != req->acked_tail)) + goto nack_inv_unlock; + + if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, + rkey, IB_ACCESS_REMOTE_WRITE))) + goto nack_acc; + + qp->r_psn += num_segs - 1; + + e->opcode = (bth0 >> 24) & 0xff; + e->psn = psn; + e->lpsn = qp->r_psn; + e->sent = 0; + + req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write); + req->state = TID_REQUEST_INIT; + req->cur_seg = 0; + req->comp_seg = 0; + req->ack_seg = 0; + req->alloc_seg = 0; + req->isge = 0; + req->seg_len = qpriv->tid_rdma.local.max_len; + req->total_len = len; + req->total_segs = num_segs; + req->r_flow_psn = e->psn; + req->ss.sge = e->rdma_sge; + req->ss.num_sge = 1; + + req->flow_idx = req->setup_head; + req->clear_tail = req->setup_head; + req->acked_tail = req->setup_head; + + qp->r_state = e->opcode; + qp->r_nak_state = 0; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn++; + + if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) { + qpriv->r_tid_tail = qp->r_head_ack_queue; + } else if (qpriv->r_tid_tail == qpriv->r_tid_head) { + struct tid_rdma_request *ptr; + + e = &qp->s_ack_queue[qpriv->r_tid_tail]; + ptr = ack_to_tid_req(e); + + if (e->opcode != TID_OP(WRITE_REQ) || + ptr->comp_seg == ptr->total_segs) { + if (qpriv->r_tid_tail == qpriv->r_tid_ack) + qpriv->r_tid_ack = qp->r_head_ack_queue; + qpriv->r_tid_tail = qp->r_head_ack_queue; + } + } +update_head: + qp->r_head_ack_queue = next; + qpriv->r_tid_head = qp->r_head_ack_queue; + + hfi1_tid_write_alloc_resources(qp, true); + + /* Schedule the send tasklet. */ + qp->s_flags |= RVT_S_RESP_PENDING; + hfi1_schedule_send(qp); + + spin_unlock_irqrestore(&qp->s_lock, flags); + if (is_fecn) + goto send_ack; + return; + +nack_inv_unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + rc_defered_ack(rcd, qp); + return; +nack_acc: + spin_unlock_irqrestore(&qp->s_lock, flags); + rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +send_ack: + hfi1_send_rc_ack(packet, is_fecn); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 9b952351f072..7780a28db316 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -26,7 +26,9 @@ * * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock */ +/* BIT(4) reserved for RVT_S_ACK_PENDING. */ #define HFI1_S_TID_WAIT_INTERLCK BIT(5) +#define HFI1_R_TID_SW_PSN BIT(19) /* * Unlike regular IB RDMA VERBS, which do not require an entry @@ -89,10 +91,12 @@ struct tid_rdma_request { } e; struct tid_rdma_flow *flows; /* array of tid flows */ + struct rvt_sge_state ss; /* SGE state for TID RDMA requests */ u16 n_flows; /* size of the flow buffer window */ u16 setup_head; /* flow index we are setting up */ u16 clear_tail; /* flow index we are clearing */ u16 flow_idx; /* flow index most recently set up */ + u16 acked_tail; u32 seg_len; u32 total_len; @@ -103,6 +107,7 @@ struct tid_rdma_request { u32 cur_seg; /* index of current segment */ u32 comp_seg; /* index of last completed segment */ u32 ack_seg; /* index of last ack'ed segment */ + u32 alloc_seg; /* index of next segment to be allocated */ u32 isge; /* index of "current" sge */ u32 ack_pending; /* num acks pending for this request */ @@ -174,6 +179,12 @@ struct tid_rdma_flow { u32 tid_entry[TID_RDMA_MAX_PAGES]; }; +enum tid_rnr_nak_state { + TID_RNR_NAK_INIT = 0, + TID_RNR_NAK_SEND, + TID_RNR_NAK_SENT, +}; + bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data); bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data); bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data); @@ -247,4 +258,9 @@ static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp, u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, struct ib_other_headers *ohdr, u32 *bth1, u32 *bth2, u32 *len); + +void hfi1_compute_tid_rdma_flow_wt(void); + +void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 841727a684d5..9ced8a4a7b76 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -172,7 +172,15 @@ struct hfi1_qp_priv { unsigned long tid_timer_timeout_jiffies; /* variables for the TID RDMA SE state machine */ + u8 rnr_nak_state; /* RNR NAK state */ u32 s_flags; + u32 r_tid_head; /* Most recently added TID RDMA request */ + u32 r_tid_tail; /* the last completed TID RDMA request */ + u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */ + u32 r_tid_alloc; /* Request for which we are allocating resources */ + u32 pending_tid_w_segs; /* Num of pending tid write segments */ + u32 alloc_w_segs; /* Number of segments for which write */ + /* resources have been allocated for this QP */ /* For TID RDMA READ */ u32 tid_r_reqs; /* Num of tid reads requested */ @@ -180,8 +188,12 @@ struct hfi1_qp_priv { u32 pending_tid_r_segs; /* Num of pending tid read segments */ u16 pkts_ps; /* packets per segment */ u8 timeout_shift; /* account for number of packets per segment */ + + u8 sync_pt; /* Set when QP reaches sync point */ }; +#define HFI1_QP_WQE_INVALID ((u32)-1) + struct hfi1_swqe_priv { struct tid_rdma_request tid_req; struct rvt_sge_state ss; /* Used for TID RDMA READ Request */ From 38d46d3676ed6ecba284eb49e4b675ca9891801a Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:49:09 -0800 Subject: [PATCH 05/23] IB/hfi1: Add a function to build TID RDMA WRITE response This patch adds the function to build TID RDMA WRITE response. The main role of the TID RDMA WRITE RESP packet is to send TID entries to the requester so that they can be used to encode TID RDMA WRITE DATA packet. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 95 +++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 5 ++ drivers/infiniband/hw/hfi1/verbs.h | 1 + 3 files changed, 101 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 516dca9f497e..78828f9f7592 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -3583,3 +3583,98 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) send_ack: hfi1_send_rc_ack(packet, is_fecn); } + +u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u32 *bth1, + u32 bth2, u32 *len, + struct rvt_sge_state **ss) +{ + struct hfi1_ack_priv *epriv = e->priv; + struct tid_rdma_request *req = &epriv->tid_req; + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_flow *flow = NULL; + u32 resp_len = 0, hdwords = 0; + void *resp_addr = NULL; + struct tid_rdma_params *remote; + + flow = &req->flows[req->flow_idx]; + switch (req->state) { + default: + /* + * Try to allocate resources here in case QP was queued and was + * later scheduled when resources became available + */ + hfi1_tid_write_alloc_resources(qp, false); + + /* We've already sent everything which is ready */ + if (req->cur_seg >= req->alloc_seg) + goto done; + + /* + * Resources can be assigned but responses cannot be sent in + * rnr_nak state, till the resent request is received + */ + if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT) + goto done; + + req->state = TID_REQUEST_ACTIVE; + req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); + break; + + case TID_REQUEST_RESEND_ACTIVE: + case TID_REQUEST_RESEND: + req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); + if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS)) + req->state = TID_REQUEST_ACTIVE; + + break; + } + flow->flow_state.resp_ib_psn = bth2; + resp_addr = (void *)flow->tid_entry; + resp_len = sizeof(*flow->tid_entry) * flow->tidcnt; + req->cur_seg++; + + memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp)); + epriv->ss.sge.vaddr = resp_addr; + epriv->ss.sge.sge_length = resp_len; + epriv->ss.sge.length = epriv->ss.sge.sge_length; + /* + * We can safely zero these out. Since the first SGE covers the + * entire packet, nothing else should even look at the MR. + */ + epriv->ss.sge.mr = NULL; + epriv->ss.sge.m = 0; + epriv->ss.sge.n = 0; + + epriv->ss.sg_list = NULL; + epriv->ss.total_len = epriv->ss.sge.sge_length; + epriv->ss.num_sge = 1; + + *ss = &epriv->ss; + *len = epriv->ss.total_len; + + /* Construct the TID RDMA WRITE RESP packet header */ + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + + KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1); + KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey); + ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp); + ohdr->u.tid_rdma.w_rsp.tid_flow_psn = + cpu_to_be32((flow->flow_state.generation << + HFI1_KDETH_BTH_SEQ_SHIFT) | + (flow->flow_state.spsn & + HFI1_KDETH_BTH_SEQ_MASK)); + ohdr->u.tid_rdma.w_rsp.tid_flow_qp = + cpu_to_be32(qpriv->tid_rdma.local.qp | + ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << + TID_RDMA_DESTQP_FLOW_SHIFT) | + qpriv->rcd->ctxt); + ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 = remote->qp; + rcu_read_unlock(); + hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32); + qpriv->pending_tid_w_segs++; +done: + return hdwords; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 7780a28db316..19f4dd89680f 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -263,4 +263,9 @@ void hfi1_compute_tid_rdma_flow_wt(void); void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet); +u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u32 *bth1, + u32 bth2, u32 *len, + struct rvt_sge_state **ss); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 9ced8a4a7b76..3a501b09621e 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -200,6 +200,7 @@ struct hfi1_swqe_priv { }; struct hfi1_ack_priv { + struct rvt_sge_state ss; /* used for TID WRITE RESP */ struct tid_rdma_request tid_req; }; From 3c759e003a6a4d4b8fd0472f9501e8c45d775c26 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:49:19 -0800 Subject: [PATCH 06/23] IB/hfi1: Add TID resource timer This patch adds the TID resource timer, which is used by the responder to free any TID resources that are allocated for TID RDMA WRITE request and not returned by the requester after a reasonable time. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.c | 1 + drivers/infiniband/hw/hfi1/tid_rdma.c | 92 +++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 3 + drivers/infiniband/hw/hfi1/verbs.h | 1 + 4 files changed, 97 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index acdd9eba189b..31b4b60f4364 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -755,6 +755,7 @@ void quiesce_qp(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; + hfi1_del_tid_reap_timer(qp); iowait_sdma_drain(&priv->s_iowait); qp_pio_drain(qp); flush_tx_list(qp); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 78828f9f7592..ede25ee195ff 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -118,6 +118,9 @@ static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, static void hfi1_init_trdma_req(struct rvt_qp *qp, struct tid_rdma_request *req); static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx); +static void hfi1_tid_timeout(struct timer_list *t); +static void hfi1_add_tid_reap_timer(struct rvt_qp *qp); +static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp); static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { @@ -321,6 +324,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; qpriv->r_tid_ack = HFI1_QP_WQE_INVALID; qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; + timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0); INIT_LIST_HEAD(&qpriv->tid_wait); if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { @@ -3619,6 +3623,7 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, req->state = TID_REQUEST_ACTIVE; req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); + hfi1_add_tid_reap_timer(qp); break; case TID_REQUEST_RESEND_ACTIVE: @@ -3627,6 +3632,7 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS)) req->state = TID_REQUEST_ACTIVE; + hfi1_mod_tid_reap_timer(qp); break; } flow->flow_state.resp_ib_psn = bth2; @@ -3678,3 +3684,89 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, done: return hdwords; } + +static void hfi1_add_tid_reap_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) { + qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; + qpriv->s_tid_timer.expires = jiffies + + qpriv->tid_timer_timeout_jiffies; + add_timer(&qpriv->s_tid_timer); + } +} + +static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; + mod_timer(&qpriv->s_tid_timer, jiffies + + qpriv->tid_timer_timeout_jiffies); +} + +static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + int rval = 0; + + lockdep_assert_held(&qp->s_lock); + if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { + rval = del_timer(&qpriv->s_tid_timer); + qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; + } + return rval; +} + +void hfi1_del_tid_reap_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + del_timer_sync(&qpriv->s_tid_timer); + qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; +} + +static void hfi1_tid_timeout(struct timer_list *t) +{ + struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer); + struct rvt_qp *qp = qpriv->owner; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + unsigned long flags; + u32 i; + + spin_lock_irqsave(&qp->r_lock, flags); + spin_lock(&qp->s_lock); + if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { + dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n", + qp->ibqp.qp_num, __func__, __LINE__); + hfi1_stop_tid_reap_timer(qp); + /* + * Go though the entire ack queue and clear any outstanding + * HW flow and RcvArray resources. + */ + hfi1_kern_clear_hw_flow(qpriv->rcd, qp); + for (i = 0; i < rvt_max_atomic(rdi); i++) { + struct tid_rdma_request *req = + ack_to_tid_req(&qp->s_ack_queue[i]); + + hfi1_kern_exp_rcv_clear_all(req); + } + spin_unlock(&qp->s_lock); + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_FATAL; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR); + goto unlock_r_lock; + } + spin_unlock(&qp->s_lock); +unlock_r_lock: + spin_unlock_irqrestore(&qp->r_lock, flags); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 19f4dd89680f..39137e3c79fe 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -26,6 +26,7 @@ * * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock */ +#define HFI1_R_TID_RSC_TIMER BIT(2) /* BIT(4) reserved for RVT_S_ACK_PENDING. */ #define HFI1_S_TID_WAIT_INTERLCK BIT(5) #define HFI1_R_TID_SW_PSN BIT(19) @@ -268,4 +269,6 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, u32 bth2, u32 *len, struct rvt_sge_state **ss); +void hfi1_del_tid_reap_timer(struct rvt_qp *qp); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 3a501b09621e..68a41f54bc78 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -163,6 +163,7 @@ struct hfi1_qp_priv { u32 tid_enqueue; /* saved when tid waited */ u8 s_sc; /* SC[0..4] for next packet */ struct iowait s_iowait; + struct timer_list s_tid_timer; /* for timing tid wait */ struct list_head tid_wait; /* for queueing tid space */ struct hfi1_opfn_data opfn; struct tid_flow_state flow_state; From 72a0ea99ec13bcb27784c1a48f4e8fda61586c26 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:49:31 -0800 Subject: [PATCH 07/23] IB/hfi1: Add a function to receive TID RDMA WRITE response This patch adds a function to receive TID RDMA WRITE response. The TID entries will be stored for encoding TID RDMA WRITE DATA packet later. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 173 ++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 4 + drivers/infiniband/hw/hfi1/verbs.h | 3 + 3 files changed, 180 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index ede25ee195ff..92b6a3d90ce5 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -319,6 +319,9 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->flow_state.index = RXE_NUM_TID_FLOWS; qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; qpriv->flow_state.generation = KERN_GENERATION_RESERVED; + qpriv->s_tid_cur = HFI1_QP_WQE_INVALID; + qpriv->s_tid_head = HFI1_QP_WQE_INVALID; + qpriv->s_tid_tail = HFI1_QP_WQE_INVALID; qpriv->rnr_nak_state = TID_RNR_NAK_INIT; qpriv->r_tid_head = HFI1_QP_WQE_INVALID; qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; @@ -3770,3 +3773,173 @@ static void hfi1_tid_timeout(struct timer_list *t) unlock_r_lock: spin_unlock_irqrestore(&qp->r_lock, flags); } + +void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) +{ + /* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requestor side */ + + /* + * 1. Find matching SWQE + * 2. Check that TIDENTRY array has enough space for a complete + * segment. If not, put QP in error state. + * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow + * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags. + * 5. Set qp->s_state + * 6. Kick the send engine (hfi1_schedule_send()) + */ + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_ctxtdata *rcd = packet->rcd; + struct rvt_swqe *wqe; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + enum ib_wc_status status; + u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen; + bool is_fecn; + unsigned long flags; + + is_fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth); + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Ignore invalid responses */ + if (cmp_psn(psn, qp->s_next_psn) >= 0) + goto ack_done; + + /* Ignore duplicate responses. */ + if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0)) + goto ack_done; + + if (unlikely(qp->s_acked == qp->s_tail)) + goto ack_done; + + /* + * If we are waiting for a particular packet sequence number + * due to a request being resent, check for it. Otherwise, + * ensure that we haven't missed anything. + */ + if (qp->r_flags & RVT_R_RDMAR_SEQ) { + if (cmp_psn(psn, qp->s_last_psn + 1) != 0) + goto ack_done; + qp->r_flags &= ~RVT_R_RDMAR_SEQ; + } + + wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); + if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)) + goto ack_op_err; + + req = wqe_to_tid_req(wqe); + /* + * If we've lost ACKs and our acked_tail pointer is too far + * behind, don't overwrite segments. Just drop the packet and + * let the reliability protocol take care of it. + */ + if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS)) + goto ack_done; + + /* + * The call to do_rc_ack() should be last in the chain of + * packet checks because it will end up updating the QP state. + * Therefore, anything that would prevent the packet from + * being accepted as a successful response should be prior + * to it. + */ + if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) + goto ack_done; + + flow = &req->flows[req->setup_head]; + flow->pkt = 0; + flow->tid_idx = 0; + flow->tid_offset = 0; + flow->sent = 0; + flow->resync_npkts = 0; + flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp); + flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & + TID_RDMA_DESTQP_FLOW_MASK; + flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn)); + flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; + flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; + flow->flow_state.resp_ib_psn = psn; + flow->length = min_t(u32, req->seg_len, + (wqe->length - (req->comp_seg * req->seg_len))); + + flow->npkts = rvt_div_round_up_mtu(qp, flow->length); + flow->flow_state.lpsn = flow->flow_state.spsn + + flow->npkts - 1; + /* payload length = packet length - (header length + ICRC length) */ + pktlen = packet->tlen - (packet->hlen + 4); + if (pktlen > sizeof(flow->tid_entry)) { + status = IB_WC_LOC_LEN_ERR; + goto ack_err; + } + memcpy(flow->tid_entry, packet->ebuf, pktlen); + flow->tidcnt = pktlen / sizeof(*flow->tid_entry); + + req->comp_seg++; + /* + * Walk the TID_ENTRY list to make sure we have enough space for a + * complete segment. + */ + for (i = 0; i < flow->tidcnt; i++) { + if (!EXP_TID_GET(flow->tid_entry[i], LEN)) { + status = IB_WC_LOC_LEN_ERR; + goto ack_err; + } + tidlen += EXP_TID_GET(flow->tid_entry[i], LEN); + } + if (tidlen * PAGE_SIZE < flow->length) { + status = IB_WC_LOC_LEN_ERR; + goto ack_err; + } + + /* + * If this is the first response for this request, set the initial + * flow index to the current flow. + */ + if (!cmp_psn(psn, wqe->psn)) { + req->r_last_acked = mask_psn(wqe->psn - 1); + /* Set acked flow index to head index */ + req->acked_tail = req->setup_head; + } + + /* advance circular buffer head */ + req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS); + req->state = TID_REQUEST_ACTIVE; + + /* + * If all responses for this TID RDMA WRITE request have been received + * advance the pointer to the next one. + * Since TID RDMA requests could be mixed in with regular IB requests, + * they might not appear sequentially in the queue. Therefore, the + * next request needs to be "found". + */ + if (qpriv->s_tid_cur != qpriv->s_tid_head && + req->comp_seg == req->total_segs) { + for (i = qpriv->s_tid_cur + 1; ; i++) { + if (i == qp->s_size) + i = 0; + wqe = rvt_get_swqe_ptr(qp, i); + if (i == qpriv->s_tid_head) + break; + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) + break; + } + qpriv->s_tid_cur = i; + } + qp->s_flags &= ~HFI1_S_WAIT_TID_RESP; + + goto ack_done; + +ack_op_err: + status = IB_WC_LOC_QP_OP_ERR; +ack_err: + rvt_error_qp(qp, status); +ack_done: + spin_unlock_irqrestore(&qp->s_lock, flags); + if (is_fecn) + hfi1_send_rc_ack(packet, is_fecn); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 39137e3c79fe..6f11fd5ca4c0 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -102,6 +102,7 @@ struct tid_rdma_request { u32 seg_len; u32 total_len; u32 r_flow_psn; /* IB PSN of next segment start */ + u32 r_last_acked; /* IB PSN of last ACK'ed packet */ u32 s_next_psn; /* IB PSN of next segment start for read */ u32 total_segs; /* segments required to complete a request */ @@ -175,6 +176,7 @@ struct tid_rdma_flow { u8 npagesets; u8 npkts; u8 pkt; + u8 resync_npkts; struct kern_tid_node tnode[TID_RDMA_MAX_PAGES]; struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES]; u32 tid_entry[TID_RDMA_MAX_PAGES]; @@ -271,4 +273,6 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, void hfi1_del_tid_reap_timer(struct rvt_qp *qp); +void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 68a41f54bc78..b2096c7c1132 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -175,6 +175,9 @@ struct hfi1_qp_priv { /* variables for the TID RDMA SE state machine */ u8 rnr_nak_state; /* RNR NAK state */ u32 s_flags; + u32 s_tid_cur; + u32 s_tid_head; + u32 s_tid_tail; u32 r_tid_head; /* Most recently added TID RDMA request */ u32 r_tid_tail; /* the last completed TID RDMA request */ u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */ From 539e1908e45b5cdcc72bded272f8adb52ad2c913 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:49:41 -0800 Subject: [PATCH 08/23] IB/hfi1: Add a function to build TID RDMA WRITE DATA packet This patch adds a function to build TID RDMA WRITE DATA packet. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 62 +++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 4 ++ 2 files changed, 66 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 92b6a3d90ce5..243feaddb811 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -3943,3 +3943,65 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) if (is_fecn) hfi1_send_rc_ack(packet, is_fecn); } + +bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len) +{ + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; + struct tid_rdma_params *remote; + struct rvt_qp *qp = req->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + u32 tidentry = flow->tid_entry[flow->tid_idx]; + u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; + struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data; + u32 next_offset, om = KDETH_OM_LARGE; + bool last_pkt; + + if (!tidlen) { + hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR); + rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR); + } + + *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); + flow->sent += *len; + next_offset = flow->tid_offset + *len; + last_pkt = (flow->tid_idx == (flow->tidcnt - 1) && + next_offset >= tidlen) || (flow->sent >= flow->length); + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + KDETH_RESET(wd->kdeth0, KVER, 0x1); + KDETH_SET(wd->kdeth0, SH, !last_pkt); + KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg)); + KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); + KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); + KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE); + KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om); + KDETH_RESET(wd->kdeth1, JKEY, remote->jkey); + wd->verbs_qp = cpu_to_be32(qp->remote_qpn); + rcu_read_unlock(); + + *bth1 = flow->tid_qpn; + *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & + HFI1_KDETH_BTH_SEQ_MASK) | + (flow->flow_state.generation << + HFI1_KDETH_BTH_SEQ_SHIFT)); + if (last_pkt) { + /* PSNs are zero-based, so +1 to count number of packets */ + if (flow->flow_state.lpsn + 1 + + rvt_div_round_up_mtu(qp, req->seg_len) > + MAX_TID_FLOW_PSN) + req->state = TID_REQUEST_SYNC; + *bth2 |= IB_BTH_REQ_ACK; + } + + if (next_offset >= tidlen) { + flow->tid_offset = 0; + flow->tid_idx++; + } else { + flow->tid_offset = next_offset; + } + return last_pkt; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 6f11fd5ca4c0..f28c7ab752b2 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -275,4 +275,8 @@ void hfi1_del_tid_reap_timer(struct rvt_qp *qp); void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet); +bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len); + #endif /* HFI1_TID_RDMA_H */ From d72fe7d5008b5600a11f03a0dcb743fd7acb0085 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:49:51 -0800 Subject: [PATCH 09/23] IB/hfi1: Add a function to receive TID RDMA WRITE DATA packet This patch adds a function to receive TID RDMA WRITE DATA packet, which is in the KDETH PSN space in packet ordering. Due to the use of header suppression, software is generally only notified when the last data packet for a segment is received. This patch also adds code to handle KDETH EFLAGS errors for ingress TID RDMA WRITE DATA packets. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 236 ++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 2 + drivers/infiniband/hw/hfi1/verbs.h | 3 + 3 files changed, 241 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 243feaddb811..166a34c8449d 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2566,13 +2566,32 @@ static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd, u8 opcode) { struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; u32 ipsn; struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_ack_entry *e; + struct tid_rdma_request *req; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + u32 i; if (rcv_type >= RHF_RCV_TYPE_IB) goto done; spin_lock(&qp->s_lock); + + /* + * We've ran out of space in the eager buffer. + * Eagerly received KDETH packets which require space in the + * Eager buffer (packet that have payload) are TID RDMA WRITE + * response packets. In this case, we have to re-transmit the + * TID RDMA WRITE request. + */ + if (rcv_type == RHF_RCV_TYPE_EAGER) { + hfi1_restart_rc(qp, qp->s_last_psn + 1, 1); + hfi1_schedule_send(qp); + goto done_unlock; + } + /* * For TID READ response, error out QP after freeing the tid * resources. @@ -2586,8 +2605,25 @@ static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd, rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); goto done; } + goto done_unlock; } + /* + * Error out the qp for TID RDMA WRITE + */ + hfi1_kern_clear_hw_flow(qpriv->rcd, qp); + for (i = 0; i < rvt_max_atomic(rdi); i++) { + e = &qp->s_ack_queue[i]; + if (e->opcode == TID_OP(WRITE_REQ)) { + req = ack_to_tid_req(e); + hfi1_kern_exp_rcv_clear_all(req); + } + } + spin_unlock(&qp->s_lock); + rvt_rc_error(qp, IB_WC_LOC_LEN_ERR); + goto done; + +done_unlock: spin_unlock(&qp->s_lock); done: return true; @@ -2837,8 +2873,12 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, u8 opcode; u32 qp_num, psn, ibpsn; struct rvt_qp *qp; + struct hfi1_qp_priv *qpriv; unsigned long flags; bool ret = true; + struct rvt_ack_entry *e; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ", packet->rhf); @@ -2897,14 +2937,109 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, ibpsn = mask_psn(ibpsn); ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn, ibpsn); + goto r_unlock; } + /* + * qp->s_tail_ack_queue points to the rvt_ack_entry currently being + * processed. These a completed sequentially so we can be sure that + * the pointer will not change until the entire request has completed. + */ + spin_lock(&qp->s_lock); + qpriv = qp->priv; + e = &qp->s_ack_queue[qpriv->r_tid_tail]; + req = ack_to_tid_req(e); + flow = &req->flows[req->clear_tail]; + + switch (rcv_type) { + case RHF_RCV_TYPE_EXPECTED: + switch (rte) { + case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: + if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) { + u64 reg; + + qpriv->s_flags |= HFI1_R_TID_SW_PSN; + /* + * The only sane way to get the amount of + * progress is to read the HW flow state. + */ + reg = read_uctxt_csr(dd, rcd->ctxt, + RCV_TID_FLOW_TABLE + + (8 * flow->idx)); + flow->flow_state.r_next_psn = mask_psn(reg); + qpriv->r_next_psn_kdeth = + flow->flow_state.r_next_psn; + goto nak_psn; + } else { + /* + * If the received PSN does not match the next + * expected PSN, NAK the packet. + * However, only do that if we know that the a + * NAK has already been sent. Otherwise, this + * mismatch could be due to packets that were + * already in flight. + */ + if (psn != flow->flow_state.r_next_psn) { + psn = flow->flow_state.r_next_psn; + goto nak_psn; + } + + qpriv->s_nak_state = 0; + /* + * If SW PSN verification is successful and this + * is the last packet in the segment, tell the + * caller to process it as a normal packet. + */ + if (psn == full_flow_psn(flow, + flow->flow_state.lpsn)) + ret = false; + qpriv->r_next_psn_kdeth = + ++flow->flow_state.r_next_psn; + } + break; + + case RHF_RTE_EXPECTED_FLOW_GEN_ERR: + goto nak_psn; + + default: + break; + } + break; + + case RHF_RCV_TYPE_ERROR: + switch (rte) { + case RHF_RTE_ERROR_OP_CODE_ERR: + case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR: + case RHF_RTE_ERROR_KHDR_HCRC_ERR: + case RHF_RTE_ERROR_KHDR_KVER_ERR: + case RHF_RTE_ERROR_CONTEXT_ERR: + case RHF_RTE_ERROR_KHDR_TID_ERR: + default: + break; + } + default: + break; + } + +unlock: + spin_unlock(&qp->s_lock); r_unlock: spin_unlock_irqrestore(&qp->r_lock, flags); rcu_unlock: rcu_read_unlock(); drop: return ret; +nak_psn: + ibp->rvp.n_rc_seqnak++; + if (!qpriv->s_nak_state) { + qpriv->s_nak_state = IB_NAK_PSN_ERROR; + /* We are NAK'ing the next expected PSN */ + qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn); + qpriv->s_flags |= RVT_S_ACK_PENDING; + if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID) + qpriv->r_tid_ack = qpriv->r_tid_tail; + } + goto unlock; } /* @@ -4005,3 +4140,104 @@ bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, } return last_pkt; } + +void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) +{ + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ctxtdata *rcd = priv->rcd; + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_ack_entry *e; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + unsigned long flags; + u32 psn, next; + u8 opcode; + + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + + /* + * All error handling should be done by now. If we are here, the packet + * is either good or been accepted by the error handler. + */ + spin_lock_irqsave(&qp->s_lock, flags); + e = &qp->s_ack_queue[priv->r_tid_tail]; + req = ack_to_tid_req(e); + flow = &req->flows[req->clear_tail]; + if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) { + if (cmp_psn(psn, flow->flow_state.r_next_psn)) + goto send_nak; + flow->flow_state.r_next_psn++; + goto exit; + } + flow->flow_state.r_next_psn = mask_psn(psn + 1); + hfi1_kern_exp_rcv_clear(req); + priv->alloc_w_segs--; + rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK; + req->comp_seg++; + priv->s_nak_state = 0; + + /* + * Release the flow if one of the following conditions has been met: + * - The request has reached a sync point AND all outstanding + * segments have been completed, or + * - The entire request is complete and there are no more requests + * (of any kind) in the queue. + */ + if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) + priv->r_tid_ack = priv->r_tid_tail; + + if (opcode == TID_OP(WRITE_DATA_LAST)) { + for (next = priv->r_tid_tail + 1; ; next++) { + if (next > rvt_size_atomic(&dev->rdi)) + next = 0; + if (next == priv->r_tid_head) + break; + e = &qp->s_ack_queue[next]; + if (e->opcode == TID_OP(WRITE_REQ)) + break; + } + priv->r_tid_tail = next; + if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi)) + qp->s_acked_ack_queue = 0; + } + + hfi1_tid_write_alloc_resources(qp, true); + + /* + * If we need to generate more responses, schedule the + * send engine. + */ + if (req->cur_seg < req->total_segs || + qp->s_tail_ack_queue != qp->r_head_ack_queue) { + qp->s_flags |= RVT_S_RESP_PENDING; + hfi1_schedule_send(qp); + } + + priv->pending_tid_w_segs--; + if (priv->s_flags & HFI1_R_TID_RSC_TIMER) { + if (priv->pending_tid_w_segs) + hfi1_mod_tid_reap_timer(req->qp); + else + hfi1_stop_tid_reap_timer(req->qp); + } + +done: + priv->s_flags |= RVT_S_ACK_PENDING; +exit: + priv->r_next_psn_kdeth = flow->flow_state.r_next_psn; + spin_unlock_irqrestore(&qp->s_lock, flags); + return; + +send_nak: + if (!priv->s_nak_state) { + priv->s_nak_state = IB_NAK_PSN_ERROR; + priv->s_nak_psn = flow->flow_state.r_next_psn; + priv->s_flags |= RVT_S_ACK_PENDING; + if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) + priv->r_tid_ack = priv->r_tid_tail; + } + goto done; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index f28c7ab752b2..647a6f0cba31 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -279,4 +279,6 @@ bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, struct ib_other_headers *ohdr, u32 *bth1, u32 *bth2, u32 *len); +void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index b2096c7c1132..eec6e822635b 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -174,6 +174,8 @@ struct hfi1_qp_priv { /* variables for the TID RDMA SE state machine */ u8 rnr_nak_state; /* RNR NAK state */ + u8 s_nak_state; + u32 s_nak_psn; u32 s_flags; u32 s_tid_cur; u32 s_tid_head; @@ -193,6 +195,7 @@ struct hfi1_qp_priv { u16 pkts_ps; /* packets per segment */ u8 timeout_shift; /* account for number of packets per segment */ + u32 r_next_psn_kdeth; u8 sync_pt; /* Set when QP reaches sync point */ }; From 0f75e325aa11552599a18d7558970be16fc15c1a Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:50:03 -0800 Subject: [PATCH 10/23] IB/hfi1: Add a function to build TID RDMA ACK packet This patch adds a function to build TID RDMA ACJ packet, which is also in the KDETH PSN space for packet ordering. This packet is used to acknowledge the receiving of all the TID RDMA WRITE DATA packets before the given KDETH PSN. Similar to RC ACK packets, TID RDMA ACK packets could also be coalesced. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 77 +++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 4 ++ drivers/infiniband/hw/hfi1/verbs.h | 2 + 3 files changed, 83 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 166a34c8449d..d8a7f07b028d 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -4241,3 +4241,80 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) } goto done; } + +static bool hfi1_tid_rdma_is_resync_psn(u32 psn) +{ + return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) == + HFI1_KDETH_BTH_SEQ_MASK); +} + +u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u16 iflow, + u32 *bth1, u32 *bth2) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_flow_state *fs = &qpriv->flow_state; + struct tid_rdma_request *req = ack_to_tid_req(e); + struct tid_rdma_flow *flow = &req->flows[iflow]; + struct tid_rdma_params *remote; + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); + ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 = remote->qp; + rcu_read_unlock(); + + if (qpriv->resync) { + *bth2 = mask_psn((fs->generation << + HFI1_KDETH_BTH_SEQ_SHIFT) - 1); + ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); + } else if (qpriv->s_nak_state) { + *bth2 = mask_psn(qpriv->s_nak_psn); + ohdr->u.tid_rdma.ack.aeth = + cpu_to_be32((qp->r_msn & IB_MSN_MASK) | + (qpriv->s_nak_state << + IB_AETH_CREDIT_SHIFT)); + } else { + *bth2 = full_flow_psn(flow, flow->flow_state.lpsn); + ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); + } + KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); + ohdr->u.tid_rdma.ack.tid_flow_qp = + cpu_to_be32(qpriv->tid_rdma.local.qp | + ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << + TID_RDMA_DESTQP_FLOW_SHIFT) | + qpriv->rcd->ctxt); + + ohdr->u.tid_rdma.ack.tid_flow_psn = 0; + ohdr->u.tid_rdma.ack.verbs_psn = + cpu_to_be32(flow->flow_state.resp_ib_psn); + + if (qpriv->resync) { + /* + * If the PSN before the current expect KDETH PSN is the + * RESYNC PSN, then we never received a good TID RDMA WRITE + * DATA packet after a previous RESYNC. + * In this case, the next expected KDETH PSN stays the same. + */ + if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) { + ohdr->u.tid_rdma.ack.tid_flow_psn = + cpu_to_be32(qpriv->r_next_psn_kdeth_save); + } else { + /* + * Because the KDETH PSNs jump during a RESYNC, it's + * not possible to infer (or compute) the previous value + * of r_next_psn_kdeth in the case of back-to-back + * RESYNC packets. Therefore, we save it. + */ + qpriv->r_next_psn_kdeth_save = + qpriv->r_next_psn_kdeth - 1; + ohdr->u.tid_rdma.ack.tid_flow_psn = + cpu_to_be32(qpriv->r_next_psn_kdeth_save); + qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1); + } + qpriv->resync = false; + } + + return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 647a6f0cba31..89f5af627128 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -281,4 +281,8 @@ bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet); +u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u16 iflow, + u32 *bth1, u32 *bth2); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index eec6e822635b..3a97a39aeba4 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -196,7 +196,9 @@ struct hfi1_qp_priv { u8 timeout_shift; /* account for number of packets per segment */ u32 r_next_psn_kdeth; + u32 r_next_psn_kdeth_save; u8 sync_pt; /* Set when QP reaches sync point */ + u8 resync; }; #define HFI1_QP_WQE_INVALID ((u32)-1) From 9e93e967f7b452e6c9e4a33d0b42ff64fa7293c4 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:50:14 -0800 Subject: [PATCH 11/23] IB/hfi1: Add a function to receive TID RDMA ACK packet This patch adds a function to receive TID RDMA ACK packet, which could be an acknowledge to either a TID RDMA WRITE DATA packet or an TID RDMA RESYNC packet. For an ACK to TID RDMA WRITE DATA packet, the request segments are completed appropriately. For an ACK to a TID RDMA RESYNC packet, any pending segment flow information is updated accordingly. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/opfn.c | 3 + drivers/infiniband/hw/hfi1/qp.h | 2 + drivers/infiniband/hw/hfi1/tid_rdma.c | 212 +++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 3 + drivers/infiniband/hw/hfi1/trace_tid.h | 5 +- drivers/infiniband/hw/hfi1/verbs.h | 4 + 6 files changed, 228 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c index 2ca070690b2f..82e1889ca969 100644 --- a/drivers/infiniband/hw/hfi1/opfn.c +++ b/drivers/infiniband/hw/hfi1/opfn.c @@ -245,6 +245,9 @@ void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask) struct hfi1_qp_priv *priv = qp->priv; unsigned long flags; + if (attr_mask & IB_QP_RETRY_CNT) + priv->s_retry = attr->retry_cnt; + spin_lock_irqsave(&priv->opfn.lock, flags); if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { struct tid_rdma_params *local = &priv->tid_rdma.local; diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index f74e2509e8b9..d531b760ea93 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -65,6 +65,7 @@ extern const struct rvt_operation_params hfi1_post_parms[]; * HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource * HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response + * HFI1_S_WAIT_HALT - halt the first leg send engine * HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1 */ #define HFI1_S_AHG_VALID 0x80000000 @@ -72,6 +73,7 @@ extern const struct rvt_operation_params hfi1_post_parms[]; #define HFI1_S_WAIT_PIO_DRAIN 0x20000000 #define HFI1_S_WAIT_TID_SPACE 0x10000000 #define HFI1_S_WAIT_TID_RESP 0x08000000 +#define HFI1_S_WAIT_HALT 0x04000000 #define HFI1_S_MIN_BIT_MASK 0x01000000 /* diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index d8a7f07b028d..5eb8453a719e 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -319,6 +319,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->flow_state.index = RXE_NUM_TID_FLOWS; qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; qpriv->flow_state.generation = KERN_GENERATION_RESERVED; + qpriv->s_state = TID_OP(WRITE_RESP); qpriv->s_tid_cur = HFI1_QP_WQE_INVALID; qpriv->s_tid_head = HFI1_QP_WQE_INVALID; qpriv->s_tid_tail = HFI1_QP_WQE_INVALID; @@ -327,6 +328,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; qpriv->r_tid_ack = HFI1_QP_WQE_INVALID; qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; + atomic_set(&qpriv->n_tid_requests, 0); timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0); INIT_LIST_HEAD(&qpriv->tid_wait); @@ -4318,3 +4320,213 @@ u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32); } + +void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) +{ + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + struct rvt_swqe *wqe; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + u32 aeth, psn, req_psn, ack_psn, fspsn, resync_psn, ack_kpsn; + bool is_fecn; + unsigned long flags; + u16 fidx; + + is_fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth); + req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn)); + resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn)); + + spin_lock_irqsave(&qp->s_lock, flags); + + /* If we are waiting for an ACK to RESYNC, drop any other packets */ + if ((qp->s_flags & HFI1_S_WAIT_HALT) && + cmp_psn(psn, qpriv->s_resync_psn)) + goto ack_op_err; + + ack_psn = req_psn; + if (hfi1_tid_rdma_is_resync_psn(psn)) + ack_kpsn = resync_psn; + else + ack_kpsn = psn; + if (aeth >> 29) { + ack_psn--; + ack_kpsn--; + } + + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + + if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) + goto ack_op_err; + + req = wqe_to_tid_req(wqe); + flow = &req->flows[req->acked_tail]; + + /* Drop stale ACK/NAK */ + if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0) + goto ack_op_err; + + while (cmp_psn(ack_kpsn, + full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 && + req->ack_seg < req->cur_seg) { + req->ack_seg++; + /* advance acked segment pointer */ + req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS); + req->r_last_acked = flow->flow_state.resp_ib_psn; + if (req->ack_seg == req->total_segs) { + req->state = TID_REQUEST_COMPLETE; + wqe = do_rc_completion(qp, wqe, + to_iport(qp->ibqp.device, + qp->port_num)); + atomic_dec(&qpriv->n_tid_requests); + if (qp->s_acked == qp->s_tail) + break; + if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) + break; + req = wqe_to_tid_req(wqe); + } + flow = &req->flows[req->acked_tail]; + } + + switch (aeth >> 29) { + case 0: /* ACK */ + if (qpriv->s_flags & RVT_S_WAIT_ACK) + qpriv->s_flags &= ~RVT_S_WAIT_ACK; + if (!hfi1_tid_rdma_is_resync_psn(psn)) { + hfi1_schedule_send(qp); + } else { + u32 spsn, fpsn, last_acked, generation; + struct tid_rdma_request *rptr; + + /* Allow new requests (see hfi1_make_tid_rdma_pkt) */ + qp->s_flags &= ~HFI1_S_WAIT_HALT; + /* + * Clear RVT_S_SEND_ONE flag in case that the TID RDMA + * ACK is received after the TID retry timer is fired + * again. In this case, do not send any more TID + * RESYNC request or wait for any more TID ACK packet. + */ + qpriv->s_flags &= ~RVT_S_SEND_ONE; + hfi1_schedule_send(qp); + + if ((qp->s_acked == qpriv->s_tid_tail && + req->ack_seg == req->total_segs) || + qp->s_acked == qp->s_tail) { + qpriv->s_state = TID_OP(WRITE_DATA_LAST); + goto done; + } + + if (req->ack_seg == req->comp_seg) { + qpriv->s_state = TID_OP(WRITE_DATA); + goto done; + } + + /* + * The PSN to start with is the next PSN after the + * RESYNC PSN. + */ + psn = mask_psn(psn + 1); + generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT; + spsn = 0; + + /* + * Update to the correct WQE when we get an ACK(RESYNC) + * in the middle of a request. + */ + if (delta_psn(ack_psn, wqe->lpsn)) + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + req = wqe_to_tid_req(wqe); + flow = &req->flows[req->acked_tail]; + /* + * RESYNC re-numbers the PSN ranges of all remaining + * segments. Also, PSN's start from 0 in the middle of a + * segment and the first segment size is less than the + * default number of packets. flow->resync_npkts is used + * to track the number of packets from the start of the + * real segment to the point of 0 PSN after the RESYNC + * in order to later correctly rewind the SGE. + */ + fpsn = full_flow_psn(flow, flow->flow_state.spsn); + req->r_ack_psn = psn; + flow->resync_npkts += + delta_psn(mask_psn(resync_psn + 1), fpsn); + /* + * Renumber all packet sequence number ranges + * based on the new generation. + */ + last_acked = qp->s_acked; + rptr = req; + while (1) { + /* start from last acked segment */ + for (fidx = rptr->acked_tail; + CIRC_CNT(rptr->setup_head, fidx, + MAX_FLOWS); + fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { + u32 lpsn; + u32 gen; + + flow = &rptr->flows[fidx]; + gen = flow->flow_state.generation; + if (WARN_ON(gen == generation && + flow->flow_state.spsn != + spsn)) + continue; + lpsn = flow->flow_state.lpsn; + lpsn = full_flow_psn(flow, lpsn); + flow->npkts = + delta_psn(lpsn, + mask_psn(resync_psn) + ); + flow->flow_state.generation = + generation; + flow->flow_state.spsn = spsn; + flow->flow_state.lpsn = + flow->flow_state.spsn + + flow->npkts - 1; + flow->pkt = 0; + spsn += flow->npkts; + resync_psn += flow->npkts; + } + if (++last_acked == qpriv->s_tid_cur + 1) + break; + if (last_acked == qp->s_size) + last_acked = 0; + wqe = rvt_get_swqe_ptr(qp, last_acked); + rptr = wqe_to_tid_req(wqe); + } + req->cur_seg = req->ack_seg; + qpriv->s_tid_tail = qp->s_acked; + qpriv->s_state = TID_OP(WRITE_REQ); + } +done: + qpriv->s_retry = qp->s_retry_cnt; + break; + + case 3: /* NAK */ + switch ((aeth >> IB_AETH_CREDIT_SHIFT) & + IB_AETH_CREDIT_MASK) { + case 0: /* PSN sequence error */ + flow = &req->flows[req->acked_tail]; + fspsn = full_flow_psn(flow, flow->flow_state.spsn); + req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + req->cur_seg = req->ack_seg; + qpriv->s_tid_tail = qp->s_acked; + qpriv->s_state = TID_OP(WRITE_REQ); + qpriv->s_retry = qp->s_retry_cnt; + break; + + default: + break; + } + break; + + default: + break; + } + +ack_op_err: + spin_unlock_irqrestore(&qp->s_lock, flags); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 89f5af627128..499036e7a3e8 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -101,6 +101,7 @@ struct tid_rdma_request { u32 seg_len; u32 total_len; + u32 r_ack_psn; /* next expected ack PSN */ u32 r_flow_psn; /* IB PSN of next segment start */ u32 r_last_acked; /* IB PSN of last ACK'ed packet */ u32 s_next_psn; /* IB PSN of next segment start for read */ @@ -285,4 +286,6 @@ u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, struct ib_other_headers *ohdr, u16 iflow, u32 *bth1, u32 *bth2); +void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index 51f5b0e8da71..a45b5257d6c4 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -52,7 +52,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent); #define TID_READ_SENDER_PRN "[%s] qpn 0x%x newreq %u tid_r_reqs %u " \ "tid_r_comp %u pending_tid_r_segs %u " \ "s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \ - "hw_flow_index %u generation 0x%x " \ + "s_state 0x%x hw_flow_index %u generation 0x%x " \ "fpsn 0x%x flow_flags 0x%x" #define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \ @@ -844,6 +844,7 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */ __field(u32, s_flags) __field(u32, ps_flags) __field(unsigned long, iow_flags) + __field(u8, s_state) __field(u32, hw_flow_index) __field(u32, generation) __field(u32, fpsn) @@ -861,6 +862,7 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */ __entry->s_flags = qp->s_flags; __entry->ps_flags = priv->s_flags; __entry->iow_flags = priv->s_iowait.flags; + __entry->s_state = priv->s_state; __entry->hw_flow_index = priv->flow_state.index; __entry->generation = priv->flow_state.generation; __entry->fpsn = priv->flow_state.psn; @@ -877,6 +879,7 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */ __entry->s_flags, __entry->ps_flags, __entry->iow_flags, + __entry->s_state, __entry->hw_flow_index, __entry->generation, __entry->fpsn, diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 3a97a39aeba4..30e3f5af5cf1 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -170,9 +170,12 @@ struct hfi1_qp_priv { struct tid_rdma_qp_params tid_rdma; struct rvt_qp *owner; u8 hdr_type; /* 9B or 16B */ + atomic_t n_tid_requests; /* # of sent TID RDMA requests */ unsigned long tid_timer_timeout_jiffies; /* variables for the TID RDMA SE state machine */ + u8 s_state; + u8 s_retry; u8 rnr_nak_state; /* RNR NAK state */ u8 s_nak_state; u32 s_nak_psn; @@ -197,6 +200,7 @@ struct hfi1_qp_priv { u32 r_next_psn_kdeth; u32 r_next_psn_kdeth_save; + u32 s_resync_psn; u8 sync_pt; /* Set when QP reaches sync point */ u8 resync; }; From 829eaee5d09a7500bdce9ed0bc6ec6861f8ae45b Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:50:24 -0800 Subject: [PATCH 12/23] IB/hfi1: Add TID RDMA retry timer This patch adds the TID RDMA retry timer to make sure that TID RDMA WRITE DATA packets for a segment are received successfully by the responder. This timer is generally armed when the last TID RDMA WRITE DATA packet for a segment is sent out and stopped when all TID RDMA DATA packets are acknowledged. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/opfn.c | 2 + drivers/infiniband/hw/hfi1/tid_rdma.c | 93 +++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 4 ++ drivers/infiniband/hw/hfi1/verbs.h | 2 + 4 files changed, 101 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c index 82e1889ca969..370a5a8eaa71 100644 --- a/drivers/infiniband/hw/hfi1/opfn.c +++ b/drivers/infiniband/hw/hfi1/opfn.c @@ -252,6 +252,8 @@ void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask) if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { struct tid_rdma_params *local = &priv->tid_rdma.local; + if (attr_mask & IB_QP_TIMEOUT) + priv->tid_retry_timeout_jiffies = qp->timeout_jiffies; if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) || qp->pmtu == enum_to_mtu(OPA_MTU_8192)) { tid_rdma_opfn_init(qp, local); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 5eb8453a719e..a4faf7d6c224 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -121,6 +121,9 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx); static void hfi1_tid_timeout(struct timer_list *t); static void hfi1_add_tid_reap_timer(struct rvt_qp *qp); static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp); +static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp); +static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp); +static void hfi1_tid_retry_timeout(struct timer_list *t); static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { @@ -330,6 +333,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; atomic_set(&qpriv->n_tid_requests, 0); timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0); + timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0); INIT_LIST_HEAD(&qpriv->tid_wait); if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { @@ -4396,11 +4400,19 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) if (qpriv->s_flags & RVT_S_WAIT_ACK) qpriv->s_flags &= ~RVT_S_WAIT_ACK; if (!hfi1_tid_rdma_is_resync_psn(psn)) { + /* Check if there is any pending TID ACK */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + req->ack_seg < req->cur_seg) + hfi1_mod_tid_retry_timer(qp); + else + hfi1_stop_tid_retry_timer(qp); hfi1_schedule_send(qp); } else { u32 spsn, fpsn, last_acked, generation; struct tid_rdma_request *rptr; + /* ACK(RESYNC) */ + hfi1_stop_tid_retry_timer(qp); /* Allow new requests (see hfi1_make_tid_rdma_pkt) */ qp->s_flags &= ~HFI1_S_WAIT_HALT; /* @@ -4506,6 +4518,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) break; case 3: /* NAK */ + hfi1_stop_tid_retry_timer(qp); switch ((aeth >> IB_AETH_CREDIT_SHIFT) & IB_AETH_CREDIT_MASK) { case 0: /* PSN sequence error */ @@ -4530,3 +4543,83 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) ack_op_err: spin_unlock_irqrestore(&qp->s_lock, flags); } + +void hfi1_add_tid_retry_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_qp *ibqp = &qp->ibqp; + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + lockdep_assert_held(&qp->s_lock); + if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) { + priv->s_flags |= HFI1_S_TID_RETRY_TIMER; + priv->s_tid_retry_timer.expires = jiffies + + priv->tid_retry_timeout_jiffies + rdi->busy_jiffies; + add_timer(&priv->s_tid_retry_timer); + } +} + +static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_qp *ibqp = &qp->ibqp; + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + lockdep_assert_held(&qp->s_lock); + priv->s_flags |= HFI1_S_TID_RETRY_TIMER; + mod_timer(&priv->s_tid_retry_timer, jiffies + + priv->tid_retry_timeout_jiffies + rdi->busy_jiffies); +} + +static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + int rval = 0; + + lockdep_assert_held(&qp->s_lock); + if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { + rval = del_timer(&priv->s_tid_retry_timer); + priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; + } + return rval; +} + +void hfi1_del_tid_retry_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + del_timer_sync(&priv->s_tid_retry_timer); + priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; +} + +static void hfi1_tid_retry_timeout(struct timer_list *t) +{ + struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer); + struct rvt_qp *qp = priv->owner; + struct rvt_swqe *wqe; + unsigned long flags; + + spin_lock_irqsave(&qp->r_lock, flags); + spin_lock(&qp->s_lock); + if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { + hfi1_stop_tid_retry_timer(qp); + if (!priv->s_retry) { + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } else { + priv->s_flags &= ~RVT_S_WAIT_ACK; + /* Only send one packet (the RESYNC) */ + priv->s_flags |= RVT_S_SEND_ONE; + /* + * No additional request shall be made by this QP until + * the RESYNC has been complete. + */ + qp->s_flags |= HFI1_S_WAIT_HALT; + priv->s_state = TID_OP(RESYNC); + priv->s_retry--; + } + } + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_lock, flags); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 499036e7a3e8..3be5f79ed1fb 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -29,6 +29,7 @@ #define HFI1_R_TID_RSC_TIMER BIT(2) /* BIT(4) reserved for RVT_S_ACK_PENDING. */ #define HFI1_S_TID_WAIT_INTERLCK BIT(5) +#define HFI1_S_TID_RETRY_TIMER BIT(17) #define HFI1_R_TID_SW_PSN BIT(19) /* @@ -288,4 +289,7 @@ u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet); +void hfi1_add_tid_retry_timer(struct rvt_qp *qp); +void hfi1_del_tid_retry_timer(struct rvt_qp *qp); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 30e3f5af5cf1..bfd642e831f7 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -164,6 +164,7 @@ struct hfi1_qp_priv { u8 s_sc; /* SC[0..4] for next packet */ struct iowait s_iowait; struct timer_list s_tid_timer; /* for timing tid wait */ + struct timer_list s_tid_retry_timer; /* for timing tid ack */ struct list_head tid_wait; /* for queueing tid space */ struct hfi1_opfn_data opfn; struct tid_flow_state flow_state; @@ -172,6 +173,7 @@ struct hfi1_qp_priv { u8 hdr_type; /* 9B or 16B */ atomic_t n_tid_requests; /* # of sent TID RDMA requests */ unsigned long tid_timer_timeout_jiffies; + unsigned long tid_retry_timeout_jiffies; /* variables for the TID RDMA SE state machine */ u8 s_state; From 6e391c6a4a8f97d34fa859c906387c05e91adbe9 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:50:36 -0800 Subject: [PATCH 13/23] IB/hfi1: Add a function to build TID RDMA RESYNC packet This patch adds a function to build TID RDMA RESYNC packet, which is sent by the requester to notify the responder that no TID RDMA ACK packet has been received for a given KDETH PSN. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 26 ++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 4 ++++ 2 files changed, 30 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index a4faf7d6c224..e10f6d714dff 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -4623,3 +4623,29 @@ static void hfi1_tid_retry_timeout(struct timer_list *t) spin_unlock(&qp->s_lock); spin_unlock_irqrestore(&qp->r_lock, flags); } + +u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u16 fidx) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_params *remote; + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = &req->flows[fidx]; + u32 generation; + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); + ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 = remote->qp; + rcu_read_unlock(); + + generation = kern_flow_generation_next(flow->flow_state.generation); + *bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1); + qpriv->s_resync_psn = *bth2; + *bth2 |= IB_BTH_REQ_ACK; + KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); + + return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 3be5f79ed1fb..d876b0efeac2 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -292,4 +292,8 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet); void hfi1_add_tid_retry_timer(struct rvt_qp *qp); void hfi1_del_tid_retry_timer(struct rvt_qp *qp); +u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u16 fidx); + #endif /* HFI1_TID_RDMA_H */ From 7cf0ad679de46c61739238c3f4542f14cc7bbc69 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:50:46 -0800 Subject: [PATCH 14/23] IB/hfi1: Add a function to receive TID RDMA RESYNC packet This patch adds a function to receive TID RDMA RESYNC packet on the responder side. The QP's hardware flow will be updated and all allocated software flows will be updated accordingly in order to drop all stale packets. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 103 ++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 2 + 2 files changed, 105 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index e10f6d714dff..1901d5b6bbb9 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -4649,3 +4649,106 @@ u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32); } + +void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) +{ + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_ctxtdata *rcd = qpriv->rcd; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct rvt_ack_entry *e; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + struct tid_flow_state *fs = &qpriv->flow_state; + u32 psn, generation, idx, gen_next; + bool is_fecn; + unsigned long flags; + + is_fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + + generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT; + spin_lock_irqsave(&qp->s_lock, flags); + + gen_next = (fs->generation == KERN_GENERATION_RESERVED) ? + generation : kern_flow_generation_next(fs->generation); + /* + * RESYNC packet contains the "next" generation and can only be + * from the current or previous generations + */ + if (generation != mask_generation(gen_next - 1) && + generation != gen_next) + goto bail; + /* Already processing a resync */ + if (qpriv->resync) + goto bail; + + spin_lock(&rcd->exp_lock); + if (fs->index >= RXE_NUM_TID_FLOWS) { + /* + * If we don't have a flow, save the generation so it can be + * applied when a new flow is allocated + */ + fs->generation = generation; + } else { + /* Reprogram the QP flow with new generation */ + rcd->flows[fs->index].generation = generation; + fs->generation = kern_setup_hw_flow(rcd, fs->index); + } + fs->psn = 0; + /* + * Disable SW PSN checking since a RESYNC is equivalent to a + * sync point and the flow has/will be reprogrammed + */ + qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; + + /* + * Reset all TID flow information with the new generation. + * This is done for all requests and segments after the + * last received segment + */ + for (idx = qpriv->r_tid_tail; ; idx++) { + u16 flow_idx; + + if (idx > rvt_size_atomic(&dev->rdi)) + idx = 0; + e = &qp->s_ack_queue[idx]; + if (e->opcode == TID_OP(WRITE_REQ)) { + req = ack_to_tid_req(e); + + /* start from last unacked segment */ + for (flow_idx = req->clear_tail; + CIRC_CNT(req->setup_head, flow_idx, + MAX_FLOWS); + flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) { + u32 lpsn; + u32 next; + + flow = &req->flows[flow_idx]; + lpsn = full_flow_psn(flow, + flow->flow_state.lpsn); + next = flow->flow_state.r_next_psn; + flow->npkts = delta_psn(lpsn, next - 1); + flow->flow_state.generation = fs->generation; + flow->flow_state.spsn = fs->psn; + flow->flow_state.lpsn = + flow->flow_state.spsn + flow->npkts - 1; + flow->flow_state.r_next_psn = + full_flow_psn(flow, + flow->flow_state.spsn); + fs->psn += flow->npkts; + } + } + if (idx == qp->s_tail_ack_queue) + break; + } + + spin_unlock(&rcd->exp_lock); + qpriv->resync = true; + /* RESYNC request always gets a TID RDMA ACK. */ + qpriv->s_nak_state = 0; + qpriv->s_flags |= RVT_S_ACK_PENDING; +bail: + spin_unlock_irqrestore(&qp->s_lock, flags); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index d876b0efeac2..bdcf18455d9d 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -296,4 +296,6 @@ u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, struct ib_other_headers *ohdr, u32 *bth1, u32 *bth2, u16 fidx); +void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet); + #endif /* HFI1_TID_RDMA_H */ From 6e38fca6b1524e9a9aa0d2a10d99975eef1791c1 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:50:56 -0800 Subject: [PATCH 15/23] IB/hfi1: Resend the TID RDMA WRITE DATA packets This patch adds the logic to resend TID RDMA WRITE DATA packets. The tracking indices will be reset properly so that the correct TID entries will be used. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 63 ++++++++++++++++++++++++--- drivers/infiniband/hw/hfi1/verbs.h | 1 + 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 1901d5b6bbb9..cb6321b0d2c9 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -3059,8 +3059,9 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, { struct tid_rdma_request *req = wqe_to_tid_req(wqe); struct tid_rdma_flow *flow; - int diff; - u32 tididx = 0; + struct hfi1_qp_priv *qpriv = qp->priv; + int diff, delta_pkts; + u32 tididx = 0, i; u16 fidx; if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { @@ -3076,11 +3077,20 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, return; } } else { - return; + fidx = req->acked_tail; + flow = &req->flows[fidx]; + *bth2 = mask_psn(req->r_ack_psn); } + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) + delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn); + else + delta_pkts = delta_psn(*bth2, + full_flow_psn(flow, + flow->flow_state.spsn)); + trace_hfi1_tid_flow_restart_req(qp, fidx, flow); - diff = delta_psn(*bth2, flow->flow_state.ib_spsn); + diff = delta_pkts + flow->resync_npkts; flow->sent = 0; flow->pkt = 0; @@ -3104,6 +3114,18 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, break; } } + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { + rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) + + flow->sent, 0); + /* + * Packet PSN is based on flow_state.spsn + flow->pkt. However, + * during a RESYNC, the generation is incremented and the + * sequence is reset to 0. Since we've adjusted the npkts in the + * flow and the SGE has been sufficiently advanced, we have to + * adjust flow->pkt in order to calculate the correct PSN. + */ + flow->pkt -= flow->resync_npkts; + } if (flow->tid_offset == EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) { @@ -3111,13 +3133,42 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, flow->tid_offset = 0; } flow->tid_idx = tididx; - /* Move flow_idx to correct index */ - req->flow_idx = fidx; + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) + /* Move flow_idx to correct index */ + req->flow_idx = fidx; + else + req->clear_tail = fidx; trace_hfi1_tid_flow_restart_req(qp, fidx, flow); trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req); req->state = TID_REQUEST_ACTIVE; + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { + /* Reset all the flows that we are going to resend */ + fidx = CIRC_NEXT(fidx, MAX_FLOWS); + i = qpriv->s_tid_tail; + do { + for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS); + fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { + req->flows[fidx].sent = 0; + req->flows[fidx].pkt = 0; + req->flows[fidx].tid_idx = 0; + req->flows[fidx].tid_offset = 0; + req->flows[fidx].resync_npkts = 0; + } + if (i == qpriv->s_tid_cur) + break; + do { + i = (++i == qp->s_size ? 0 : i); + wqe = rvt_get_swqe_ptr(qp, i); + } while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE); + req = wqe_to_tid_req(wqe); + req->cur_seg = req->ack_seg; + fidx = req->acked_tail; + /* Pull req->clear_tail back */ + req->clear_tail = fidx; + } while (1); + } } void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index bfd642e831f7..ce40ea9f43c3 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -171,6 +171,7 @@ struct hfi1_qp_priv { struct tid_rdma_qp_params tid_rdma; struct rvt_qp *owner; u8 hdr_type; /* 9B or 16B */ + struct rvt_sge_state tid_ss; /* SGE state pointer for 2nd leg */ atomic_t n_tid_requests; /* # of sent TID RDMA requests */ unsigned long tid_timer_timeout_jiffies; unsigned long tid_retry_timeout_jiffies; From 70dcb2e3dc6aa827d74e09c830ea06c660274880 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:07 -0800 Subject: [PATCH 16/23] IB/hfi1: Add the TID second leg send packet builder To improve performance, the TID RDMA WRITE protocol is designed to own a second leg to send data and ack packets in the KDETH PSN space. This patch adds the packet builder for the requester side, which contains the state machine to build TID RDMA WRITE DATA and TID RDMA RESYNC packet. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 211 ++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 7 + drivers/infiniband/hw/hfi1/verbs.h | 2 + 3 files changed, 220 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index cb6321b0d2c9..44c5c0010888 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -331,6 +331,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; qpriv->r_tid_ack = HFI1_QP_WQE_INVALID; qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; + atomic_set(&qpriv->n_requests, 0); atomic_set(&qpriv->n_tid_requests, 0); timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0); timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0); @@ -4803,3 +4804,213 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) bail: spin_unlock_irqrestore(&qp->s_lock, flags); } + +/* + * Call this function when the last TID RDMA WRITE DATA packet for a request + * is built. + */ +static void update_tid_tail(struct rvt_qp *qp) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv = qp->priv; + u32 i; + struct rvt_swqe *wqe; + + lockdep_assert_held(&qp->s_lock); + /* Can't move beyond s_tid_cur */ + if (priv->s_tid_tail == priv->s_tid_cur) + return; + for (i = priv->s_tid_tail + 1; ; i++) { + if (i == qp->s_size) + i = 0; + + if (i == priv->s_tid_cur) + break; + wqe = rvt_get_swqe_ptr(qp, i); + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) + break; + } + priv->s_tid_tail = i; + priv->s_state = TID_OP(WRITE_RESP); +} + +int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct rvt_swqe *wqe; + u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0; + struct ib_other_headers *ohdr; + struct rvt_sge_state *ss = &qp->s_sge; + struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + struct tid_rdma_request *req = ack_to_tid_req(e); + bool last = false; + u8 opcode = TID_OP(WRITE_DATA); + + lockdep_assert_held(&qp->s_lock); + /* + * Prioritize the sending of the requests and responses over the + * sending of the TID RDMA data packets. + */ + if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) && + atomic_read(&priv->n_requests) && + !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK | + HFI1_S_ANY_WAIT_IO))) || + (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg && + !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) { + struct iowait_work *iowork; + + iowork = iowait_get_ib_work(&priv->s_iowait); + ps->s_txreq = get_waiting_verbs_txreq(iowork); + if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) { + priv->s_flags |= HFI1_S_TID_BUSY_SET; + return 1; + } + } + + ps->s_txreq = get_txreq(ps->dev, qp); + if (!ps->s_txreq) + goto bail_no_tx; + + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == READ_ONCE(qp->s_head)) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + clear_ahg(qp); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); + /* will get called again */ + goto done_free_tx; + } + + if (priv->s_flags & RVT_S_WAIT_ACK) + goto bail; + + /* Check whether there is anything to do. */ + if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) + goto bail; + wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); + req = wqe_to_tid_req(wqe); + switch (priv->s_state) { + case TID_OP(WRITE_REQ): + case TID_OP(WRITE_RESP): + priv->tid_ss.sge = wqe->sg_list[0]; + priv->tid_ss.sg_list = wqe->sg_list + 1; + priv->tid_ss.num_sge = wqe->wr.num_sge; + priv->tid_ss.total_len = wqe->length; + + if (priv->s_state == TID_OP(WRITE_REQ)) + hfi1_tid_rdma_restart_req(qp, wqe, &bth2); + priv->s_state = TID_OP(WRITE_DATA); + /* fall through */ + + case TID_OP(WRITE_DATA): + /* + * 1. Check whether TID RDMA WRITE RESP available. + * 2. If no: + * 2.1 If have more segments and no TID RDMA WRITE RESP, + * set HFI1_S_WAIT_TID_RESP + * 2.2 Return indicating no progress made. + * 3. If yes: + * 3.1 Build TID RDMA WRITE DATA packet. + * 3.2 If last packet in segment: + * 3.2.1 Change KDETH header bits + * 3.2.2 Advance RESP pointers. + * 3.3 Return indicating progress made. + */ + wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); + req = wqe_to_tid_req(wqe); + len = wqe->length; + + if (!req->comp_seg || req->cur_seg == req->comp_seg) + goto bail; + + last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2, + &len); + + if (last) { + /* move pointer to next flow */ + req->clear_tail = CIRC_NEXT(req->clear_tail, + MAX_FLOWS); + if (++req->cur_seg < req->total_segs) { + if (!CIRC_CNT(req->setup_head, req->clear_tail, + MAX_FLOWS)) + qp->s_flags |= HFI1_S_WAIT_TID_RESP; + } else { + priv->s_state = TID_OP(WRITE_DATA_LAST); + opcode = TID_OP(WRITE_DATA_LAST); + + /* Advance the s_tid_tail now */ + update_tid_tail(qp); + } + } + hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32); + ss = &priv->tid_ss; + break; + + case TID_OP(RESYNC): + /* Use generation from the most recently received response */ + wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur); + req = wqe_to_tid_req(wqe); + /* If no responses for this WQE look at the previous one */ + if (!req->comp_seg) { + wqe = rvt_get_swqe_ptr(qp, + (!priv->s_tid_cur ? qp->s_size : + priv->s_tid_cur) - 1); + req = wqe_to_tid_req(wqe); + } + hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1, + &bth2, + CIRC_PREV(req->setup_head, + MAX_FLOWS)); + ss = NULL; + len = 0; + opcode = TID_OP(RESYNC); + break; + + default: + goto bail; + } + if (priv->s_flags & RVT_S_SEND_ONE) { + priv->s_flags &= ~RVT_S_SEND_ONE; + priv->s_flags |= RVT_S_WAIT_ACK; + bth2 |= IB_BTH_REQ_ACK; + } + qp->s_len -= len; + ps->s_txreq->hdr_dwords = hwords; + ps->s_txreq->sde = priv->s_sde; + ps->s_txreq->ss = ss; + ps->s_txreq->s_cur_size = len; + hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2, + middle, ps); + return 1; +done_free_tx: + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + return 1; + +bail: + hfi1_put_txreq(ps->s_txreq); +bail_no_tx: + ps->s_txreq = NULL; + priv->s_flags &= ~RVT_S_BUSY; + /* + * If we didn't get a txreq, the QP will be woken up later to try + * again, set the flags to the the wake up which work item to wake + * up. + * (A better algorithm should be found to do this and generalize the + * sleep/wakeup flags.) + */ + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); + return 0; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index bdcf18455d9d..0ce0ef6d60f2 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -26,9 +26,13 @@ * * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock */ +#define HFI1_S_TID_BUSY_SET BIT(0) +/* BIT(1) reserved for RVT_S_BUSY. */ #define HFI1_R_TID_RSC_TIMER BIT(2) +/* BIT(3) reserved for RVT_S_RESP_PENDING. */ /* BIT(4) reserved for RVT_S_ACK_PENDING. */ #define HFI1_S_TID_WAIT_INTERLCK BIT(5) +/* BIT(7) - BIT(15) reserved for RVT_S_WAIT_*. */ #define HFI1_S_TID_RETRY_TIMER BIT(17) #define HFI1_R_TID_SW_PSN BIT(19) @@ -298,4 +302,7 @@ u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet); +struct hfi1_pkt_state; +int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index ce40ea9f43c3..3e45149d22c4 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -172,6 +172,8 @@ struct hfi1_qp_priv { struct rvt_qp *owner; u8 hdr_type; /* 9B or 16B */ struct rvt_sge_state tid_ss; /* SGE state pointer for 2nd leg */ + atomic_t n_requests; /* # of TID RDMA requests in the */ + /* queue */ atomic_t n_tid_requests; /* # of sent TID RDMA requests */ unsigned long tid_timer_timeout_jiffies; unsigned long tid_retry_timeout_jiffies; From 24c5bfeaf1e66efbc15cd9a6f5565c38d8cdb630 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:17 -0800 Subject: [PATCH 17/23] IB/hfi1: Add the TID second leg ACK packet builder This patch adds the TID packet builder for the responder side, which contains the state machine to build TID RDMA ACK packet for either TID RDMA WRITE DATA or TID RDMA RESYNC packets. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 141 ++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 44c5c0010888..15c12243c166 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -124,6 +124,9 @@ static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp); static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp); static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp); static void hfi1_tid_retry_timeout(struct timer_list *t); +static int make_tid_rdma_ack(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + struct hfi1_pkt_state *ps); static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { @@ -4874,6 +4877,10 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + if ((priv->s_flags & RVT_S_ACK_PENDING) && + make_tid_rdma_ack(qp, ohdr, ps)) + return 1; + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) goto bail; @@ -5014,3 +5021,137 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); return 0; } + +static int make_tid_rdma_ack(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + struct hfi1_pkt_state *ps) +{ + struct rvt_ack_entry *e; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + u32 hwords, next; + u32 len = 0; + u32 bth1 = 0, bth2 = 0; + int middle = 0; + u16 flow; + struct tid_rdma_request *req, *nreq; + + /* Don't send an ACK if we aren't supposed to. */ + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + goto bail; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + + e = &qp->s_ack_queue[qpriv->r_tid_ack]; + req = ack_to_tid_req(e); + /* + * In the RESYNC case, we are exactly one segment past the + * previously sent ack or at the previously sent NAK. So to send + * the resync ack, we go back one segment (which might be part of + * the previous request) and let the do-while loop execute again. + * The advantage of executing the do-while loop is that any data + * received after the previous ack is automatically acked in the + * RESYNC ack. It turns out that for the do-while loop we only need + * to pull back qpriv->r_tid_ack, not the segment + * indices/counters. The scheme works even if the previous request + * was not a TID WRITE request. + */ + if (qpriv->resync) { + if (!req->ack_seg || req->ack_seg == req->total_segs) + qpriv->r_tid_ack = !qpriv->r_tid_ack ? + rvt_size_atomic(&dev->rdi) : + qpriv->r_tid_ack - 1; + e = &qp->s_ack_queue[qpriv->r_tid_ack]; + req = ack_to_tid_req(e); + } + + /* + * If we've sent all the ACKs that we can, we are done + * until we get more segments... + */ + if (!qpriv->s_nak_state && !qpriv->resync && + req->ack_seg == req->comp_seg) + goto bail; + + do { + /* + * To deal with coalesced ACKs, the acked_tail pointer + * into the flow array is used. The distance between it + * and the clear_tail is the number of flows that are + * being ACK'ed. + */ + req->ack_seg += + /* Get up-to-date value */ + CIRC_CNT(req->clear_tail, req->acked_tail, + MAX_FLOWS); + /* Advance acked index */ + req->acked_tail = req->clear_tail; + + /* + * req->clear_tail points to the segment currently being + * received. So, when sending an ACK, the previous + * segment is being ACK'ed. + */ + flow = CIRC_PREV(req->acked_tail, MAX_FLOWS); + if (req->ack_seg != req->total_segs) + break; + req->state = TID_REQUEST_COMPLETE; + + next = qpriv->r_tid_ack + 1; + if (next > rvt_size_atomic(&dev->rdi)) + next = 0; + qpriv->r_tid_ack = next; + if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ)) + break; + nreq = ack_to_tid_req(&qp->s_ack_queue[next]); + if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg) + break; + + /* Move to the next ack entry now */ + e = &qp->s_ack_queue[qpriv->r_tid_ack]; + req = ack_to_tid_req(e); + } while (1); + + /* + * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and + * req could be pointing at the previous ack queue entry + */ + if (qpriv->s_nak_state || + (qpriv->resync && + !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) && + (cmp_psn(qpriv->r_next_psn_kdeth - 1, + full_flow_psn(&req->flows[flow], + req->flows[flow].flow_state.lpsn)) > 0))) { + /* + * A NAK will implicitly acknowledge all previous TID RDMA + * requests. Therefore, we NAK with the req->acked_tail + * segment for the request at qpriv->r_tid_ack (same at + * this point as the req->clear_tail segment for the + * qpriv->r_tid_tail request) + */ + e = &qp->s_ack_queue[qpriv->r_tid_ack]; + req = ack_to_tid_req(e); + flow = req->acked_tail; + } + + hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1, + &bth2); + len = 0; + qpriv->s_flags &= ~RVT_S_ACK_PENDING; + ps->s_txreq->hdr_dwords = hwords; + ps->s_txreq->sde = qpriv->s_sde; + ps->s_txreq->s_cur_size = len; + ps->s_txreq->ss = NULL; + hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle, + ps); + return 1; +bail: + /* + * Ensure s_rdma_ack_cnt changes are committed prior to resetting + * RVT_S_RESP_PENDING + */ + smp_wmb(); + qpriv->s_flags &= ~RVT_S_ACK_PENDING; + return 0; +} From 572f0c3301138961a596c522729afb5801135d6e Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:27 -0800 Subject: [PATCH 18/23] IB/hfi1: Add the dual leg code The "Second Leg" of the TID RDMA WRITE protocol deals with the transfer of data and ack packets, which are in the KDETH PSN space, as opposed to the IB PSN space. Therefore, the Second Leg could be considered as a separate state machine. As such, it is handled by a different work queue item which is scheduled along with the normal IB state machine work item. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/iowait.h | 12 +++ drivers/infiniband/hw/hfi1/qp.c | 34 ++++++- drivers/infiniband/hw/hfi1/qp.h | 1 + drivers/infiniband/hw/hfi1/ruc.c | 32 ++++-- drivers/infiniband/hw/hfi1/tid_rdma.c | 141 ++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 4 + drivers/infiniband/hw/hfi1/verbs.h | 3 + 7 files changed, 217 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h index 23a58ac0d47c..bd913701761d 100644 --- a/drivers/infiniband/hw/hfi1/iowait.h +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -185,6 +185,18 @@ static inline bool iowait_schedule(struct iowait *wait, return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_IB_SE].iowork); } +/** + * iowait_tid_schedule - schedule the tid SE + * @wait: the iowait structure + * @wq: the work queue + * @cpu: the cpu + */ +static inline bool iowait_tid_schedule(struct iowait *wait, + struct workqueue_struct *wq, int cpu) +{ + return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork); +} + /** * iowait_sdma_drain() - wait for DMAs to drain * diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 31b4b60f4364..96632c77f36f 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -431,6 +431,11 @@ static void hfi1_qp_schedule(struct rvt_qp *qp) if (ret) iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB); } + if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID)) { + ret = hfi1_schedule_tid_send(qp); + if (ret) + iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID); + } } void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) @@ -450,8 +455,27 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait) { - if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) + struct hfi1_qp_priv *priv = qp->priv; + + if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) { qp->s_flags &= ~RVT_S_BUSY; + /* + * If we are sending a first-leg packet from the second leg, + * we need to clear the busy flag from priv->s_flags to + * avoid a race condition when the qp wakes up before + * the call to hfi1_verbs_send() returns to the second + * leg. In that case, the second leg will terminate without + * being re-scheduled, resulting in failure to send TID RDMA + * WRITE DATA and TID RDMA ACK packets. + */ + if (priv->s_flags & HFI1_S_TID_BUSY_SET) { + priv->s_flags &= ~(HFI1_S_TID_BUSY_SET | + RVT_S_BUSY); + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); + } + } else { + priv->s_flags &= ~RVT_S_BUSY; + } } static int iowait_sleep( @@ -694,7 +718,7 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) &priv->s_iowait, 1, _hfi1_do_send, - NULL, + _hfi1_do_tid_send, iowait_sleep, iowait_wakeup, iowait_sdma_drained); @@ -851,7 +875,8 @@ void notify_error_qp(struct rvt_qp *qp) if (lock) { write_seqlock(lock); if (!list_empty(&priv->s_iowait.list) && - !(qp->s_flags & RVT_S_BUSY)) { + !(qp->s_flags & RVT_S_BUSY) && + !(priv->s_flags & RVT_S_BUSY)) { qp->s_flags &= ~RVT_S_ANY_WAIT_IO; list_del_init(&priv->s_iowait.list); priv->s_iowait.lock = NULL; @@ -860,7 +885,8 @@ void notify_error_qp(struct rvt_qp *qp) write_sequnlock(lock); } - if (!(qp->s_flags & RVT_S_BUSY)) { + if (!(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) { + qp->s_hdrwords = 0; if (qp->s_rdma_mr) { rvt_put_mr(qp->s_rdma_mr); qp->s_rdma_mr = NULL; diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index d531b760ea93..b670321365d3 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -82,6 +82,7 @@ extern const struct rvt_operation_params hfi1_post_parms[]; #define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN) #define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND) +#define HFI1_S_ANY_TID_WAIT_SEND (RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA) /* * Send if not busy or waiting for I/O and either diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index f96c0f544cb0..124a3ec1e15c 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -453,11 +453,13 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, #define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */ /** - * schedule_send_yield - test for a yield required for QP send engine + * hfi1_schedule_send_yield - test for a yield required for QP + * send engine * @timeout: Final time for timeout slice for jiffies * @qp: a pointer to QP * @ps: a pointer to a structure with commonly lookup values for * the the send engine progress + * @tid - true if it is the tid leg * * This routine checks if the time slice for the QP has expired * for RC QPs, if so an additional work entry is queued. At this @@ -465,8 +467,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, * returns true if a yield is required, otherwise, false * is returned. */ -static bool schedule_send_yield(struct rvt_qp *qp, - struct hfi1_pkt_state *ps) +bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + bool tid) { ps->pkts_sent = true; @@ -474,8 +476,24 @@ static bool schedule_send_yield(struct rvt_qp *qp, if (!ps->in_thread || workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) { spin_lock_irqsave(&qp->s_lock, ps->flags); - qp->s_flags &= ~RVT_S_BUSY; - hfi1_schedule_send(qp); + if (!tid) { + qp->s_flags &= ~RVT_S_BUSY; + hfi1_schedule_send(qp); + } else { + struct hfi1_qp_priv *priv = qp->priv; + + if (priv->s_flags & + HFI1_S_TID_BUSY_SET) { + qp->s_flags &= ~RVT_S_BUSY; + priv->s_flags &= + ~(HFI1_S_TID_BUSY_SET | + RVT_S_BUSY); + } else { + priv->s_flags &= ~RVT_S_BUSY; + } + hfi1_schedule_tid_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, ps->flags); this_cpu_inc(*ps->ppd->dd->send_schedule); trace_hfi1_rc_expired_time_slice(qp, true); @@ -576,6 +594,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) do { /* Check for a constructed packet to be sent. */ if (ps.s_txreq) { + if (priv->s_flags & HFI1_S_TID_BUSY_SET) + qp->s_flags |= RVT_S_BUSY; spin_unlock_irqrestore(&qp->s_lock, ps.flags); /* * If the packet cannot be sent now, return and @@ -585,7 +605,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) return; /* allow other tasks to run */ - if (schedule_send_yield(qp, &ps)) + if (hfi1_schedule_send_yield(qp, &ps, false)) return; spin_lock_irqsave(&qp->s_lock, ps.flags); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 15c12243c166..80111dd1d876 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -127,6 +127,7 @@ static void hfi1_tid_retry_timeout(struct timer_list *t); static int make_tid_rdma_ack(struct rvt_qp *qp, struct ib_other_headers *ohdr, struct hfi1_pkt_state *ps); +static void hfi1_do_tid_send(struct rvt_qp *qp); static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { @@ -3048,6 +3049,7 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, qpriv->s_flags |= RVT_S_ACK_PENDING; if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID) qpriv->r_tid_ack = qpriv->r_tid_tail; + hfi1_schedule_tid_send(qp); } goto unlock; } @@ -3517,6 +3519,7 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) ret = -EAGAIN; to_seg = MAX_FLOWS >> 1; qpriv->s_flags |= RVT_S_ACK_PENDING; + hfi1_schedule_tid_send(qp); break; } @@ -4128,6 +4131,7 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) } qp->s_flags &= ~HFI1_S_WAIT_TID_RESP; + hfi1_schedule_tid_send(qp); goto ack_done; ack_op_err: @@ -4287,6 +4291,7 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) done: priv->s_flags |= RVT_S_ACK_PENDING; + hfi1_schedule_tid_send(qp); exit: priv->r_next_psn_kdeth = flow->flow_state.r_next_psn; spin_unlock_irqrestore(&qp->s_lock, flags); @@ -4299,6 +4304,7 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) priv->s_flags |= RVT_S_ACK_PENDING; if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) priv->r_tid_ack = priv->r_tid_tail; + hfi1_schedule_tid_send(qp); } goto done; } @@ -4567,6 +4573,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) req->cur_seg = req->ack_seg; qpriv->s_tid_tail = qp->s_acked; qpriv->s_state = TID_OP(WRITE_REQ); + hfi1_schedule_tid_send(qp); } done: qpriv->s_retry = qp->s_retry_cnt; @@ -4584,6 +4591,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) qpriv->s_tid_tail = qp->s_acked; qpriv->s_state = TID_OP(WRITE_REQ); qpriv->s_retry = qp->s_retry_cnt; + hfi1_schedule_tid_send(qp); break; default: @@ -4673,6 +4681,7 @@ static void hfi1_tid_retry_timeout(struct timer_list *t) qp->s_flags |= HFI1_S_WAIT_HALT; priv->s_state = TID_OP(RESYNC); priv->s_retry--; + hfi1_schedule_tid_send(qp); } } spin_unlock(&qp->s_lock); @@ -4804,6 +4813,7 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) /* RESYNC request always gets a TID RDMA ACK. */ qpriv->s_nak_state = 0; qpriv->s_flags |= RVT_S_ACK_PENDING; + hfi1_schedule_tid_send(qp); bail: spin_unlock_irqrestore(&qp->s_lock, flags); } @@ -5155,3 +5165,134 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, qpriv->s_flags &= ~RVT_S_ACK_PENDING; return 0; } + +static int hfi1_send_tid_ok(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + return !(priv->s_flags & RVT_S_BUSY || + qp->s_flags & HFI1_S_ANY_WAIT_IO) && + (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) || + (priv->s_flags & RVT_S_RESP_PENDING) || + !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND)); +} + +void _hfi1_do_tid_send(struct work_struct *work) +{ + struct iowait_work *w = container_of(work, struct iowait_work, iowork); + struct rvt_qp *qp = iowait_to_qp(w->iow); + + hfi1_do_tid_send(qp); +} + +static void hfi1_do_tid_send(struct rvt_qp *qp) +{ + struct hfi1_pkt_state ps; + struct hfi1_qp_priv *priv = qp->priv; + + ps.dev = to_idev(qp->ibqp.device); + ps.ibp = to_iport(qp->ibqp.device, qp->port_num); + ps.ppd = ppd_from_ibp(ps.ibp); + ps.wait = iowait_get_tid_work(&priv->s_iowait); + ps.in_thread = false; + ps.timeout_int = qp->timeout_jiffies / 8; + + spin_lock_irqsave(&qp->s_lock, ps.flags); + + /* Return if we are already busy processing a work request. */ + if (!hfi1_send_tid_ok(qp)) { + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); + spin_unlock_irqrestore(&qp->s_lock, ps.flags); + return; + } + + priv->s_flags |= RVT_S_BUSY; + + ps.timeout = jiffies + ps.timeout_int; + ps.cpu = priv->s_sde ? priv->s_sde->cpu : + cpumask_first(cpumask_of_node(ps.ppd->dd->node)); + ps.pkts_sent = false; + + /* insure a pre-built packet is handled */ + ps.s_txreq = get_waiting_verbs_txreq(ps.wait); + do { + /* Check for a constructed packet to be sent. */ + if (ps.s_txreq) { + if (priv->s_flags & HFI1_S_TID_BUSY_SET) { + qp->s_flags |= RVT_S_BUSY; + ps.wait = iowait_get_ib_work(&priv->s_iowait); + } + spin_unlock_irqrestore(&qp->s_lock, ps.flags); + + /* + * If the packet cannot be sent now, return and + * the send tasklet will be woken up later. + */ + if (hfi1_verbs_send(qp, &ps)) + return; + + /* allow other tasks to run */ + if (hfi1_schedule_send_yield(qp, &ps, true)) + return; + + spin_lock_irqsave(&qp->s_lock, ps.flags); + if (priv->s_flags & HFI1_S_TID_BUSY_SET) { + qp->s_flags &= ~RVT_S_BUSY; + priv->s_flags &= ~HFI1_S_TID_BUSY_SET; + ps.wait = iowait_get_tid_work(&priv->s_iowait); + if (iowait_flag_set(&priv->s_iowait, + IOWAIT_PENDING_IB)) + hfi1_schedule_send(qp); + } + } + } while (hfi1_make_tid_rdma_pkt(qp, &ps)); + iowait_starve_clear(ps.pkts_sent, &priv->s_iowait); + spin_unlock_irqrestore(&qp->s_lock, ps.flags); +} + +static bool _hfi1_schedule_tid_send(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + + return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq, + priv->s_sde ? + priv->s_sde->cpu : + cpumask_first(cpumask_of_node(dd->node))); +} + +/** + * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine + * @qp: the QP + * + * This schedules qp progress on the TID RDMA state machine. Caller + * should hold the s_lock. + * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because + * the two state machines can step on each other with respect to the + * RVT_S_BUSY flag. + * Therefore, a modified test is used. + * @return true if the second leg is scheduled; + * false if the second leg is not scheduled. + */ +bool hfi1_schedule_tid_send(struct rvt_qp *qp) +{ + lockdep_assert_held(&qp->s_lock); + if (hfi1_send_tid_ok(qp)) { + /* + * The following call returns true if the qp is not on the + * queue and false if the qp is already on the queue before + * this call. Either way, the qp will be on the queue when the + * call returns. + */ + _hfi1_schedule_tid_send(qp); + return true; + } + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait, + IOWAIT_PENDING_TID); + return false; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 0ce0ef6d60f2..7f8f17ba6c14 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -305,4 +305,8 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet); struct hfi1_pkt_state; int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps); +void _hfi1_do_tid_send(struct work_struct *work); + +bool hfi1_schedule_tid_send(struct rvt_qp *qp); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 3e45149d22c4..bee3d21a548e 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -443,6 +443,9 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, u32 bth0, u32 bth1, u32 bth2, int middle, struct hfi1_pkt_state *ps); +bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + bool tid); + void _hfi1_do_send(struct work_struct *work); void hfi1_do_send_from_rvt(struct rvt_qp *qp); From 3c6cb20a0d17d7a75778fb0935d6fa427c8177af Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:39 -0800 Subject: [PATCH 19/23] IB/hfi1: Add TID RDMA WRITE functionality into RDMA verbs This patch integrates TID RDMA WRITE protocol into normal RDMA verbs framework. The TID RDMA WRITE protocol is an end-to-end protocol between the hfi1 drivers on two OPA nodes that converts a qualified RDMA WRITE request into a TID RDMA WRITE request to avoid data copying on the responder side. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.c | 7 + drivers/infiniband/hw/hfi1/rc.c | 487 ++++++++++++++++++++++--- drivers/infiniband/hw/hfi1/tid_rdma.c | 14 + drivers/infiniband/hw/hfi1/user_sdma.c | 3 +- drivers/infiniband/hw/hfi1/verbs.c | 17 +- drivers/infiniband/hw/hfi1/verbs.h | 1 + include/rdma/rdmavt_qp.h | 1 + 7 files changed, 481 insertions(+), 49 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 96632c77f36f..cfd598e4b303 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -138,6 +138,12 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { .flags = RVT_OPERATION_USE_RESERVE, }, +[IB_WR_TID_RDMA_WRITE] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_IGN_RNR_CNT, +}, + }; static void flush_list_head(struct list_head *l) @@ -780,6 +786,7 @@ void quiesce_qp(struct rvt_qp *qp) struct hfi1_qp_priv *priv = qp->priv; hfi1_del_tid_reap_timer(qp); + hfi1_del_tid_retry_timer(qp); iowait_sdma_drain(&priv->s_iowait); qp_pio_drain(qp); flush_tx_list(qp); diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index fcb733ea8dfb..6d2abea896e5 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -111,16 +111,17 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct hfi1_pkt_state *ps) { struct rvt_ack_entry *e; - u32 hwords; + u32 hwords, hdrlen; u32 len = 0; u32 bth0 = 0, bth2 = 0; u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); int middle = 0; u32 pmtu = qp->pmtu; - struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_qp_priv *qpriv = qp->priv; bool last_pkt; u32 delta; u8 next = qp->s_tail_ack_queue; + struct tid_rdma_request *req; trace_hfi1_rsp_make_rc_ack(qp, 0); lockdep_assert_held(&qp->s_lock); @@ -128,7 +129,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) goto bail; - if (priv->hdr_type == HFI1_PKT_TYPE_9B) + if (qpriv->hdr_type == HFI1_PKT_TYPE_9B) /* header size in 32-bit words LRH+BTH = (8+12)/4. */ hwords = 5; else @@ -206,6 +207,21 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, hwords++; qp->s_ack_rdma_psn = e->psn; bth2 = mask_psn(qp->s_ack_rdma_psn++); + } else if (e->opcode == TID_OP(WRITE_REQ)) { + /* + * If a TID RDMA WRITE RESP is being resent, we have to + * wait for the actual request. All requests that are to + * be resent will have their state set to + * TID_REQUEST_RESEND. When the new request arrives, the + * state will be changed to TID_REQUEST_RESEND_ACTIVE. + */ + req = ack_to_tid_req(e); + if (req->state == TID_REQUEST_RESEND || + req->state == TID_REQUEST_INIT_RESEND) + goto bail; + qp->s_ack_state = TID_OP(WRITE_RESP); + qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg); + goto write_resp; } else if (e->opcode == TID_OP(READ_REQ)) { /* * If a TID RDMA read response is being resent and @@ -267,6 +283,59 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bth2 = mask_psn(qp->s_ack_rdma_psn++); break; + case TID_OP(WRITE_RESP): +write_resp: + /* + * 1. Check if RVT_S_ACK_PENDING is set. If yes, + * goto normal. + * 2. Attempt to allocate TID resources. + * 3. Remove RVT_S_RESP_PENDING flags from s_flags + * 4. If resources not available: + * 4.1 Set RVT_S_WAIT_TID_SPACE + * 4.2 Queue QP on RCD TID queue + * 4.3 Put QP on iowait list. + * 4.4 Build IB RNR NAK with appropriate timeout value + * 4.5 Return indication progress made. + * 5. If resources are available: + * 5.1 Program HW flow CSRs + * 5.2 Build TID RDMA WRITE RESP packet + * 5.3 If more resources needed, do 2.1 - 2.3. + * 5.4 Wake up next QP on RCD TID queue. + * 5.5 Return indication progress made. + */ + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + req = ack_to_tid_req(e); + + /* + * Send scheduled RNR NAK's. RNR NAK's need to be sent at + * segment boundaries, not at request boundaries. Don't change + * s_ack_state because we are still in the middle of a request + */ + if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND && + qp->s_tail_ack_queue == qpriv->r_tid_alloc && + req->cur_seg == req->alloc_seg) { + qpriv->rnr_nak_state = TID_RNR_NAK_SENT; + goto normal_no_state; + } + + bth2 = mask_psn(qp->s_ack_rdma_psn); + hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1, + bth2, &len, + &ps->s_txreq->ss); + if (!hdrlen) + return 0; + + hwords += hdrlen; + bth0 = qp->s_ack_state << 24; + qp->s_ack_rdma_psn++; + if (req->cur_seg != req->total_segs) + break; + + e->sent = 1; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + break; + case TID_OP(READ_RESP): read_resp: e = &qp->s_ack_queue[qp->s_tail_ack_queue]; @@ -298,8 +367,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, * (see above). */ qp->s_ack_state = OP(SEND_ONLY); - qp->s_flags &= ~RVT_S_ACK_PENDING; - ps->s_txreq->ss = NULL; +normal_no_state: if (qp->s_nak_state) ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) | @@ -311,9 +379,11 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, len = 0; bth0 = OP(ACKNOWLEDGE) << 24; bth2 = mask_psn(qp->s_ack_psn); + qp->s_flags &= ~RVT_S_ACK_PENDING; + ps->s_txreq->ss = NULL; } qp->s_rdma_ack_cnt++; - ps->s_txreq->sde = priv->s_sde; + ps->s_txreq->sde = qpriv->s_sde; ps->s_txreq->s_cur_size = len; ps->s_txreq->hdr_dwords = hwords; hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps); @@ -366,6 +436,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) int middle = 0; int delta; struct tid_rdma_flow *flow = NULL; + struct tid_rdma_params *remote; trace_hfi1_sender_make_rc_req(qp); lockdep_assert_held(&qp->s_lock); @@ -414,7 +485,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto done_free_tx; } - if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK)) + if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT)) goto bail; if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) { @@ -586,6 +657,108 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp->s_cur = 0; break; + case IB_WR_TID_RDMA_WRITE: + if (newreq) { + /* + * Limit the number of TID RDMA WRITE requests. + */ + if (atomic_read(&priv->n_tid_requests) >= + HFI1_TID_RDMA_WRITE_CNT) + goto bail; + + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + } + + hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, + &bth1, &bth2, + &len); + ss = NULL; + if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) { + priv->s_tid_cur = qp->s_cur; + if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) { + priv->s_tid_tail = qp->s_cur; + priv->s_state = TID_OP(WRITE_RESP); + } + } else if (priv->s_tid_cur == priv->s_tid_head) { + struct rvt_swqe *__w; + struct tid_rdma_request *__r; + + __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur); + __r = wqe_to_tid_req(__w); + + /* + * The s_tid_cur pointer is advanced to s_cur if + * any of the following conditions about the WQE + * to which s_ti_cur currently points to are + * satisfied: + * 1. The request is not a TID RDMA WRITE + * request, + * 2. The request is in the INACTIVE or + * COMPLETE states (TID RDMA READ requests + * stay at INACTIVE and TID RDMA WRITE + * transition to COMPLETE when done), + * 3. The request is in the ACTIVE or SYNC + * state and the number of completed + * segments is equal to the total segment + * count. + * (If ACTIVE, the request is waiting for + * ACKs. If SYNC, the request has not + * received any responses because it's + * waiting on a sync point.) + */ + if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE || + __r->state == TID_REQUEST_INACTIVE || + __r->state == TID_REQUEST_COMPLETE || + ((__r->state == TID_REQUEST_ACTIVE || + __r->state == TID_REQUEST_SYNC) && + __r->comp_seg == __r->total_segs)) { + if (priv->s_tid_tail == + priv->s_tid_cur && + priv->s_state == + TID_OP(WRITE_DATA_LAST)) { + priv->s_tid_tail = qp->s_cur; + priv->s_state = + TID_OP(WRITE_RESP); + } + priv->s_tid_cur = qp->s_cur; + } + /* + * A corner case: when the last TID RDMA WRITE + * request was completed, s_tid_head, + * s_tid_cur, and s_tid_tail all point to the + * same location. Other requests are posted and + * s_cur wraps around to the same location, + * where a new TID RDMA WRITE is posted. In + * this case, none of the indices need to be + * updated. However, the priv->s_state should. + */ + if (priv->s_tid_tail == qp->s_cur && + priv->s_state == TID_OP(WRITE_DATA_LAST)) + priv->s_state = TID_OP(WRITE_RESP); + } + req = wqe_to_tid_req(wqe); + if (newreq) { + priv->s_tid_head = qp->s_cur; + priv->pending_tid_w_resp += req->total_segs; + atomic_inc(&priv->n_tid_requests); + atomic_dec(&priv->n_requests); + } else { + req->state = TID_REQUEST_RESEND; + req->comp_seg = delta_psn(bth2, wqe->psn); + /* + * Pull back any segments since we are going + * to re-receive them. + */ + req->setup_head = req->clear_tail; + priv->pending_tid_w_resp += + delta_psn(wqe->lpsn, bth2) + 1; + } + + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + case IB_WR_RDMA_READ: /* * Don't allow more operations to be started @@ -745,7 +918,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) if (qp->s_tail >= qp->s_size) qp->s_tail = 0; } - if (wqe->wr.opcode == IB_WR_RDMA_READ) + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) qp->s_psn = wqe->lpsn + 1; else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) qp->s_psn = req->s_next_psn; @@ -865,6 +1039,33 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) if (qp->s_cur == qp->s_size) qp->s_cur = 0; break; + + case TID_OP(WRITE_RESP): + /* + * This value for s_state is used for restarting a TID RDMA + * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE + * for more). + */ + req = wqe_to_tid_req(wqe); + req->state = TID_REQUEST_RESEND; + rcu_read_lock(); + remote = rcu_dereference(priv->tid_rdma.remote); + req->comp_seg = delta_psn(qp->s_psn, wqe->psn); + len = wqe->length - (req->comp_seg * remote->max_len); + rcu_read_unlock(); + + bth2 = mask_psn(qp->s_psn); + hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1, + &bth2, &len); + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + qp->s_state = TID_OP(WRITE_REQ); + priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1; + priv->s_tid_cur = qp->s_cur; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + case TID_OP(READ_RESP): if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) goto bail; @@ -965,7 +1166,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } qp->s_sending_hpsn = bth2; delta = delta_psn(bth2, wqe->psn); - if (delta && delta % HFI1_PSN_CREDIT == 0) + if (delta && delta % HFI1_PSN_CREDIT == 0 && + wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) bth2 |= IB_BTH_REQ_ACK; if (qp->s_flags & RVT_S_SEND_ONE) { qp->s_flags &= ~RVT_S_SEND_ONE; @@ -998,6 +1200,12 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) bail_no_tx: ps->s_txreq = NULL; qp->s_flags &= ~RVT_S_BUSY; + /* + * If we didn't get a txreq, the QP will be woken up later to try + * again. Set the flags to indicate which work item to wake + * up. + */ + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB); return 0; } @@ -1285,6 +1493,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) lockdep_assert_held(&qp->s_lock); qp->s_cur = n; priv->pending_tid_r_segs = 0; + priv->pending_tid_w_resp = 0; qp->s_num_rd_atomic = 0; /* @@ -1342,6 +1551,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(RDMA_READ_RESPONSE_LAST); break; + case IB_WR_TID_RDMA_WRITE: + qp->s_state = TID_OP(WRITE_RESP); + break; + case IB_WR_RDMA_READ: qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); break; @@ -1435,7 +1648,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN | - RVT_S_WAIT_ACK); + RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP); if (wait) qp->s_flags |= RVT_S_SEND_ONE; reset_psn(qp, psn); @@ -1443,7 +1656,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) /* * Set qp->s_sending_psn to the next PSN after the given one. - * This would be psn+1 except when RDMA reads are present. + * This would be psn+1 except when RDMA reads or TID RDMA ops + * are present. */ static void reset_sending_psn(struct rvt_qp *qp, u32 psn) { @@ -1456,7 +1670,8 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn) wqe = rvt_get_swqe_ptr(qp, n); if (cmp_psn(psn, wqe->lpsn) <= 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ || - wqe->wr.opcode == IB_WR_TID_RDMA_READ) + wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) qp->s_sending_psn = wqe->lpsn + 1; else qp->s_sending_psn = psn + 1; @@ -1479,8 +1694,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) struct rvt_swqe *wqe; struct ib_header *hdr = NULL; struct hfi1_16b_header *hdr_16b = NULL; - u32 opcode; + u32 opcode, head, tail; u32 psn; + struct tid_rdma_request *req; lockdep_assert_held(&qp->s_lock); if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK)) @@ -1507,29 +1723,84 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) opcode = ib_bth_get_opcode(ohdr); if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) && opcode <= OP(ATOMIC_ACKNOWLEDGE)) || - opcode == TID_OP(READ_RESP)) { + opcode == TID_OP(READ_RESP) || + opcode == TID_OP(WRITE_RESP)) { WARN_ON(!qp->s_rdma_ack_cnt); qp->s_rdma_ack_cnt--; return; } psn = ib_bth_get_psn(ohdr); - reset_sending_psn(qp, psn); + /* + * Don't attempt to reset the sending PSN for packets in the + * KDETH PSN space since the PSN does not match anything. + */ + if (opcode != TID_OP(WRITE_DATA) && + opcode != TID_OP(WRITE_DATA_LAST) && + opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC)) + reset_sending_psn(qp, psn); + + /* Handle TID RDMA WRITE packets differently */ + if (opcode >= TID_OP(WRITE_REQ) && + opcode <= TID_OP(WRITE_DATA_LAST)) { + head = priv->s_tid_head; + tail = priv->s_tid_cur; + /* + * s_tid_cur is set to s_tid_head in the case, where + * a new TID RDMA request is being started and all + * previous ones have been completed. + * Therefore, we need to do a secondary check in order + * to properly determine whether we should start the + * RC timer. + */ + wqe = rvt_get_swqe_ptr(qp, tail); + req = wqe_to_tid_req(wqe); + if (head == tail && req->comp_seg < req->total_segs) { + if (tail == 0) + tail = qp->s_size - 1; + else + tail -= 1; + } + } else { + head = qp->s_tail; + tail = qp->s_acked; + } /* * Start timer after a packet requesting an ACK has been sent and * there are still requests that haven't been acked. */ - if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && + if ((psn & IB_BTH_REQ_ACK) && tail != head && + opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) && + opcode != TID_OP(RESYNC) && !(qp->s_flags & - (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && - (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { if (opcode == TID_OP(READ_REQ)) rvt_add_retry_timer_ext(qp, priv->timeout_shift); else rvt_add_retry_timer(qp); } + /* Start TID RDMA ACK timer */ + if ((opcode == TID_OP(WRITE_DATA) || + opcode == TID_OP(WRITE_DATA_LAST) || + opcode == TID_OP(RESYNC)) && + (psn & IB_BTH_REQ_ACK) && + !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + /* + * The TID RDMA ACK packet could be received before this + * function is called. Therefore, add the timer only if TID + * RDMA ACK packets are actually pending. + */ + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + req = wqe_to_tid_req(wqe); + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + req->ack_seg < req->cur_seg) + hfi1_add_tid_retry_timer(qp); + } + while (qp->s_last != qp->s_acked) { u32 s_last; @@ -1628,7 +1899,16 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, } qp->s_retry = qp->s_retry_cnt; - update_last_psn(qp, wqe->lpsn); + /* + * Don't update the last PSN if the request being completed is + * a TID RDMA WRITE request. + * Completion of the TID RDMA WRITE requests are done by the + * TID RDMA ACKs and as such could be for a request that has + * already been ACKed as far as the IB state machine is + * concerned. + */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) + update_last_psn(qp, wqe->lpsn); /* * If we are completing a request which is in the process of @@ -1658,6 +1938,54 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, return wqe; } +static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd) +{ + /* Retry this request. */ + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { + qp->r_flags |= RVT_R_RDMAR_SEQ; + hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + } +} + +/** + * update_qp_retry_state - Update qp retry state. + * @qp: the QP + * @psn: the packet sequence number of the TID RDMA WRITE RESP. + * @spsn: The start psn for the given TID RDMA WRITE swqe. + * @lpsn: The last psn for the given TID RDMA WRITE swqe. + * + * This function is called to update the qp retry state upon + * receiving a TID WRITE RESP after the qp is scheduled to retry + * a request. + */ +static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn, + u32 lpsn) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + qp->s_psn = psn + 1; + /* + * If this is the first TID RDMA WRITE RESP packet for the current + * request, change the s_state so that the retry will be processed + * correctly. Similarly, if this is the last TID RDMA WRITE RESP + * packet, change the s_state and advance the s_cur. + */ + if (cmp_psn(psn, lpsn) >= 0) { + qp->s_cur = qpriv->s_tid_cur + 1; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_state = TID_OP(WRITE_REQ); + } else if (!cmp_psn(psn, spsn)) { + qp->s_cur = qpriv->s_tid_cur; + qp->s_state = TID_OP(WRITE_RESP); + } +} + /** * do_rc_ack - process an incoming RC ACK * @qp: the QP the ACK came in on @@ -1679,6 +2007,7 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, int ret = 0; u32 ack_psn; int diff; + struct rvt_dev_info *rdi; lockdep_assert_held(&qp->s_lock); /* @@ -1725,18 +2054,10 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, (opcode != TID_OP(READ_RESP) || diff != 0)) || ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && - (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { - /* Retry this request. */ - if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { - qp->r_flags |= RVT_R_RDMAR_SEQ; - hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); - if (list_empty(&qp->rspwait)) { - qp->r_flags |= RVT_R_RSP_SEND; - rvt_get_qp(qp); - list_add_tail(&qp->rspwait, - &rcd->qp_wait_list); - } - } + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) || + (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + (delta_psn(psn, qp->s_last_psn) != 1))) { + set_restart_qp(qp, rcd); /* * No need to process the ACK/NAK since we are * restarting an earlier request. @@ -1768,6 +2089,14 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, hfi1_schedule_send(qp); } } + + /* + * TID RDMA WRITE requests will be completed by the TID RDMA + * ACK packet handler (see tid_rdma.c). + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) + break; + wqe = do_rc_completion(qp, wqe, ibp); if (qp->s_acked == qp->s_tail) break; @@ -1785,17 +2114,60 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, else rvt_stop_rc_timers(qp); } else if (qp->s_acked != qp->s_tail) { + struct rvt_swqe *__w = NULL; + + if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID) + __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); + /* - * We are expecting more ACKs so - * mod the retry timer. + * Stop timers if we've received all of the TID RDMA + * WRITE * responses. */ - rvt_mod_retry_timer(qp); - /* - * We can stop re-sending the earlier packets and - * continue with the next packet the receiver wants. - */ - if (cmp_psn(qp->s_psn, psn) <= 0) - reset_psn(qp, psn + 1); + if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE && + opcode == TID_OP(WRITE_RESP)) { + /* + * Normally, the loop above would correctly + * process all WQEs from s_acked onward and + * either complete them or check for correct + * PSN sequencing. + * However, for TID RDMA, due to pipelining, + * the response may not be for the request at + * s_acked so the above look would just be + * skipped. This does not allow for checking + * the PSN sequencing. It has to be done + * separately. + */ + if (cmp_psn(psn, qp->s_last_psn + 1)) { + set_restart_qp(qp, rcd); + goto bail_stop; + } + /* + * If the psn is being resent, stop the + * resending. + */ + if (qp->s_cur != qp->s_tail && + cmp_psn(qp->s_psn, psn) <= 0) + update_qp_retry_state(qp, psn, + __w->psn, + __w->lpsn); + else if (--qpriv->pending_tid_w_resp) + rvt_mod_retry_timer(qp); + else + rvt_stop_rc_timers(qp); + } else { + /* + * We are expecting more ACKs so + * mod the retry timer. + */ + rvt_mod_retry_timer(qp); + /* + * We can stop re-sending the earlier packets + * and continue with the next packet the + * receiver wants. + */ + if (cmp_psn(qp->s_psn, psn) <= 0) + reset_psn(qp, psn + 1); + } } else { /* No more acks - kill all timers */ rvt_stop_rc_timers(qp); @@ -1811,6 +2183,15 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, rvt_get_credit(qp, aeth); qp->s_rnr_retry = qp->s_rnr_retry_cnt; qp->s_retry = qp->s_retry_cnt; + /* + * If the current request is a TID RDMA WRITE request and the + * response is not a TID RDMA WRITE RESP packet, s_last_psn + * can't be advanced. + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + opcode != TID_OP(WRITE_RESP) && + cmp_psn(psn, wqe->psn) >= 0) + return 1; update_last_psn(qp, psn); return 1; @@ -1820,20 +2201,31 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, goto bail_stop; if (qp->s_flags & RVT_S_WAIT_RNR) goto bail_stop; - if (qp->s_rnr_retry == 0) { + rdi = ib_to_rvt(qp->ibqp.device); + if (qp->s_rnr_retry == 0 && + !((rdi->post_parms[wqe->wr.opcode].flags & + RVT_OPERATION_IGN_RNR_CNT) && + qp->s_rnr_retry_cnt == 0)) { status = IB_WC_RNR_RETRY_EXC_ERR; goto class_b; } - if (qp->s_rnr_retry_cnt < 7) + if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0) qp->s_rnr_retry--; - /* The last valid PSN is the previous PSN. */ - update_last_psn(qp, psn - 1); + /* + * The last valid PSN is the previous PSN. For TID RDMA WRITE + * request, s_last_psn should be incremented only when a TID + * RDMA WRITE RESP is received to avoid skipping lost TID RDMA + * WRITE RESP packets. + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { + reset_psn(qp, qp->s_last_psn + 1); + } else { + update_last_psn(qp, psn - 1); + reset_psn(qp, psn); + } ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); - - reset_psn(qp, psn); - qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK); rvt_stop_rc_timers(qp); rvt_add_rnr_timer(qp, aeth); @@ -1918,6 +2310,7 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, while (cmp_psn(psn, wqe->lpsn) > 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ || wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE || wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) break; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 80111dd1d876..490e47a0f68b 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -3205,6 +3205,20 @@ void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) do { struct hfi1_swqe_priv *priv = wqe->priv; + ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); + } while (!ret); + } + for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) { + struct rvt_ack_entry *e = &qp->s_ack_queue[i]; + + if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device))) + i = 0; + /* Free only locally allocated TID entries */ + if (e->opcode != TID_OP(WRITE_REQ)) + continue; + do { + struct hfi1_ack_priv *priv = e->priv; + ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); } while (!ret); } diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index e5e7fad09f32..6764114b886c 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -1126,7 +1126,8 @@ static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) 0xffffffull), psn = val & mask; if (expct) - psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK); + psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) | + ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK); else psn = psn + frags; return psn & mask; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 7b87b77582bd..ab97d71cdd92 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -161,6 +161,7 @@ MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the */ const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE, [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, [IB_WR_SEND] = IB_WC_SEND, [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, @@ -203,6 +204,12 @@ const u8 hdr_len_by_opcode[256] = { [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4, [IB_OPCODE_TID_RDMA_READ_REQ] = 12 + 8 + 36, [IB_OPCODE_TID_RDMA_READ_RESP] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_REQ] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_RESP] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_DATA] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_ACK] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_RESYNC] = 12 + 8 + 36, /* UC */ [IB_OPCODE_UC_SEND_FIRST] = 12 + 8, [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8, @@ -248,8 +255,14 @@ static const opcode_handler opcode_handler_tbl[256] = { [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv, /* TID RDMA has separate handlers for different opcodes.*/ + [IB_OPCODE_TID_RDMA_WRITE_REQ] = &hfi1_rc_rcv_tid_rdma_write_req, + [IB_OPCODE_TID_RDMA_WRITE_RESP] = &hfi1_rc_rcv_tid_rdma_write_resp, + [IB_OPCODE_TID_RDMA_WRITE_DATA] = &hfi1_rc_rcv_tid_rdma_write_data, + [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data, [IB_OPCODE_TID_RDMA_READ_REQ] = &hfi1_rc_rcv_tid_rdma_read_req, [IB_OPCODE_TID_RDMA_READ_RESP] = &hfi1_rc_rcv_tid_rdma_read_resp, + [IB_OPCODE_TID_RDMA_RESYNC] = &hfi1_rc_rcv_tid_rdma_resync, + [IB_OPCODE_TID_RDMA_ACK] = &hfi1_rc_rcv_tid_rdma_ack, /* UC */ [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv, @@ -1332,7 +1345,9 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd) rdi->dparms.props.max_mr_size = U64_MAX; rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX; rdi->dparms.props.max_qp = hfi1_max_qps; - rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs; + rdi->dparms.props.max_qp_wr = + (hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ? + HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs); rdi->dparms.props.max_send_sge = hfi1_max_sges; rdi->dparms.props.max_recv_sge = hfi1_max_sges; rdi->dparms.props.max_sge_rd = hfi1_max_sges; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index bee3d21a548e..62ace0b2d17a 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -193,6 +193,7 @@ struct hfi1_qp_priv { u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */ u32 r_tid_alloc; /* Request for which we are allocating resources */ u32 pending_tid_w_segs; /* Num of pending tid write segments */ + u32 pending_tid_w_resp; /* Num of pending tid write responses */ u32 alloc_w_segs; /* Number of segments for which write */ /* resources have been allocated for this QP */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 4ee612ab6cb4..f0fbd4063fef 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -246,6 +246,7 @@ struct rvt_ack_entry { #define RVT_OPERATION_ATOMIC_SGE 0x00000004 #define RVT_OPERATION_LOCAL 0x00000008 #define RVT_OPERATION_USE_RESERVE 0x00000010 +#define RVT_OPERATION_IGN_RNR_CNT 0x00000020 #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) From c6c231175ccdf188d443c27e5456b9e2f65e44d4 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:49 -0800 Subject: [PATCH 20/23] IB/hfi1: Add interlock between TID RDMA WRITE and other requests This locking mechanism is designed to provent vavious memory corruption scenarios from occurring when requests are pipelined, especially when RDMA WRITE requests are interleaved with TID RDMA READ requests: 1. READ-AFTER-READ; 2. READ-AFTER-WRITE; 3. WRITE-AFTER-READ; 4. WRITE-AFTER-WRITE. When memory corruption is likely, a request will be held back until previous requests have been completed. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 6 ++++ drivers/infiniband/hw/hfi1/tid_rdma.c | 46 +++++++++++++++++++++++++-- drivers/infiniband/hw/hfi1/tid_rdma.h | 9 ++++++ 3 files changed, 59 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 6d2abea896e5..cfb863364f50 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -173,6 +173,12 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, } e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + /* Check for tid write fence */ + if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) || + hfi1_tid_rdma_ack_interlock(qp, e)) { + iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB); + goto bail; + } if (e->opcode == OP(RDMA_READ_REQUEST)) { /* * If a RDMA read response is being resent and diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 490e47a0f68b..286752011f25 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2179,6 +2179,7 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, req->state = TID_REQUEST_RESEND; req->cur_seg = req->comp_seg; } + qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; } /* Re-process old requests.*/ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) @@ -3229,6 +3230,7 @@ bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) struct rvt_swqe *prev; struct hfi1_qp_priv *priv = qp->priv; u32 s_prev; + struct tid_rdma_request *req; s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1; prev = rvt_get_swqe_ptr(qp, s_prev); @@ -3240,14 +3242,28 @@ bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: case IB_WR_RDMA_WRITE: + switch (prev->wr.opcode) { + case IB_WR_TID_RDMA_WRITE: + req = wqe_to_tid_req(prev); + if (req->ack_seg != req->total_segs) + goto interlock; + default: + break; + } case IB_WR_RDMA_READ: - break; + if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE) + break; + /* fall through */ case IB_WR_TID_RDMA_READ: switch (prev->wr.opcode) { case IB_WR_RDMA_READ: if (qp->s_acked != qp->s_cur) goto interlock; break; + case IB_WR_TID_RDMA_WRITE: + req = wqe_to_tid_req(prev); + if (req->ack_seg != req->total_segs) + goto interlock; default: break; } @@ -5157,7 +5173,9 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, e = &qp->s_ack_queue[qpriv->r_tid_ack]; req = ack_to_tid_req(e); flow = req->acked_tail; - } + } else if (req->ack_seg == req->total_segs && + qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) + qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1, &bth2); @@ -5310,3 +5328,27 @@ bool hfi1_schedule_tid_send(struct rvt_qp *qp) IOWAIT_PENDING_TID); return false; } + +bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e) +{ + struct rvt_ack_entry *prev; + struct tid_rdma_request *req; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_qp_priv *priv = qp->priv; + u32 s_prev; + + s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) : + (qp->s_tail_ack_queue - 1); + prev = &qp->s_ack_queue[s_prev]; + + if ((e->opcode == TID_OP(READ_REQ) || + e->opcode == OP(RDMA_READ_REQUEST)) && + prev->opcode == TID_OP(WRITE_REQ)) { + req = ack_to_tid_req(prev); + if (req->ack_seg != req->total_segs) { + priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK; + return true; + } + } + return false; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 7f8f17ba6c14..44468188a374 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -25,6 +25,7 @@ * s_flags, there are no collisions. * * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock + * HFI1_R_TID_WAIT_INTERLCK - QP is waiting for responder interlock */ #define HFI1_S_TID_BUSY_SET BIT(0) /* BIT(1) reserved for RVT_S_BUSY. */ @@ -32,9 +33,15 @@ /* BIT(3) reserved for RVT_S_RESP_PENDING. */ /* BIT(4) reserved for RVT_S_ACK_PENDING. */ #define HFI1_S_TID_WAIT_INTERLCK BIT(5) +#define HFI1_R_TID_WAIT_INTERLCK BIT(6) /* BIT(7) - BIT(15) reserved for RVT_S_WAIT_*. */ +/* BIT(16) reserved for RVT_S_SEND_ONE */ #define HFI1_S_TID_RETRY_TIMER BIT(17) +/* BIT(18) reserved for RVT_S_ECN. */ #define HFI1_R_TID_SW_PSN BIT(19) +/* BIT(26) reserved for HFI1_S_WAIT_HALT */ +/* BIT(27) reserved for HFI1_S_WAIT_TID_RESP */ +/* BIT(28) reserved for HFI1_S_WAIT_TID_SPACE */ /* * Unlike regular IB RDMA VERBS, which do not require an entry @@ -309,4 +316,6 @@ void _hfi1_do_tid_send(struct work_struct *work); bool hfi1_schedule_tid_send(struct rvt_qp *qp); +bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e); + #endif /* HFI1_TID_RDMA_H */ From ad00889e7ca226a2bed2b210f17c93b7be1b1542 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:59 -0800 Subject: [PATCH 21/23] IB/hfi1: Enable TID RDMA WRITE protocol This patch enables TID RDMA WRITE protocol by converting a qualified RDMA WRITE request into a TID RDMA WRITE request internally: (1) The TID RDMA cability must be enabled; (2) The request must start on a 4K page boundary; (3) The request length must be a multiple of 4K and must be larger or equal to 256K. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 22 ++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 3 ++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 286752011f25..db3188f66dba 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -3322,6 +3322,18 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) new_opcode = IB_WR_TID_RDMA_READ; do_tid_rdma = true; } + } else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + /* + * TID RDMA is enabled for this RDMA WRITE request iff: + * 1. The remote address is page-aligned, + * 2. The length is larger than the minimum segment size, + * 3. The length is page-multiple. + */ + if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) && + !(wqe->length & ~PAGE_MASK)) { + new_opcode = IB_WR_TID_RDMA_WRITE; + do_tid_rdma = true; + } } if (do_tid_rdma) { @@ -3338,12 +3350,22 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) priv->tid_req.n_flows = remote->max_read; qpriv->tid_r_reqs++; wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1; + } else { + wqe->lpsn += priv->tid_req.total_segs - 1; + atomic_inc(&qpriv->n_requests); } priv->tid_req.cur_seg = 0; priv->tid_req.comp_seg = 0; priv->tid_req.ack_seg = 0; priv->tid_req.state = TID_REQUEST_INACTIVE; + /* + * Reset acked_tail. + * TID RDMA READ does not have ACKs so it does not + * update the pointer. We have to reset it so TID RDMA + * WRITE does not get confused. + */ + priv->tid_req.acked_tail = priv->tid_req.setup_head; trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode, wqe->psn, wqe->lpsn, &priv->tid_req); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 44468188a374..53ab24ef4f02 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -266,7 +266,8 @@ static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) { if (wqe->priv && - wqe->wr.opcode == IB_WR_RDMA_READ && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_RDMA_WRITE) && wqe->length >= TID_RDMA_MIN_SEGMENT_SIZE) setup_tid_rdma_wqe(qp, wqe); } From a05c9bdcfd16cec3a004cca339ab45de4cdf4799 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:52:09 -0800 Subject: [PATCH 22/23] IB/hfi1: Add static trace for TID RDMA WRITE protocol This patch makes the following changes to the static trace: 1. Adds the decoding of TID RDMA WRITE packets in IB header trace; 2. Adds trace events for various stages of the TID RDMA WRITE protocol. These events provide a fine-grained control for monitoring and debugging the hfi1 driver in the filed. Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 12 + drivers/infiniband/hw/hfi1/tid_rdma.c | 88 ++++ drivers/infiniband/hw/hfi1/trace.c | 66 +++ drivers/infiniband/hw/hfi1/trace_ibhdrs.h | 6 + drivers/infiniband/hw/hfi1/trace_tid.h | 517 +++++++++++++++++++++- drivers/infiniband/hw/hfi1/trace_tx.h | 6 + 6 files changed, 692 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index cfb863364f50..82afa7736be7 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -162,6 +162,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, qp->s_acked_ack_queue == qp->s_tail_ack_queue) qp->s_acked_ack_queue = next; qp->s_tail_ack_queue = next; + trace_hfi1_rsp_make_rc_ack(qp, e->psn); /* FALLTHROUGH */ case OP(SEND_ONLY): case OP(ACKNOWLEDGE): @@ -263,6 +264,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bth2 = mask_psn(e->psn); e->sent = 1; } + trace_hfi1_tid_write_rsp_make_rc_ack(qp); bth0 = qp->s_ack_state << 24; break; @@ -335,6 +337,8 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, hwords += hdrlen; bth0 = qp->s_ack_state << 24; qp->s_ack_rdma_psn++; + trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn, + e->lpsn, req); if (req->cur_seg != req->total_segs) break; @@ -761,6 +765,11 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) delta_psn(wqe->lpsn, bth2) + 1; } + trace_hfi1_tid_write_sender_make_req(qp, newreq); + trace_hfi1_tid_req_make_req_write(qp, newreq, + wqe->wr.opcode, + wqe->psn, wqe->lpsn, + req); if (++qp->s_cur == qp->s_size) qp->s_cur = 0; break; @@ -1070,6 +1079,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) priv->s_tid_cur = qp->s_cur; if (++qp->s_cur == qp->s_size) qp->s_cur = 0; + trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); break; case TID_OP(READ_RESP): @@ -1625,6 +1636,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) wqe = do_rc_completion(qp, wqe, ibp); qp->s_flags &= ~RVT_S_WAIT_ACK; } else { + trace_hfi1_tid_write_sender_restart_rc(qp, 0); if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { struct tid_rdma_request *req; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index db3188f66dba..a49eb3d9b5b9 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2962,6 +2962,12 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, e = &qp->s_ack_queue[qpriv->r_tid_tail]; req = ack_to_tid_req(e); flow = &req->flows[req->clear_tail]; + trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn); + trace_hfi1_rsp_handle_kdeth_eflags(qp, psn); + trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp); + trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn, + e->lpsn, req); + trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow); switch (rcv_type) { case RHF_RCV_TYPE_EXPECTED: @@ -3489,6 +3495,8 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) lockdep_assert_held(&qp->s_lock); while (1) { + trace_hfi1_rsp_tid_write_alloc_res(qp, 0); + trace_hfi1_tid_write_rsp_alloc_res(qp); /* * Don't allocate more segments if a RNR NAK has already been * scheduled to avoid messing up qp->r_psn: the RNR NAK will @@ -3517,6 +3525,8 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) if (e->opcode != TID_OP(WRITE_REQ)) goto next_req; req = ack_to_tid_req(e); + trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn, + e->lpsn, req); /* Finished allocating for all segments of this request */ if (req->alloc_seg >= req->total_segs) goto next_req; @@ -3633,6 +3643,7 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) */ qp->s_flags &= ~(RVT_S_ACK_PENDING); + trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn); /* * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be @@ -3686,6 +3697,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) is_fecn = process_ecn(qp, packet); psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + trace_hfi1_rsp_rcv_tid_write_req(qp, psn); if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) rvt_comm_est(qp); @@ -3794,6 +3806,9 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) qp->r_msn++; qp->r_psn++; + trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn, + req); + if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) { qpriv->r_tid_tail = qp->r_head_ack_queue; } else if (qpriv->r_tid_tail == qpriv->r_tid_head) { @@ -3814,6 +3829,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) qpriv->r_tid_head = qp->r_head_ack_queue; hfi1_tid_write_alloc_resources(qp, true); + trace_hfi1_tid_write_rsp_rcv_req(qp); /* Schedule the send tasklet. */ qp->s_flags |= RVT_S_RESP_PENDING; @@ -3855,6 +3871,10 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, void *resp_addr = NULL; struct tid_rdma_params *remote; + trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn, + req); + trace_hfi1_tid_write_rsp_build_resp(qp); + trace_hfi1_rsp_build_tid_write_resp(qp, bth2); flow = &req->flows[req->flow_idx]; switch (req->state) { default: @@ -3876,12 +3896,14 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, goto done; req->state = TID_REQUEST_ACTIVE; + trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); hfi1_add_tid_reap_timer(qp); break; case TID_REQUEST_RESEND_ACTIVE: case TID_REQUEST_RESEND: + trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS)) req->state = TID_REQUEST_ACTIVE; @@ -3996,6 +4018,9 @@ static void hfi1_tid_timeout(struct timer_list *t) if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n", qp->ibqp.qp_num, __func__, __LINE__); + trace_hfi1_msg_tid_timeout(/* msg */ + qp, "resource timeout = ", + (u64)qpriv->tid_timer_timeout_jiffies); hfi1_stop_tid_reap_timer(qp); /* * Go though the entire ack queue and clear any outstanding @@ -4102,6 +4127,8 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) goto ack_done; + trace_hfi1_ack(qp, psn); + flow = &req->flows[req->setup_head]; flow->pkt = 0; flow->tid_idx = 0; @@ -4129,13 +4156,17 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) } memcpy(flow->tid_entry, packet->ebuf, pktlen); flow->tidcnt = pktlen / sizeof(*flow->tid_entry); + trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow); req->comp_seg++; + trace_hfi1_tid_write_sender_rcv_resp(qp, 0); /* * Walk the TID_ENTRY list to make sure we have enough space for a * complete segment. */ for (i = 0; i < flow->tidcnt; i++) { + trace_hfi1_tid_entry_rcv_write_resp(/* entry */ + qp, i, flow->tid_entry[i]); if (!EXP_TID_GET(flow->tid_entry[i], LEN)) { status = IB_WC_LOC_LEN_ERR; goto ack_err; @@ -4147,6 +4178,8 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) goto ack_err; } + trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); /* * If this is the first response for this request, set the initial * flow index to the current flow. @@ -4221,6 +4254,8 @@ bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, next_offset = flow->tid_offset + *len; last_pkt = (flow->tid_idx == (flow->tidcnt - 1) && next_offset >= tidlen) || (flow->sent >= flow->length); + trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry); + trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow); rcu_read_lock(); remote = rcu_dereference(qpriv->tid_rdma.remote); @@ -4303,6 +4338,10 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) * - The entire request is complete and there are no more requests * (of any kind) in the queue. */ + trace_hfi1_rsp_rcv_tid_write_data(qp, psn); + trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn, + req); + trace_hfi1_tid_write_rsp_rcv_data(qp); if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) priv->r_tid_ack = priv->r_tid_tail; @@ -4451,6 +4490,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) unsigned long flags; u16 fidx; + trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0); is_fecn = process_ecn(qp, packet); psn = mask_psn(be32_to_cpu(ohdr->bth[2])); aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth); @@ -4458,6 +4498,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn)); spin_lock_irqsave(&qp->s_lock, flags); + trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn); /* If we are waiting for an ACK to RESYNC, drop any other packets */ if ((qp->s_flags & HFI1_S_WAIT_HALT) && @@ -4480,7 +4521,10 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) goto ack_op_err; req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); flow = &req->flows[req->acked_tail]; + trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); /* Drop stale ACK/NAK */ if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0) @@ -4493,11 +4537,14 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) /* advance acked segment pointer */ req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS); req->r_last_acked = flow->flow_state.resp_ib_psn; + trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); if (req->ack_seg == req->total_segs) { req->state = TID_REQUEST_COMPLETE; wqe = do_rc_completion(qp, wqe, to_iport(qp->ibqp.device, qp->port_num)); + trace_hfi1_sender_rcv_tid_ack(qp); atomic_dec(&qpriv->n_tid_requests); if (qp->s_acked == qp->s_tail) break; @@ -4506,8 +4553,11 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) req = wqe_to_tid_req(wqe); } flow = &req->flows[req->acked_tail]; + trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); } + trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); switch (aeth >> 29) { case 0: /* ACK */ if (qpriv->s_flags & RVT_S_WAIT_ACK) @@ -4614,6 +4664,9 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) flow->pkt = 0; spsn += flow->npkts; resync_psn += flow->npkts; + trace_hfi1_tid_flow_rcv_tid_ack(qp, + fidx, + flow); } if (++last_acked == qpriv->s_tid_cur + 1) break; @@ -4638,6 +4691,8 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) case 0: /* PSN sequence error */ flow = &req->flows[req->acked_tail]; fspsn = full_flow_psn(flow, flow->flow_state.spsn); + trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, + flow); req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2])); req->cur_seg = req->ack_seg; qpriv->s_tid_tail = qp->s_acked; @@ -4713,16 +4768,28 @@ static void hfi1_tid_retry_timeout(struct timer_list *t) struct rvt_qp *qp = priv->owner; struct rvt_swqe *wqe; unsigned long flags; + struct tid_rdma_request *req; spin_lock_irqsave(&qp->r_lock, flags); spin_lock(&qp->s_lock); + trace_hfi1_tid_write_sender_retry_timeout(qp, 0); if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { hfi1_stop_tid_retry_timer(qp); if (!priv->s_retry) { + trace_hfi1_msg_tid_retry_timeout(/* msg */ + qp, + "Exhausted retries. Tid retry timeout = ", + (u64)priv->tid_retry_timeout_jiffies); + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } else { + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_tid_retry_timeout(/* req */ + qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req); + priv->s_flags &= ~RVT_S_WAIT_ACK; /* Only send one packet (the RESYNC) */ priv->s_flags |= RVT_S_SEND_ONE; @@ -4818,6 +4885,7 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) * sync point and the flow has/will be reprogrammed */ qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; + trace_hfi1_tid_write_rsp_rcv_resync(qp); /* * Reset all TID flow information with the new generation. @@ -4832,6 +4900,8 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) e = &qp->s_ack_queue[idx]; if (e->opcode == TID_OP(WRITE_REQ)) { req = ack_to_tid_req(e); + trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn, + e->lpsn, req); /* start from last unacked segment */ for (flow_idx = req->clear_tail; @@ -4854,6 +4924,8 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) full_flow_psn(flow, flow->flow_state.spsn); fs->psn += flow->npkts; + trace_hfi1_tid_flow_rcv_resync(qp, flow_idx, + flow); } } if (idx == qp->s_tail_ack_queue) @@ -4913,6 +4985,7 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) u8 opcode = TID_OP(WRITE_DATA); lockdep_assert_held(&qp->s_lock); + trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); /* * Prioritize the sending of the requests and responses over the * sending of the TID RDMA data packets. @@ -4970,6 +5043,8 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); switch (priv->s_state) { case TID_OP(WRITE_REQ): case TID_OP(WRITE_RESP): @@ -4997,6 +5072,8 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) * 3.2.2 Advance RESP pointers. * 3.3 Return indicating progress made. */ + trace_hfi1_sender_make_tid_pkt(qp); + trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); req = wqe_to_tid_req(wqe); len = wqe->length; @@ -5004,6 +5081,8 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) if (!req->comp_seg || req->cur_seg == req->comp_seg) goto bail; + trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2, &len); @@ -5028,6 +5107,7 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) break; case TID_OP(RESYNC): + trace_hfi1_sender_make_tid_pkt(qp); /* Use generation from the most recently received response */ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur); req = wqe_to_tid_req(wqe); @@ -5098,6 +5178,7 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, u16 flow; struct tid_rdma_request *req, *nreq; + trace_hfi1_tid_write_rsp_make_tid_ack(qp); /* Don't send an ACK if we aren't supposed to. */ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) goto bail; @@ -5128,6 +5209,9 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, req = ack_to_tid_req(e); } + trace_hfi1_rsp_make_tid_ack(qp, e->psn); + trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, + req); /* * If we've sent all the ACKs that we can, we are done * until we get more segments... @@ -5199,6 +5283,9 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; + trace_hfi1_tid_write_rsp_make_tid_ack(qp); + trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, + req); hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1, &bth2); len = 0; @@ -5251,6 +5338,7 @@ static void hfi1_do_tid_send(struct rvt_qp *qp) ps.in_thread = false; ps.timeout_int = qp->timeout_jiffies / 8; + trace_hfi1_rc_do_tid_send(qp, false); spin_lock_irqsave(&qp->s_lock, ps.flags); /* Return if we are already busy processing a work request. */ diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c index 28181d711fed..9a3d236bcc88 100644 --- a/drivers/infiniband/hw/hfi1/trace.c +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -133,6 +133,11 @@ const char *hfi1_trace_get_packet_l2_str(u8 l2) #define TID_RDMA_KDETH_DATA "kdeth0 0x%x: kver %u sh %u intr %u tidctrl %u tid %x offset %x kdeth1 0x%x: jkey %x" #define TID_READ_REQ_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x" #define TID_READ_RSP_PRN "verbs_qp 0x%x" +#define TID_WRITE_REQ_PRN "original_qp 0x%x" +#define TID_WRITE_RSP_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x" +#define TID_WRITE_DATA_PRN "verbs_qp 0x%x" +#define TID_ACK_PRN "tid_flow_psn 0x%x verbs_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x" +#define TID_RESYNC_PRN "verbs_qp 0x%x" #define OP(transport, op) IB_OPCODE_## transport ## _ ## op @@ -327,6 +332,45 @@ const char *parse_everbs_hdrs( parse_syndrome(be32_to_cpu(eh->aeth) >> 24), be32_to_cpu(eh->aeth) & IB_MSN_MASK); break; + case OP(TID_RDMA, WRITE_REQ): + trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " " + TID_WRITE_REQ_PRN, + le32_to_cpu(eh->tid_rdma.w_req.kdeth0), + le32_to_cpu(eh->tid_rdma.w_req.kdeth1), + ib_u64_get(&eh->tid_rdma.w_req.reth.vaddr), + be32_to_cpu(eh->tid_rdma.w_req.reth.rkey), + be32_to_cpu(eh->tid_rdma.w_req.reth.length), + be32_to_cpu(eh->tid_rdma.w_req.verbs_qp)); + break; + case OP(TID_RDMA, WRITE_RESP): + trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " " + TID_WRITE_RSP_PRN, + le32_to_cpu(eh->tid_rdma.w_rsp.kdeth0), + le32_to_cpu(eh->tid_rdma.w_rsp.kdeth1), + be32_to_cpu(eh->tid_rdma.w_rsp.aeth) >> 24, + parse_syndrome(/* aeth */ + be32_to_cpu(eh->tid_rdma.w_rsp.aeth) + >> 24), + (be32_to_cpu(eh->tid_rdma.w_rsp.aeth) & + IB_MSN_MASK), + be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_psn), + be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_qp), + be32_to_cpu(eh->tid_rdma.w_rsp.verbs_qp)); + break; + case OP(TID_RDMA, WRITE_DATA_LAST): + case OP(TID_RDMA, WRITE_DATA): + trace_seq_printf(p, TID_RDMA_KDETH_DATA " " TID_WRITE_DATA_PRN, + le32_to_cpu(eh->tid_rdma.w_data.kdeth0), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, KVER), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, SH), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, INTR), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, TIDCTRL), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, TID), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, OFFSET), + le32_to_cpu(eh->tid_rdma.w_data.kdeth1), + KDETH_GET(eh->tid_rdma.w_data.kdeth1, JKEY), + be32_to_cpu(eh->tid_rdma.w_data.verbs_qp)); + break; case OP(TID_RDMA, READ_REQ): trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " " TID_READ_REQ_PRN, @@ -359,6 +403,28 @@ const char *parse_everbs_hdrs( IB_MSN_MASK), be32_to_cpu(eh->tid_rdma.r_rsp.verbs_qp)); break; + case OP(TID_RDMA, ACK): + trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " " + TID_ACK_PRN, + le32_to_cpu(eh->tid_rdma.ack.kdeth0), + le32_to_cpu(eh->tid_rdma.ack.kdeth1), + be32_to_cpu(eh->tid_rdma.ack.aeth) >> 24, + parse_syndrome(/* aeth */ + be32_to_cpu(eh->tid_rdma.ack.aeth) + >> 24), + (be32_to_cpu(eh->tid_rdma.ack.aeth) & + IB_MSN_MASK), + be32_to_cpu(eh->tid_rdma.ack.tid_flow_psn), + be32_to_cpu(eh->tid_rdma.ack.verbs_psn), + be32_to_cpu(eh->tid_rdma.ack.tid_flow_qp), + be32_to_cpu(eh->tid_rdma.ack.verbs_qp)); + break; + case OP(TID_RDMA, RESYNC): + trace_seq_printf(p, TID_RDMA_KDETH " " TID_RESYNC_PRN, + le32_to_cpu(eh->tid_rdma.resync.kdeth0), + le32_to_cpu(eh->tid_rdma.resync.kdeth1), + be32_to_cpu(eh->tid_rdma.resync.verbs_qp)); + break; /* aeth + atomicacketh */ case OP(RC, ATOMIC_ACKNOWLEDGE): trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN, diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h index 1116238bf24d..d1372cc66de6 100644 --- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h +++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h @@ -79,8 +79,14 @@ __print_symbolic(opcode, \ ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ ib_opcode_name(RC_COMPARE_SWAP), \ ib_opcode_name(RC_FETCH_ADD), \ + ib_opcode_name(TID_RDMA_WRITE_REQ), \ + ib_opcode_name(TID_RDMA_WRITE_RESP), \ + ib_opcode_name(TID_RDMA_WRITE_DATA), \ + ib_opcode_name(TID_RDMA_WRITE_DATA_LAST), \ ib_opcode_name(TID_RDMA_READ_REQ), \ ib_opcode_name(TID_RDMA_READ_RESP), \ + ib_opcode_name(TID_RDMA_RESYNC), \ + ib_opcode_name(TID_RDMA_ACK), \ ib_opcode_name(UC_SEND_FIRST), \ ib_opcode_name(UC_SEND_MIDDLE), \ ib_opcode_name(UC_SEND_LAST), \ diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index a45b5257d6c4..548dfc45a407 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -56,16 +56,33 @@ u16 hfi1_trace_get_tid_idx(u32 ent); "fpsn 0x%x flow_flags 0x%x" #define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \ - "cur_seg %u comp_seg %u ack_seg %u " \ + "cur_seg %u comp_seg %u ack_seg %u alloc_seg %u " \ "total_segs %u setup_head %u clear_tail %u flow_idx %u " \ - "state %u r_flow_psn 0x%x " \ - "s_next_psn 0x%x" + "acked_tail %u state %u r_ack_psn 0x%x r_flow_psn 0x%x " \ + "r_last_ackd 0x%x s_next_psn 0x%x" #define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \ "s_acked_ack_queue %u s_tail_ack_queue %u " \ "r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \ " diff %d" +#define TID_WRITE_RSPDR_PRN "[%s] qpn 0x%x r_tid_head %u r_tid_tail %u " \ + "r_tid_ack %u r_tid_alloc %u alloc_w_segs %u " \ + "pending_tid_w_segs %u sync_pt %s " \ + "ps_nak_psn 0x%x ps_nak_state 0x%x " \ + "prnr_nak_state 0x%x hw_flow_index %u generation "\ + "0x%x fpsn 0x%x flow_flags 0x%x resync %s" \ + "r_next_psn_kdeth 0x%x" + +#define TID_WRITE_SENDER_PRN "[%s] qpn 0x%x newreq %u s_tid_cur %u " \ + "s_tid_tail %u s_tid_head %u " \ + "pending_tid_w_resp %u n_requests %u " \ + "n_tid_requests %u s_flags 0x%x ps_flags 0x%x "\ + "iow_flags 0x%lx s_state 0x%x s_retry %u" + +#define KDETH_EFLAGS_ERR_PRN "[%s] qpn 0x%x TID ERR: RcvType 0x%x " \ + "RcvTypeError 0x%x PSN 0x%x" + DECLARE_EVENT_CLASS(/* class */ hfi1_exp_tid_reg_unreg, TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, @@ -382,6 +399,18 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, msg, more) ); +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_tid_timeout, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_tid_retry_timeout, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + DECLARE_EVENT_CLASS(/* tid_flow_page */ hfi1_tid_flow_page_template, TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index, @@ -562,6 +591,42 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, index, flow) ); +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_build_write_resp, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_write_resp, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_build_write_data, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_resync, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + DECLARE_EVENT_CLASS(/* tid_node */ hfi1_tid_node_template, TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base, @@ -656,6 +721,18 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, index, ent) ); +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_rcv_write_resp, + TP_PROTO(struct rvt_qp *qp, int index, u32 entry), + TP_ARGS(qp, index, entry) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_build_write_data, + TP_PROTO(struct rvt_qp *qp, int index, u32 entry), + TP_ARGS(qp, index, entry) +); + DECLARE_EVENT_CLASS(/* rsp_info */ hfi1_responder_info_template, TP_PROTO(struct rvt_qp *qp, u32 psn), @@ -738,6 +815,42 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, psn) ); +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_tid_write_alloc_res, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_req, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_build_tid_write_resp, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_data, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_make_tid_ack, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + DECLARE_EVENT_CLASS(/* sender_info */ hfi1_sender_info_template, TP_PROTO(struct rvt_qp *qp), @@ -830,6 +943,18 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp) ); +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_make_tid_pkt, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + DECLARE_EVENT_CLASS(/* tid_read_sender */ hfi1_tid_read_sender_template, TP_PROTO(struct rvt_qp *qp, char newreq), @@ -908,12 +1033,16 @@ DECLARE_EVENT_CLASS(/* tid_rdma_request */ __field(u32, cur_seg) __field(u32, comp_seg) __field(u32, ack_seg) + __field(u32, alloc_seg) __field(u32, total_segs) __field(u16, setup_head) __field(u16, clear_tail) __field(u16, flow_idx) + __field(u16, acked_tail) __field(u32, state) + __field(u32, r_ack_psn) __field(u32, r_flow_psn) + __field(u32, r_last_acked) __field(u32, s_next_psn) ), TP_fast_assign(/* assign */ @@ -926,12 +1055,16 @@ DECLARE_EVENT_CLASS(/* tid_rdma_request */ __entry->cur_seg = req->cur_seg; __entry->comp_seg = req->comp_seg; __entry->ack_seg = req->ack_seg; + __entry->alloc_seg = req->alloc_seg; __entry->total_segs = req->total_segs; __entry->setup_head = req->setup_head; __entry->clear_tail = req->clear_tail; __entry->flow_idx = req->flow_idx; + __entry->acked_tail = req->acked_tail; __entry->state = req->state; + __entry->r_ack_psn = req->r_ack_psn; __entry->r_flow_psn = req->r_flow_psn; + __entry->r_last_acked = req->r_last_acked; __entry->s_next_psn = req->s_next_psn; ), TP_printk(/* print */ @@ -945,12 +1078,16 @@ DECLARE_EVENT_CLASS(/* tid_rdma_request */ __entry->cur_seg, __entry->comp_seg, __entry->ack_seg, + __entry->alloc_seg, __entry->total_segs, __entry->setup_head, __entry->clear_tail, __entry->flow_idx, + __entry->acked_tail, __entry->state, + __entry->r_ack_psn, __entry->r_flow_psn, + __entry->r_last_acked, __entry->s_next_psn ) ); @@ -1004,6 +1141,97 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, newreq, opcode, psn, lpsn, req) ); +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_write_alloc_res, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_req, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_build_write_resp, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_resp, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_data, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_tid_retry_timeout, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_resync, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_pkt, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_ack, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_rc_ack_write, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_write, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + DECLARE_EVENT_CLASS(/* rc_rcv_err */ hfi1_rc_rcv_err_template, TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff), @@ -1090,6 +1318,289 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, index, sge) ); +DECLARE_EVENT_CLASS(/* tid_write_sp */ + hfi1_tid_write_rsp_template, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, r_tid_head) + __field(u32, r_tid_tail) + __field(u32, r_tid_ack) + __field(u32, r_tid_alloc) + __field(u32, alloc_w_segs) + __field(u32, pending_tid_w_segs) + __field(bool, sync_pt) + __field(u32, ps_nak_psn) + __field(u8, ps_nak_state) + __field(u8, prnr_nak_state) + __field(u32, hw_flow_index) + __field(u32, generation) + __field(u32, fpsn) + __field(u32, flow_flags) + __field(bool, resync) + __field(u32, r_next_psn_kdeth) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->r_tid_head = priv->r_tid_head; + __entry->r_tid_tail = priv->r_tid_tail; + __entry->r_tid_ack = priv->r_tid_ack; + __entry->r_tid_alloc = priv->r_tid_alloc; + __entry->alloc_w_segs = priv->alloc_w_segs; + __entry->pending_tid_w_segs = priv->pending_tid_w_segs; + __entry->sync_pt = priv->sync_pt; + __entry->ps_nak_psn = priv->s_nak_psn; + __entry->ps_nak_state = priv->s_nak_state; + __entry->prnr_nak_state = priv->rnr_nak_state; + __entry->hw_flow_index = priv->flow_state.index; + __entry->generation = priv->flow_state.generation; + __entry->fpsn = priv->flow_state.psn; + __entry->flow_flags = priv->flow_state.flags; + __entry->resync = priv->resync; + __entry->r_next_psn_kdeth = priv->r_next_psn_kdeth; + ), + TP_printk(/* print */ + TID_WRITE_RSPDR_PRN, + __get_str(dev), + __entry->qpn, + __entry->r_tid_head, + __entry->r_tid_tail, + __entry->r_tid_ack, + __entry->r_tid_alloc, + __entry->alloc_w_segs, + __entry->pending_tid_w_segs, + __entry->sync_pt ? "yes" : "no", + __entry->ps_nak_psn, + __entry->ps_nak_state, + __entry->prnr_nak_state, + __entry->hw_flow_index, + __entry->generation, + __entry->fpsn, + __entry->flow_flags, + __entry->resync ? "yes" : "no", + __entry->r_next_psn_kdeth + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_alloc_res, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_req, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_build_resp, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_data, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_resync, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_tid_ack, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_rc_ack, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DECLARE_EVENT_CLASS(/* tid_write_sender */ + hfi1_tid_write_sender_template, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(char, newreq) + __field(u32, s_tid_cur) + __field(u32, s_tid_tail) + __field(u32, s_tid_head) + __field(u32, pending_tid_w_resp) + __field(u32, n_requests) + __field(u32, n_tid_requests) + __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) + __field(u8, s_state) + __field(u8, s_retry) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->newreq = newreq; + __entry->s_tid_cur = priv->s_tid_cur; + __entry->s_tid_tail = priv->s_tid_tail; + __entry->s_tid_head = priv->s_tid_head; + __entry->pending_tid_w_resp = priv->pending_tid_w_resp; + __entry->n_requests = atomic_read(&priv->n_requests); + __entry->n_tid_requests = atomic_read(&priv->n_tid_requests); + __entry->s_flags = qp->s_flags; + __entry->ps_flags = priv->s_flags; + __entry->iow_flags = priv->s_iowait.flags; + __entry->s_state = priv->s_state; + __entry->s_retry = priv->s_retry; + ), + TP_printk(/* print */ + TID_WRITE_SENDER_PRN, + __get_str(dev), + __entry->qpn, + __entry->newreq, + __entry->s_tid_cur, + __entry->s_tid_tail, + __entry->s_tid_head, + __entry->pending_tid_w_resp, + __entry->n_requests, + __entry->n_tid_requests, + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags, + __entry->s_state, + __entry->s_retry + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_resp, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_retry_timeout, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_tid_pkt, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_req, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_restart_rc, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DECLARE_EVENT_CLASS(/* tid_ack */ + hfi1_tid_ack_template, + TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, + u32 req_psn, u32 resync_psn), + TP_ARGS(qp, aeth, psn, req_psn, resync_psn), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, aeth) + __field(u32, psn) + __field(u32, req_psn) + __field(u32, resync_psn) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->aeth = aeth; + __entry->psn = psn; + __entry->req_psn = req_psn; + __entry->resync_psn = resync_psn; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x aeth 0x%x psn 0x%x req_psn 0x%x resync_psn 0x%x", + __get_str(dev), + __entry->qpn, + __entry->aeth, + __entry->psn, + __entry->req_psn, + __entry->resync_psn + ) +); + +DEFINE_EVENT(/* rcv_tid_ack */ + hfi1_tid_ack_template, hfi1_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, + u32 req_psn, u32 resync_psn), + TP_ARGS(qp, aeth, psn, req_psn, resync_psn) +); + +DECLARE_EVENT_CLASS(/* kdeth_eflags_error */ + hfi1_kdeth_eflags_error_template, + TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn), + TP_ARGS(qp, rcv_type, rte, psn), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u8, rcv_type) + __field(u8, rte) + __field(u32, psn) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->rcv_type = rcv_type; + __entry->rte = rte; + __entry->psn = psn; + ), + TP_printk(/* print */ + KDETH_EFLAGS_ERR_PRN, + __get_str(dev), + __entry->qpn, + __entry->rcv_type, + __entry->rte, + __entry->psn + ) +); + +DEFINE_EVENT(/* event */ + hfi1_kdeth_eflags_error_template, hfi1_eflags_err_write, + TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn), + TP_ARGS(qp, rcv_type, rte, psn) +); + #endif /* __HFI1_TRACE_TID_H */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h index 37dbb3e599c3..09eb0c9ada00 100644 --- a/drivers/infiniband/hw/hfi1/trace_tx.h +++ b/drivers/infiniband/hw/hfi1/trace_tx.h @@ -846,6 +846,12 @@ DEFINE_EVENT( TP_ARGS(qp, flag) ); +DEFINE_EVENT(/* event */ + hfi1_do_send_template, hfi1_rc_do_tid_send, + TP_PROTO(struct rvt_qp *qp, bool flag), + TP_ARGS(qp, flag) +); + DEFINE_EVENT( hfi1_do_send_template, hfi1_rc_expired_time_slice, TP_PROTO(struct rvt_qp *qp, bool flag), From 34025fb0c4c9d6b2e294f8f8f0a82491a13c83a2 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:52:19 -0800 Subject: [PATCH 23/23] IB/hfi1: Prioritize the sending of ACK packets ACK packets are generally associated with request completion and resource release and therefore should be sent first. This patch optimizes the send engine by using the following policies: (1) QPs with RVT_S_ACK_PENDING bit set in qp->s_flags or qpriv->s_flags should have their priority incremented; (2) QPs with ACK or TID-ACK packet queued should have their priority incremented; (3) When a QP is queued to the wait list due to resource constraints, it will be queued to the head if it has ACK packet to send; (4) When selecting qps to run from the wait list, the one with the highest priority and starve_cnt will be selected; each priority will be equivalent to a fixed number of starve_cnt (16). Reviewed-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/iowait.c | 34 ++++++++- drivers/infiniband/hw/hfi1/iowait.h | 87 ++++++++++++++++-------- drivers/infiniband/hw/hfi1/pio.c | 18 +++-- drivers/infiniband/hw/hfi1/qp.c | 15 +++- drivers/infiniband/hw/hfi1/rc.c | 1 + drivers/infiniband/hw/hfi1/sdma.c | 24 ++++--- drivers/infiniband/hw/hfi1/sdma_txreq.h | 1 + drivers/infiniband/hw/hfi1/tid_rdma.c | 1 + drivers/infiniband/hw/hfi1/user_sdma.c | 6 +- drivers/infiniband/hw/hfi1/verbs.c | 1 + drivers/infiniband/hw/hfi1/verbs_txreq.h | 1 + drivers/infiniband/hw/hfi1/vnic_sdma.c | 6 +- 12 files changed, 144 insertions(+), 51 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c index 582f1ba136ff..adb4a1ba921b 100644 --- a/drivers/infiniband/hw/hfi1/iowait.c +++ b/drivers/infiniband/hw/hfi1/iowait.c @@ -6,6 +6,9 @@ #include "iowait.h" #include "trace_iowait.h" +/* 1 priority == 16 starve_cnt */ +#define IOWAIT_PRIORITY_STARVE_SHIFT 4 + void iowait_set_flag(struct iowait *wait, u32 flag) { trace_hfi1_iowait_set(wait, flag); @@ -44,7 +47,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit, uint seq, bool pkts_sent), void (*wakeup)(struct iowait *wait, int reason), - void (*sdma_drained)(struct iowait *wait)) + void (*sdma_drained)(struct iowait *wait), + void (*init_priority)(struct iowait *wait)) { int i; @@ -58,6 +62,7 @@ void iowait_init(struct iowait *wait, u32 tx_limit, wait->sleep = sleep; wait->wakeup = wakeup; wait->sdma_drained = sdma_drained; + wait->init_priority = init_priority; wait->flags = 0; for (i = 0; i < IOWAIT_SES; i++) { wait->wait[i].iow = wait; @@ -92,3 +97,30 @@ int iowait_set_work_flag(struct iowait_work *w) iowait_set_flag(w->iow, IOWAIT_PENDING_TID); return IOWAIT_TID_SE; } + +/** + * iowait_priority_update_top - update the top priority entry + * @w: the iowait struct + * @top: a pointer to the top priority entry + * @idx: the index of the current iowait in an array + * @top_idx: the array index for the iowait entry that has the top priority + * + * This function is called to compare the priority of a given + * iowait with the given top priority entry. The top index will + * be returned. + */ +uint iowait_priority_update_top(struct iowait *w, + struct iowait *top, + uint idx, uint top_idx) +{ + u8 cnt, tcnt; + + /* Convert priority into starve_cnt and compare the total.*/ + cnt = (w->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + w->starved_cnt; + tcnt = (top->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + + top->starved_cnt; + if (cnt > tcnt) + return idx; + else + return top_idx; +} diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h index bd913701761d..07847cb72169 100644 --- a/drivers/infiniband/hw/hfi1/iowait.h +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -100,6 +100,7 @@ struct iowait_work { * @sleep: no space callback * @wakeup: space callback wakeup * @sdma_drained: sdma count drained + * @init_priority: callback to manipulate priority * @lock: lock protected head of wait queue * @iowork: workqueue overhead * @wait_dma: wait for sdma_busy == 0 @@ -109,7 +110,7 @@ struct iowait_work { * @tx_limit: limit for overflow queuing * @tx_count: number of tx entry's in tx_head'ed list * @flags: wait flags (one per QP) - * @wait: SE array + * @wait: SE array for multiple legs * * This is to be embedded in user's state structure * (QP or PQ). @@ -120,10 +121,13 @@ struct iowait_work { * are callbacks for the ULP to implement * what ever queuing/dequeuing of * the embedded iowait and its containing struct - * when a resource shortage like SDMA ring space is seen. + * when a resource shortage like SDMA ring space + * or PIO credit space is seen. * * Both potentially have locks help - * so sleeping is not allowed. + * so sleeping is not allowed and it is not + * supported to submit txreqs from the wakeup + * call directly because of lock conflicts. * * The wait_dma member along with the iow * @@ -143,6 +147,7 @@ struct iowait { ); void (*wakeup)(struct iowait *wait, int reason); void (*sdma_drained)(struct iowait *wait); + void (*init_priority)(struct iowait *wait); seqlock_t *lock; wait_queue_head_t wait_dma; wait_queue_head_t wait_pio; @@ -152,6 +157,7 @@ struct iowait { u32 tx_limit; u32 tx_count; u8 starved_cnt; + u8 priority; unsigned long flags; struct iowait_work wait[IOWAIT_SES]; }; @@ -171,7 +177,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit, uint seq, bool pkts_sent), void (*wakeup)(struct iowait *wait, int reason), - void (*sdma_drained)(struct iowait *wait)); + void (*sdma_drained)(struct iowait *wait), + void (*init_priority)(struct iowait *wait)); /** * iowait_schedule() - schedule the default send engine work @@ -339,6 +346,8 @@ static inline u16 iowait_get_desc(struct iowait_work *w) tx = list_first_entry(&w->tx_head, struct sdma_txreq, list); num_desc = tx->num_desc; + if (tx->flags & SDMA_TXREQ_F_VIP) + w->iow->priority++; } return num_desc; } @@ -352,6 +361,37 @@ static inline u32 iowait_get_all_desc(struct iowait *w) return num_desc; } +static inline void iowait_update_priority(struct iowait_work *w) +{ + struct sdma_txreq *tx = NULL; + + if (!list_empty(&w->tx_head)) { + tx = list_first_entry(&w->tx_head, struct sdma_txreq, + list); + if (tx->flags & SDMA_TXREQ_F_VIP) + w->iow->priority++; + } +} + +static inline void iowait_update_all_priority(struct iowait *w) +{ + iowait_update_priority(&w->wait[IOWAIT_IB_SE]); + iowait_update_priority(&w->wait[IOWAIT_TID_SE]); +} + +static inline void iowait_init_priority(struct iowait *w) +{ + w->priority = 0; + if (w->init_priority) + w->init_priority(w); +} + +static inline void iowait_get_priority(struct iowait *w) +{ + iowait_init_priority(w); + iowait_update_all_priority(w); +} + /** * iowait_queue - Put the iowait on a wait queue * @pkts_sent: have some packets been sent before queuing? @@ -368,14 +408,18 @@ static inline void iowait_queue(bool pkts_sent, struct iowait *w, /* * To play fair, insert the iowait at the tail of the wait queue if it * has already sent some packets; Otherwise, put it at the head. + * However, if it has priority packets to send, also put it at the + * head. */ - if (pkts_sent) { - list_add_tail(&w->list, wait_head); + if (pkts_sent) w->starved_cnt = 0; - } else { - list_add(&w->list, wait_head); + else w->starved_cnt++; - } + + if (w->priority > 0 || !pkts_sent) + list_add(&w->list, wait_head); + else + list_add_tail(&w->list, wait_head); } /** @@ -392,27 +436,10 @@ static inline void iowait_starve_clear(bool pkts_sent, struct iowait *w) w->starved_cnt = 0; } -/** - * iowait_starve_find_max - Find the maximum of the starve count - * @w: the iowait struct - * @max: a variable containing the max starve count - * @idx: the index of the current iowait in an array - * @max_idx: a variable containing the array index for the - * iowait entry that has the max starve count - * - * This function is called to compare the starve count of a - * given iowait with the given max starve count. The max starve - * count and the index will be updated if the iowait's start - * count is larger. - */ -static inline void iowait_starve_find_max(struct iowait *w, u8 *max, - uint idx, uint *max_idx) -{ - if (w->starved_cnt > *max) { - *max = w->starved_cnt; - *max_idx = idx; - } -} +/* Update the top priority index */ +uint iowait_priority_update_top(struct iowait *w, + struct iowait *top, + uint idx, uint top_idx); /** * iowait_packet_queued() - determine if a packet is queued diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 04126d7e318d..a1de566fe95e 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1599,8 +1599,7 @@ static void sc_piobufavail(struct send_context *sc) struct rvt_qp *qp; struct hfi1_qp_priv *priv; unsigned long flags; - uint i, n = 0, max_idx = 0; - u8 max_starved_cnt = 0; + uint i, n = 0, top_idx = 0; if (dd->send_contexts[sc->sw_index].type != SC_KERNEL && dd->send_contexts[sc->sw_index].type != SC_VL15) @@ -1619,11 +1618,18 @@ static void sc_piobufavail(struct send_context *sc) if (n == ARRAY_SIZE(qps)) break; wait = list_first_entry(list, struct iowait, list); + iowait_get_priority(wait); qp = iowait_to_qp(wait); priv = qp->priv; list_del_init(&priv->s_iowait.list); priv->s_iowait.lock = NULL; - iowait_starve_find_max(wait, &max_starved_cnt, n, &max_idx); + if (n) { + priv = qps[top_idx]->priv; + top_idx = iowait_priority_update_top(wait, + &priv->s_iowait, + n, top_idx); + } + /* refcount held until actual wake up */ qps[n++] = qp; } @@ -1638,12 +1644,12 @@ static void sc_piobufavail(struct send_context *sc) } write_sequnlock_irqrestore(&sc->waitlock, flags); - /* Wake up the most starved one first */ + /* Wake up the top-priority one first */ if (n) - hfi1_qp_wakeup(qps[max_idx], + hfi1_qp_wakeup(qps[top_idx], RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN); for (i = 0; i < n; i++) - if (i != max_idx) + if (i != top_idx) hfi1_qp_wakeup(qps[i], RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN); } diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index cfd598e4b303..d8f7add935df 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -518,6 +518,7 @@ static int iowait_sleep( ibp->rvp.n_dmawait++; qp->s_flags |= RVT_S_WAIT_DMA_DESC; + iowait_get_priority(&priv->s_iowait); iowait_queue(pkts_sent, &priv->s_iowait, &sde->dmawait); priv->s_iowait.lock = &sde->waitlock; @@ -567,6 +568,17 @@ static void iowait_sdma_drained(struct iowait *wait) spin_unlock_irqrestore(&qp->s_lock, flags); } +static void hfi1_init_priority(struct iowait *w) +{ + struct rvt_qp *qp = iowait_to_qp(w); + struct hfi1_qp_priv *priv = qp->priv; + + if (qp->s_flags & RVT_S_ACK_PENDING) + w->priority++; + if (priv->s_flags & RVT_S_ACK_PENDING) + w->priority++; +} + /** * qp_to_sdma_engine - map a qp to a send engine * @qp: the QP @@ -727,7 +739,8 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) _hfi1_do_tid_send, iowait_sleep, iowait_wakeup, - iowait_sdma_drained); + iowait_sdma_drained, + hfi1_init_priority); return priv; } diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 82afa7736be7..e6726c1ab866 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -390,6 +390,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bth0 = OP(ACKNOWLEDGE) << 24; bth2 = mask_psn(qp->s_ack_psn); qp->s_flags &= ~RVT_S_ACK_PENDING; + ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; ps->s_txreq->ss = NULL; } qp->s_rdma_ack_cnt++; diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 96897a91fb0a..b0110728f541 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1747,10 +1747,9 @@ static inline u16 sdma_gethead(struct sdma_engine *sde) */ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) { - struct iowait *wait, *nw; + struct iowait *wait, *nw, *twait; struct iowait *waits[SDMA_WAIT_BATCH_SIZE]; - uint i, n = 0, seq, max_idx = 0; - u8 max_starved_cnt = 0; + uint i, n = 0, seq, tidx = 0; #ifdef CONFIG_SDMA_VERBOSITY dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, @@ -1775,13 +1774,20 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) continue; if (n == ARRAY_SIZE(waits)) break; + iowait_init_priority(wait); num_desc = iowait_get_all_desc(wait); if (num_desc > avail) break; avail -= num_desc; - /* Find the most starved wait memeber */ - iowait_starve_find_max(wait, &max_starved_cnt, - n, &max_idx); + /* Find the top-priority wait memeber */ + if (n) { + twait = waits[tidx]; + tidx = + iowait_priority_update_top(wait, + twait, + n, + tidx); + } list_del_init(&wait->list); waits[n++] = wait; } @@ -1790,12 +1796,12 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) } } while (read_seqretry(&sde->waitlock, seq)); - /* Schedule the most starved one first */ + /* Schedule the top-priority entry first */ if (n) - waits[max_idx]->wakeup(waits[max_idx], SDMA_AVAIL_REASON); + waits[tidx]->wakeup(waits[tidx], SDMA_AVAIL_REASON); for (i = 0; i < n; i++) - if (i != max_idx) + if (i != tidx) waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON); } diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h index bf7d777d756e..514a4784566b 100644 --- a/drivers/infiniband/hw/hfi1/sdma_txreq.h +++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h @@ -91,6 +91,7 @@ struct sdma_desc { #define SDMA_TXREQ_F_URGENT 0x0001 #define SDMA_TXREQ_F_AHG_COPY 0x0002 #define SDMA_TXREQ_F_USE_AHG 0x0004 +#define SDMA_TXREQ_F_VIP 0x0010 struct sdma_txreq; typedef void (*callback_t)(struct sdma_txreq *, int); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index a49eb3d9b5b9..bc2ff83026f7 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -5296,6 +5296,7 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, ps->s_txreq->ss = NULL; hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle, ps); + ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; return 1; bail: /* diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 6764114b886c..8bfbc6d7ea34 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -144,8 +144,10 @@ static int defer_packet_queue( */ xchg(&pq->state, SDMA_PKT_Q_DEFERRED); write_seqlock(&sde->waitlock); - if (list_empty(&pq->busy.list)) + if (list_empty(&pq->busy.list)) { + iowait_get_priority(&pq->busy); iowait_queue(pkts_sent, &pq->busy, &sde->dmawait); + } write_sequnlock(&sde->waitlock); return -EBUSY; eagain: @@ -191,7 +193,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, pq->mm = fd->mm; iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue, - activate_packet_queue, NULL); + activate_packet_queue, NULL, NULL); pq->reqidx = 0; pq->reqs = kcalloc(hfi1_sdma_comp_ring_size, diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index ab97d71cdd92..55a56b3d7f83 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -945,6 +945,7 @@ static int pio_wait(struct rvt_qp *qp, dev->n_piodrain += !!(flag & HFI1_S_WAIT_PIO_DRAIN); qp->s_flags |= flag; was_empty = list_empty(&sc->piowait); + iowait_get_priority(&priv->s_iowait); iowait_queue(ps->pkts_sent, &priv->s_iowait, &sc->piowait); priv->s_iowait.lock = &sc->waitlock; diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h index 2a77af26a231..b002e96eb335 100644 --- a/drivers/infiniband/hw/hfi1/verbs_txreq.h +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h @@ -94,6 +94,7 @@ static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev, tx->txreq.num_desc = 0; /* Set the header type */ tx->phdr.hdr.hdr_type = priv->hdr_type; + tx->txreq.flags = 0; return tx; } diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c index 1f81c480e028..af1b1ffcb38e 100644 --- a/drivers/infiniband/hw/hfi1/vnic_sdma.c +++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c @@ -240,8 +240,10 @@ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, } vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED; - if (list_empty(&vnic_sdma->wait.list)) + if (list_empty(&vnic_sdma->wait.list)) { + iowait_get_priority(wait->iow); iowait_queue(pkts_sent, wait->iow, &sde->dmawait); + } write_sequnlock(&sde->waitlock); return -EBUSY; } @@ -281,7 +283,7 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo) iowait_init(&vnic_sdma->wait, 0, NULL, NULL, hfi1_vnic_sdma_sleep, - hfi1_vnic_sdma_wakeup, NULL); + hfi1_vnic_sdma_wakeup, NULL, NULL); vnic_sdma->sde = &vinfo->dd->per_sdma[i]; vnic_sdma->dd = vinfo->dd; vnic_sdma->vinfo = vinfo;