/**
 * DOC: TID RDMA READ protocol
 *
 * This is an end-to-end protocol at the hfi1 level between two nodes that
 * improves performance by avoiding data copy on the requester side. It
 * converts a qualified RDMA READ request into a TID RDMA READ request on
 * the requester side and thereafter handles the request and response
 * differently. To be qualified, the RDMA READ request should meet the
 * following:
 * -- The total data length should be greater than 256K;
 * -- The total data length should be a multiple of 4K page size;
 * -- Each local scatter-gather entry should be 4K page aligned;
 * -- Each local scatter-gather entry should be a multiple of 4K page size;
*/
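
/*
 * A minimal sketch (not part of the driver) of the qualification test
 * described above; the function name is hypothetical, and the real
 * per-sge alignment checks live in hfi1_check_sge_align():
 */
static inline bool tid_read_qualifies(u64 total_len, u64 sge_addr, u64 sge_len)
{
	return total_len > SZ_256K &&		/* longer than 256K */
	       !(total_len & (SZ_4K - 1)) &&	/* multiple of 4K */
	       !(sge_addr & (SZ_4K - 1)) &&	/* sge 4K page aligned */
	       !(sge_len & (SZ_4K - 1));	/* sge multiple of 4K */
}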
/* Maximum number of packets within a flow generation. */
#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)
#define GENERATION_MASK 0xFFFFF
static u32 mask_generation(u32 a)
{
	return a & GENERATION_MASK;
}
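
/* e.g. mask_generation(0x12345678) == 0x45678: only the low 20 bits remain */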
/* Reserved generation value to set to unused flows for kernel contexts */
#define KERN_GENERATION_RESERVED mask_generation(U32_MAX)
/*
 * J_KEY for kernel contexts when TID RDMA is used.
 * See generate_jkey() in hfi.h for more information.
 */
#define TID_RDMA_JKEY 32
#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)
/* Maximum number of segments in flight per QP request. */
#define TID_RDMA_MAX_READ_SEGS_PER_REQ  6
#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
			TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1)
/*
 * OPFN TID layout
 *
 * 63               47               31               15
 * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC
 * 3210987654321098 7654321098765432 1098765432109876 5432109876543210
 * N - the context Number
 * K - the Kdeth_qp
 * M - Max_len
 * T - Timeout
 * D - reserveD
 * V - version
 * U - Urg capable
 * J - Jkey
 * R - max_Read
 * W - max_Write
 * C - Capcode
*/
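
/*
 * Illustrative field extraction for the layout above (a sketch; the
 * driver's real helpers are tid_rdma_opfn_encode()/tid_rdma_opfn_decode(),
 * and these opfn_tid_*() names are hypothetical):
 */
static inline u8 opfn_tid_capcode(u64 d)   { return d & 0xf; }           /* C */
static inline u8 opfn_tid_max_write(u64 d) { return (d >> 4) & 0x3f; }   /* W */
static inline u8 opfn_tid_max_read(u64 d)  { return (d >> 10) & 0x3f; }  /* R */
static inline u32 opfn_tid_jkey(u64 d)     { return (d >> 16) & 0x3f; }  /* J */
static inline u32 opfn_tid_timeout(u64 d)  { return (d >> 32) & 0x1f; }  /* T */
static inline u32 opfn_tid_max_len(u64 d)  { return (d >> 37) & 0x7ff; } /* M */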
old = rcu_dereference_protected(priv->tid_rdma.remote,
lockdep_is_held(&priv->opfn.lock));
	data &= ~0xfULL;
	/*
	 * If data passed in is zero, return true so as not to continue the
	 * negotiation process
	 */
	if (!data || !HFI1_CAP_IS_KSET(TID_RDMA))
		goto null;
	/*
	 * If kzalloc fails, return false. This will result in:
	 * * at the requester a new OPFN request being generated to retry
	 *   the negotiation
	 * * at the responder, 0 being returned to the requester so as to
	 *   disable TID RDMA at both the requester and the responder
*/
	remote = kzalloc(sizeof(*remote), GFP_ATOMIC);
	if (!remote) {
		ret = false;
		goto null;
}
tid_rdma_opfn_decode(remote, data);
priv->tid_timer_timeout_jiffies =
usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) /
1000UL) << 3) * 7);
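	/*
	 * Per IBTA, a timeout value t encodes 4.096 usec * 2^t; the expression
	 * above computes that interval in usec (4096 ns * 2^t / 1000) and
	 * scales it by 56 (<< 3, then * 7) as a retry margin. For example,
	 * t = 10 gives roughly 4194 usec * 56, i.e. about 235 msec.
	 */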
trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local);
trace_hfi1_opfn_param(qp, 1, remote);
	rcu_assign_pointer(priv->tid_rdma.remote, remote);
	/*
	 * A TID RDMA READ request's segment size is not equal to
	 * remote->max_len only when the request's data length is smaller
	 * than remote->max_len. In that case, there will be only one segment.
	 * Therefore, when priv->pkts_ps is used to calculate req->cur_seg
	 * during retry, it will lead to req->cur_seg = 0, which is exactly
	 * what is expected.
*/
priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len);
	priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1;
	goto free;
null:
RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
priv->timeout_shift = 0;
free:
	if (old)
		kfree_rcu(old, rcu_head);
	return ret;
}
ret = tid_rdma_conn_reply(qp, *data);
	*data = 0;
	/*
	 * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate
	 * TID RDMA could not be enabled. This will result in TID RDMA being
	 * disabled at the requester too.
	 */
	if (ret)
		(void)tid_rdma_conn_req(qp, data);
	return ret;
}
/* Flow and tid waiter functions */
/**
 * DOC: lock ordering
 *
 * There are two locks involved with the queuing
 * routines: the qp s_lock and the exp_lock.
 *
 * Since the tid space allocation is called from
 * the send engine, the qp s_lock is already held.
 *
 * The allocation routines will get the exp_lock.
 *
 * The first_qp() call is provided to allow the head of
 * the rcd wait queue to be fetched under the exp_lock and
 * followed by a drop of the exp_lock.
 *
 * Any qp in the wait list will have the qp reference count held
 * to hold the qp in memory.
*/
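
/*
 * Illustrative allocation-path skeleton (a sketch, not driver code),
 * following the ordering above. The send engine already holds
 * qp->s_lock; the allocator takes rcd->exp_lock, fetches the wait-queue
 * head with first_qp() and drops exp_lock before the deadlock-safe
 * wakeup (see hfi1_kern_exp_rcv_setup() below for the real sequence):
 *
 *	spin_lock_irqsave(&rcd->exp_lock, flags);
 *	if (kern_alloc_tids(flow))
 *		goto queue;	// queue_qp_for_tid_wait() and bail
 *	dequeue_tid_waiter(rcd, &rcd->rarr_queue, qp);
 *	fqp = first_qp(rcd, &rcd->rarr_queue);	// reference held
 *	spin_unlock_irqrestore(&rcd->exp_lock, flags);
 *	tid_rdma_schedule_tid_wakeup(fqp);	// may drop reference
 */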
/*
 * return head of rcd wait list
 *
 * Must hold the exp_lock.
 *
 * Get a reference to the QP to hold the QP in memory.
 *
 * The caller must release the reference when the local
 * is no longer being used.
 */
static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd,
			       struct tid_queue *queue)
__must_hold(&rcd->exp_lock)
{
	struct hfi1_qp_priv *priv;
/**
 * kernel_tid_waiters - determine rcd wait
 * @rcd: the receive context
 * @queue: the queue to operate on
 * @qp: the head of the qp being processed
 *
 * This routine will return false IFF
 * the list is NULL or the head of the
 * list is the indicated qp.
 *
 * Must hold the qp s_lock and the exp_lock.
 *
 * Return:
 * false if either of the conditions below are satisfied:
 * 1. The list is empty or
 * 2. The indicated qp is at the head of the list and the
 *    HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags.
 * true is returned otherwise.
 */
static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd,
			       struct tid_queue *queue, struct rvt_qp *qp)
__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
{
	struct rvt_qp *fqp;
	bool ret = true;
/**
 * dequeue_tid_waiter - dequeue the qp from the list
 * @rcd: the receive context
 * @queue: the queue to operate on
 * @qp: the qp to remove from the wait list
 *
 * This routine removes the indicated qp from the
 * wait list if it is there.
 *
 * This should be done after the hardware flow and
 * tid array resources have been allocated.
 *
 * Must hold the qp s_lock and the rcd exp_lock.
 *
 * It assumes the s_lock to protect the s_flags
 * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag.
 */
static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd,
			       struct tid_queue *queue, struct rvt_qp *qp)
__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
{
	struct hfi1_qp_priv *priv = qp->priv;
/**
 * queue_qp_for_tid_wait - suspend QP on tid space
 * @rcd: the receive context
 * @queue: the queue to operate on
 * @qp: the qp
 *
 * The qp is inserted at the tail of the rcd
 * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set.
 *
 * Must hold the qp s_lock and the exp_lock.
 */
static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd,
				  struct tid_queue *queue, struct rvt_qp *qp)
__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
{
	struct hfi1_qp_priv *priv = qp->priv;
/**
 * __trigger_tid_waiter - trigger tid waiter
 * @qp: the qp
 *
 * This is a private entrance to schedule the qp
 * assuming the caller is holding the qp->s_lock.
 */
static void __trigger_tid_waiter(struct rvt_qp *qp)
__must_hold(&qp->s_lock)
{
	lockdep_assert_held(&qp->s_lock);
	if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE))
		return;
trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE);
hfi1_schedule_send(qp);
}
/**
 * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp
 * @qp: the qp
 *
 * Trigger a schedule for a waiting qp in a deadlock-
 * safe manner. The qp reference is held prior
 * to this call via first_qp().
 *
 * If the qp trigger was already scheduled (!rval)
 * the reference is dropped, otherwise the resume
 * or the destroy cancel will dispatch the reference.
 */
static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp)
{
	struct hfi1_qp_priv *priv;
	struct hfi1_ibport *ibp;
	struct hfi1_pportdata *ppd;
	struct hfi1_devdata *dd;
	bool rval;
/**
 * tid_rdma_trigger_resume - field a trigger work request
 * @work: the work item
 *
 * Complete the off qp trigger processing by directly
 * calling the progress routine.
 */
static void tid_rdma_trigger_resume(struct work_struct *work)
{
	struct tid_rdma_qp_params *tr;
	struct hfi1_qp_priv *priv;
	struct rvt_qp *qp;
/*
 * tid_rdma_flush_wait - unwind any tid space wait
 *
 * This is called when resetting a qp to
 * allow a destroy or reset to get rid
 * of any tid space linkage and reference counts.
 */
static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue)
__must_hold(&qp->s_lock)
{
	struct hfi1_qp_priv *priv;
/* Flow functions */
/**
 * kern_reserve_flow - allocate a hardware flow
 * @rcd: the context to use for allocation
 * @last: the index of the preferred flow. Use RXE_NUM_TID_FLOWS to
 *        signify "don't care".
 *
 * Use a bit mask based allocation to reserve a hardware
 * flow for use in receiving KDETH data packets. If a preferred flow is
 * specified the function will attempt to reserve that flow again, if
 * available.
 *
 * The exp_lock must be held.
 *
 * Return:
 * On success: a value between 0 and RXE_NUM_TID_FLOWS - 1
 * On failure: -EAGAIN
 */
static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last)
__must_hold(&rcd->exp_lock)
{
	int nr;
	/* Attempt to reserve the preferred flow index */
	if (last >= 0 && last < RXE_NUM_TID_FLOWS &&
	    !test_and_set_bit(last, &rcd->flow_mask))
		return last;
/**
 * tid_rdma_find_phys_blocks_4k - get groups based on MR info
 * @flow: overall info for a TID RDMA segment
 * @pages: pointer to an array of page structs
 * @npages: number of pages
 * @list: page set array to return
 *
 * This routine returns the number of groups associated with
 * the current sge information. This implementation is based
 * on the expected receive find_phys_blocks() adjusted to
 * use the MR information vs. the pfn.
 *
 * Return:
 * the number of RcvArray entries
 */
static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
					struct page **pages,
					u32 npages,
					struct tid_rdma_pageset *list)
{
	u32 pagecount, pageidx, setcount = 0, i;
	void *vaddr, *this_vaddr;

	if (!npages)
		return 0;
	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
*/
vaddr = page_address(pages[0]);
	trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
this_vaddr = i < npages ? page_address(pages[i]) : NULL;
		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
					 this_vaddr);
		/*
		 * If the vaddr's are not sequential, pages are not physically
		 * contiguous.
		 */
		if (this_vaddr != (vaddr + PAGE_SIZE)) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down into
			 * sizes supported by the HW.
			 * There are two main constraints:
			 *     1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *        If the total set size is bigger than that
			 *        program only a MAX_EXPECTED_BUFFER chunk.
			 *     2. The buffer size has to be a power of two. If
			 *        it is not, round down to the closest power of
			 *        2 and program that size.
			 */
			while (pagecount) {
				int maxpages = pagecount;
				u32 bufsize = pagecount * PAGE_SIZE;

				if (bufsize > MAX_EXPECTED_BUFFER)
					maxpages =
						MAX_EXPECTED_BUFFER >>
						PAGE_SHIFT;
				else if (!is_power_of_2(bufsize))
					maxpages =
						rounddown_pow_of_two(bufsize) >>
						PAGE_SHIFT;

list[setcount].idx = pageidx;
list[setcount].count = maxpages;
trace_hfi1_tid_pageset(flow->req->qp, setcount,
list[setcount].idx,
list[setcount].count);
pagecount -= maxpages;
pageidx += maxpages;
setcount++;
}
pageidx = i;
pagecount = 1;
vaddr = this_vaddr;
} else {
vaddr += PAGE_SIZE;
pagecount++;
}
	}
	/* ensure we always return an even number of sets */
	if (setcount & 1)
		list[setcount++].count = 0;
	return setcount;
}
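
/*
 * Example (illustrative): six physically contiguous 4K pages form a 24K
 * run; assuming 24K is within MAX_EXPECTED_BUFFER, the run is not a power
 * of two, so it is emitted as a 16K set (4 pages) followed by an 8K set
 * (2 pages).
 */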
/**
 * tid_flush_pages - dump out pages into pagesets
 * @list: list of pagesets
 * @idx: pointer to current page index
 * @pages: number of pages to dump
 * @sets: current number of pagesets
 *
 * This routine flushes out accumulated pages.
 *
 * To ensure an even number of sets the
 * code may add a filler.
 *
 * This can happen when pages is not
 * a power of 2 or pages is a power of 2
 * less than the maximum pages.
 *
 * Return:
 * The new number of sets
*/
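static u32 tid_flush_pages(struct tid_rdma_pageset *list,
			   u32 *idx, u32 pages, u32 sets)
{
	/*
	 * A sketch consistent with the doc comment above and the callers
	 * below; it assumes MAX_EXPECTED_PAGES is the per-set page limit.
	 */
	while (pages) {
		u32 maxpages = pages;

		if (maxpages > MAX_EXPECTED_PAGES)
			maxpages = MAX_EXPECTED_PAGES;
		else if (!is_power_of_2(maxpages))
			maxpages = rounddown_pow_of_two(maxpages);
		list[sets].idx = *idx;
		list[sets++].count = maxpages;
		*idx += maxpages;
		pages -= maxpages;
	}
	/* might need a filler to keep the set count even */
	if (sets & 1)
		list[sets++].count = 0;
	return sets;
}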
/**
 * tid_rdma_find_phys_blocks_8k - get groups based on MR info
 * @flow: overall info for a TID RDMA segment
 * @pages: pointer to an array of page structs
 * @npages: number of pages
 * @list: page set array to return
 *
 * This routine parses an array of pages to compute pagesets
 * in an 8k compatible way.
 *
 * pages are tested two at a time, i, i + 1 for contiguous
 * pages and i - 1 and i contiguous pages.
 *
 * If any condition is false, any accumulated pages are flushed and
 * v0,v1 are emitted as separate PAGE_SIZE pagesets.
 *
 * Otherwise, the current 8k is totaled for a future flush.
 *
 * Return:
 * The number of pagesets
 * list set with the returned number of pagesets
 */
static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow,
					struct page **pages,
					u32 npages,
					struct tid_rdma_pageset *list)
{
u32 idx, sets = 0, i;
	u32 pagecnt = 0;
	void *v0, *v1, *vm1;
	if (!npages)
		return 0;
	for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) {
		/* get a new v0 */
v0 = page_address(pages[i]);
trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0);
v1 = i + 1 < npages ?
page_address(pages[i + 1]) : NULL;
		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1);
		/* compare i, i + 1 vaddr */
		if (v1 != (v0 + PAGE_SIZE)) {
			/* flush out pages */
			sets = tid_flush_pages(list, &idx, pagecnt, sets);
			/* output v0,v1 as two pagesets */
list[sets].idx = idx++;
			list[sets++].count = 1;
			if (v1) {
list[sets].count = 1;
list[sets++].idx = idx++;
} else {
list[sets++].count = 0;
}
vm1 = NULL;
			pagecnt = 0;
			continue;
		}
		/* i,i+1 consecutive, look at i-1,i */
		if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
			/* flush out pages */
sets = tid_flush_pages(list, &idx, pagecnt, sets);
pagecnt = 0;
		}
		/* pages will always be a multiple of 8k */
		pagecnt += 2;
		/* save i-1 */
		vm1 = v1;
		/* move to next pair */
	}
	/* dump residual pages at end */
	sets = tid_flush_pages(list, &idx, npages - idx, sets);
	/* by design cannot be odd sets */
	WARN_ON(sets & 1);
	return sets;
}
/*
 * Find pages for one segment of a sge array represented by @ss. The function
 * does not check the sge, the sge must have been checked for alignment with a
 * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
 * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
 * copy maintained in @ss->sge, the original sge is not modified.
 *
 * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
 * releasing the MR reference count at the same time. Otherwise, we'll "leak"
 * references to the MR. This difference requires that we keep track of progress
 * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request
 * structure.
 */
static u32 kern_find_pages(struct tid_rdma_flow *flow,
			   struct page **pages,
			   struct rvt_sge_state *ss, bool *last)
{
	struct tid_rdma_request *req = flow->req;
	struct rvt_sge *sge = &ss->sge;
u32 length = flow->req->seg_len;
u32 len = PAGE_SIZE;
u32 i = 0;
while (length && req->isge < ss->num_sge) {
pages[i++] = virt_to_page(sge->vaddr);
/*
 * Try to allocate pageset_count TID's from TID groups for a context
 *
 * This function allocates TID's without moving groups between lists or
 * modifying grp->map. This is done as follows, being cognizant of the lists
 * between which the TID groups will move:
 * 1. First allocate complete groups of 8 TID's since this is more efficient,
 *    these groups will move from group->full without affecting used
 * 2. If more TID's are needed allocate from used (will move from used->full or
 *    stay in used)
 * 3. If we still don't have the required number of TID's go back and look again
 *    at a complete group (will move from group->used)
 */
static int kern_alloc_tids(struct tid_rdma_flow *flow)
{
	struct hfi1_ctxtdata *rcd = flow->req->rcd;
	struct hfi1_devdata *dd = rcd->dd;
	u32 ngroups, pageidx = 0;
	struct tid_group *group = NULL, *used;
	u8 use;

	flow->tnode_cnt = 0;
	ngroups = flow->npagesets / dd->rcv_entries.group_size;
	if (!ngroups)
		goto used_list;

	/* First look at complete groups */
list_for_each_entry(group, &rcd->tid_group_list.list, list) {
kern_add_tid_node(flow, rcd, "complete groups", group,
group->size);
		pageidx += group->size;
		if (!--ngroups)
			break;
}
	if (pageidx >= flow->npagesets)
		goto ok;
used_list:
	/* Now look at partially used groups */
list_for_each_entry(used, &rcd->tid_used_list.list, list) {
use = min_t(u32, flow->npagesets - pageidx,
used->size - used->used);
kern_add_tid_node(flow, rcd, "used groups", used, use);
		pageidx += use;
		if (pageidx >= flow->npagesets)
			goto ok;
}
	/*
	 * Look again at a complete group, continuing from where we left.
	 * However, if we are at the head, we have reached the end of the
	 * complete groups list from the first loop above.
	 */
	if (group && &group->list == &rcd->tid_group_list.list)
		goto bail_eagain;
	group = list_prepare_entry(group, &rcd->tid_group_list.list,
				   list);
	if (list_is_last(&group->list, &rcd->tid_group_list.list))
		goto bail_eagain;
group = list_next_entry(group, list);
use = min_t(u32, flow->npagesets - pageidx, group->size);
kern_add_tid_node(flow, rcd, "complete continue", group, use);
	pageidx += use;
	if (pageidx >= flow->npagesets)
		goto ok;
bail_eagain:
	trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ",
				  (u64)flow->npagesets);
	return -EAGAIN;
ok:
	return 0;
}
rcventry -= rcd->expected_base;
	tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1;
	/*
	 * A single TID entry will be used to use a RcvArray pair (with
	 * tidctrl 0x3), if ALL these are true: (a) the bit pos is even,
	 * (b) the group map shows the current and the next bits as free,
	 * indicating two consecutive RcvArray entries are available, and (c)
	 * we actually need 2 more entries.
*/
pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
	       node->cnt >= cnt + 2;
	if (!pair) {
		if (!pset->count)
tidctrl = 0x1;
flow->tid_entry[flow->tidcnt++] =
EXP_TID_SET(IDX, rcventry >> 1) |
EXP_TID_SET(CTRL, tidctrl) |
EXP_TID_SET(LEN, npages);
trace_hfi1_tid_entry_alloc(/* entry */
flow->req->qp, flow->tidcnt - 1,
flow->tid_entry[flow->tidcnt - 1]);
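		/*
		 * Example (illustrative): an unpaired entry at odd relative
		 * rcventry 5 is encoded with IDX = 5 >> 1 = 2 and CTRL = 0x2;
		 * its even neighbor would use CTRL 0x1, and a pair covering
		 * both would use CTRL 0x3.
		 */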
flow->npkts = 0;
	flow->tidcnt = 0;
	for (i = 0; i < flow->tnode_cnt; i++)
kern_program_rcv_group(flow, i, &pset_idx);
trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow);
}
/**
 * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
 *                             TID RDMA request
 * @req: TID RDMA request for which the segment/flow is being set up
 * @ss: sge state, maintains state across successive segments of a sge
 * @last: set to true after the last sge segment has been processed
 *
 * This function
 * (1) finds a free flow entry in the flow circular buffer
 * (2) finds pages and continuous physical chunks constituting one segment
 *     of an sge
 * (3) allocates TID group entries for those chunks
 * (4) programs rcvarray entries in the hardware corresponding to those
 *     TID's
 * (5) computes a tidarray with formatted TID entries which can be sent
 *     to the sender
 * (6) Reserves and programs HW flows.
 * (7) It also manages queueing the QP when TID/flow resources are not
 *     available.
 *
 * @req points to struct tid_rdma_request of which the segments are a part. The
 * function uses qp, rcd and seg_len members of @req. In the absence of errors,
 * req->flow_idx is the index of the flow which has been prepared in this
 * invocation of function call. With flow = &req->flows[req->flow_idx],
 * flow->tid_entry contains the TID array which the sender can use for TID RDMA
 * sends and flow->npkts contains number of packets required to send the
 * segment.
 *
 * hfi1_check_sge_align should be called prior to calling this function and if
 * it signals error TID RDMA cannot be used for this sge and this function
 * should not be called.
 *
 * For the queuing, caller must hold the flow->req->qp s_lock from the send
 * engine and the function will procure the exp_lock.
 *
 * Return:
 * The function returns -EAGAIN if sufficient number of TID/flow resources to
 * map the segment could not be allocated. In this case the function should be
 * called again with previous arguments to retry the TID allocation. It returns
 * -EINVAL if there is no space in the flow circular buffer and -ENOMEM if the
 * segment's pages could not be pinned. The function returns 0 on success.
 */
int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
			    struct rvt_sge_state *ss, bool *last)
__must_hold(&req->qp->s_lock)
{
	struct tid_rdma_flow *flow = &req->flows[req->setup_head];
	struct hfi1_ctxtdata *rcd = req->rcd;
	struct hfi1_qp_priv *qpriv = req->qp->priv;
	unsigned long flags;
	struct rvt_qp *fqp;
u16 clear_tail = req->clear_tail;
	lockdep_assert_held(&req->qp->s_lock);
	/*
	 * We return error if either (a) we don't have space in the flow
	 * circular buffer, or (b) we already have max entries in the buffer.
	 * Max entries depend on the type of request we are processing and the
	 * negotiated TID RDMA parameters.
	 */
	if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) ||
	    CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
	    req->n_flows)
		return -EINVAL;
	/*
	 * Get pages, identify contiguous physical memory chunks for the
	 * segment. If we can not determine a DMA address mapping, we will
	 * treat it just as if we ran out of space above.
	 */
	if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) {
		hfi1_wait_kmem(flow->req->qp);
		return -ENOMEM;
}
	spin_lock_irqsave(&rcd->exp_lock, flags);
	if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp))
		goto queue;
	/*
	 * At this point we know the number of pagesets and hence the number of
	 * TID's to map the segment. Allocate the TID's from the TID groups. If
	 * we cannot allocate the required number we exit and try again later.
	 */
	if (kern_alloc_tids(flow))
		goto queue;
	/*
	 * Finally program the TID entries with the pagesets, compute the
	 * tidarray and enable the HW flow
*/
kern_program_rcvarray(flow);
	/*
	 * Set up the flow state with relevant information.
	 * This information is used for tracking the sequence of data packets
	 * for the segment.
	 * The flow is set up here as this is the most accurate time and place
	 * to do so. Doing it at a later time runs the risk of the flow data in
	 * qpriv getting out of sync.
*/
memset(&flow->flow_state, 0x0, sizeof(flow->flow_state));
flow->idx = qpriv->flow_state.index;
flow->flow_state.generation = qpriv->flow_state.generation;
flow->flow_state.spsn = qpriv->flow_state.psn;
flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1;
flow->flow_state.r_next_psn =
full_flow_psn(flow, flow->flow_state.spsn);
qpriv->flow_state.psn += flow->npkts;
	dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp);
	/* get head before dropping lock */
fqp = first_qp(rcd, &rcd->rarr_queue);
spin_unlock_irqrestore(&rcd->exp_lock, flags);
	tid_rdma_schedule_tid_wakeup(fqp);

	req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
	return 0;
queue:
	queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp);
	spin_unlock_irqrestore(&rcd->exp_lock, flags);
	return -EAGAIN;
}
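
/*
 * Illustrative caller pattern (a sketch): on failure the QP has either been
 * queued on the rcd wait list (-EAGAIN) or put on the kmem wait list
 * (-ENOMEM), so the caller simply bails out; the same call is retried with
 * the previous arguments once the send engine is rescheduled by
 * tid_rdma_schedule_tid_wakeup(). See hfi1_build_tid_rdma_read_req() below.
 */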
/*
 * This function is called after one segment has been successfully sent to
 * release the flow and TID HW/SW resources for that segment. The segments for a
 * TID RDMA request are setup and cleared in FIFO order which is managed using a
 * circular buffer.
 */
int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
__must_hold(&req->qp->s_lock)
{
	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
	struct hfi1_ctxtdata *rcd = req->rcd;
	unsigned long flags;
	int i;
	struct rvt_qp *fqp;
	lockdep_assert_held(&req->qp->s_lock);
	/* Exit if we have nothing in the flow circular buffer */
	if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
		return -EINVAL;
spin_lock_irqsave(&rcd->exp_lock, flags);
for (i = 0; i < flow->tnode_cnt; i++)
		kern_unprogram_rcv_group(flow, i);
	/* To prevent double unprogramming */
	flow->tnode_cnt = 0;
	/* get head before dropping lock */
fqp = first_qp(rcd, &rcd->rarr_queue);
spin_unlock_irqrestore(&rcd->exp_lock, flags);
/*
 * This function is called to release all the tid entries for
 * a request.
 */
void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
__must_hold(&req->qp->s_lock)
{
	/* Use memory barrier for proper ordering */
	while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) {
		if (hfi1_kern_exp_rcv_clear(req))
			break;
}
}
/**
 * hfi1_kern_exp_rcv_free_flows - free previously allocated flow information
 * @req: the tid rdma request to be cleaned
 */
static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
{
kfree(req->flows);
req->flows = NULL;
}
/**
 * __trdma_clean_swqe - clean up for large sized QPs
 * @qp: the queue pair
 * @wqe: the send wqe
 */
void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
{
	struct hfi1_swqe_priv *p = wqe->priv;
hfi1_kern_exp_rcv_free_flows(&p->tid_req);
}
/*
 * This can be called at QP create time or in the data path.
 */
static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
gfp_t gfp)
{
	struct tid_rdma_flow *flows;
	int i;
	if (likely(req->flows))
		return 0;
	flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp,
			     req->rcd->numa_id);
	if (!flows)
		return -ENOMEM;
	/* mini init */
	for (i = 0; i < MAX_FLOWS; i++) {
flows[i].req = req;
flows[i].npagesets = 0;
flows[i].pagesets[0].mapped = 0;
flows[i].resync_npkts = 0;
}
	req->flows = flows;
	return 0;
}
	/*
	 * Initialize various TID RDMA request variables.
	 * These variables are "static", which is why they
	 * can be pre-initialized here before the WRs have
	 * even been submitted.
	 * However, non-NULL values for these variables do not
	 * imply that this WQE has been enabled for TID RDMA.
	 * Drivers should check the WQE's opcode to determine
	 * if a request is a TID RDMA one or not.
*/
req->qp = qp;
req->rcd = qpriv->rcd;
}
u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
			    void *context, int vl, int mode, u64 data)
{
	struct hfi1_devdata *dd = context;
/* This is the IB psn used to send the request */
*bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt);
trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow);
/* TID Entries for TID RDMA READ payload */
req_addr = &flow->tid_entry[flow->tid_idx];
req_len = sizeof(*flow->tid_entry) *
(flow->tidcnt - flow->tid_idx);
memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
wpriv->ss.sge.vaddr = req_addr;
wpriv->ss.sge.sge_length = req_len;
	wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
	/*
	 * We can safely zero these out. Since the first SGE covers the
	 * entire packet, nothing else should even look at the MR.
*/
wpriv->ss.sge.mr = NULL;
wpriv->ss.sge.m = 0;
wpriv->ss.sge.n = 0;
/*
 * @len: contains the data length to read upon entry and the read request
 *       payload length upon exit.
*/
u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
				 struct ib_other_headers *ohdr, u32 *bth1,
				 u32 *bth2, u32 *len)
__must_hold(&qp->s_lock)
{
	struct hfi1_qp_priv *qpriv = qp->priv;
	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
	struct tid_rdma_flow *flow = NULL;
	u32 hdwords = 0;
	bool last;
	bool retry = true;
u32 npkts = rvt_div_round_up_mtu(qp, *len);
	trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn,
					  wqe->lpsn, req);
	/*
	 * Check sync conditions. Make sure that there are no pending
	 * segments before freeing the flow.
*/
sync_check:
	if (req->state == TID_REQUEST_SYNC) {
		if (qpriv->pending_tid_r_segs)
			goto done;
	/*
	 * If the request for this segment is resent, the tid resources should
	 * have been allocated before. In this case, req->flow_idx should
	 * fall behind req->setup_head.
	 */
	if (req->flow_idx == req->setup_head) {
		retry = false;
		if (req->state == TID_REQUEST_RESEND) {
			/*
			 * This is the first new segment for a request whose
			 * earlier segments have been re-sent. We need to
			 * set up the sge pointer correctly.
*/
restart_sge(&qp->s_sge, wqe, req->s_next_psn,
qp->pmtu);
req->isge = 0;
req->state = TID_REQUEST_ACTIVE;
}
		/*
		 * Check sync. The last PSN of each generation is reserved for
		 * RESYNC.
		 */
		if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) {
			req->state = TID_REQUEST_SYNC;
			goto sync_check;
}
		/* Allocate the flow if not yet */
		if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp))
			goto done;
		/*
		 * The following call will advance req->setup_head after
		 * allocating the tid entries.
		 */
		if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) {
req->state = TID_REQUEST_QUEUED;
			/*
			 * We don't have resources for this segment. The QP has
			 * already been queued.
			 */
			goto done;
}
}
/* req->flow_idx should only be one slot behind req->setup_head */
flow = &req->flows[req->flow_idx];
flow->pkt = 0;
flow->tid_idx = 0;
	flow->sent = 0;
	if (!retry) {
		/* Set the first and last IB PSN for the flow in use. */
flow->flow_state.ib_spsn = req->s_next_psn;
flow->flow_state.ib_lpsn =
flow->flow_state.ib_spsn + flow->npkts - 1;
}
/* Calculate the next segment start psn.*/
req->s_next_psn += flow->npkts;
	/*
	 * Walk the TID_ENTRY list to make sure we have enough space for a
	 * complete segment. Also calculate the number of required packets.
*/
	flow->npkts = rvt_div_round_up_mtu(qp, len);
	for (i = 0; i < flow->tidcnt; i++) {
trace_hfi1_tid_entry_rcv_read_req(qp, i,
flow->tid_entry[i]);
		tlen = EXP_TID_GET(flow->tid_entry[i], LEN);
		if (!tlen)
			return 1;
		/*
		 * For tid pair (tidctrl == 3), the buffer size of the pair
		 * should be the sum of the buffer size described by each
		 * tid entry. However, only the first entry needs to be
		 * specified in the request (see WFR HAS Section 8.5.7.1).
*/
tidlen += tlen;
	}
	if (tidlen * PAGE_SIZE < len)
		return 1;
	reth = &ohdr->u.tid_rdma.r_req.reth;
	/*
	 * The requester always restarts from the start of the original
	 * request.
*/
	len = be32_to_cpu(reth->length);
	if (psn != e->psn || len != req->total_len)
		goto unlock;
qp->r_len = len;
	ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
			 IB_ACCESS_REMOTE_READ);
	if (unlikely(!ok))
		goto unlock;
			/*
			 * If all the response packets for the current request
			 * have been sent out and this request is complete
			 * (old_request == false), then the TID flow may be
			 * unusable (the req->clear_tail is advanced). However,
			 * when an earlier request is received, this request
			 * will not be complete any more (qp->s_tail_ack_queue
			 * is moved back, see below). Consequently, we need to
			 * update the TID flow info every time a duplicate
			 * request is received.
*/
			bth0 = be32_to_cpu(ohdr->bth[0]);
			if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0,
						      psn, vaddr, len))
				goto unlock;
			/*
			 * True if the request is already scheduled (between
			 * qp->s_tail_ack_queue and qp->r_head_ack_queue);
			 */
			if (old_req)
				goto unlock;
	} else {
		struct flow_state *fstate;
		bool schedule = false;
u8 i;
		/*
		 * True if the request is already scheduled (between
		 * qp->s_tail_ack_queue and qp->r_head_ack_queue).
		 * Also, don't change requests, which are at the SYNC
		 * point and haven't generated any responses yet.
		 * There is nothing to retransmit for them yet.
		 */
		if (old_req || req->state == TID_REQUEST_INIT ||
		    (req->state == TID_REQUEST_SYNC && !req->cur_seg)) {
			for (i = prev + 1; ; i++) {
				if (i > rvt_size_atomic(&dev->rdi))
					i = 0;
				if (i == qp->r_head_ack_queue)
					break;
e = &qp->s_ack_queue[i];
				req = ack_to_tid_req(e);
				if (e->opcode == TID_OP(WRITE_REQ) &&
req->state == TID_REQUEST_INIT)
req->state = TID_REQUEST_INIT_RESEND;
			}
			/*
			 * If the state of the request has been changed,
			 * the first leg needs to get scheduled in order to
			 * pick up the change. Otherwise, normal response
			 * processing should take care of it.
			 */
			if (!schedule)
				goto unlock;
}
		/*
		 * If there is no more allocated segment, just schedule the qp
		 * without changing any state.
		 */
		if (req->clear_tail == req->setup_head)
			goto schedule;
		/*
		 * If this request has sent responses for segments, which have
		 * not received data yet (flow_idx != clear_tail), the flow_idx
		 * pointer needs to be adjusted so the same responses can be
		 * re-sent.
		 */
		if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) {
fstate = &req->flows[req->clear_tail].flow_state;
qpriv->pending_tid_w_segs -=
CIRC_CNT(req->flow_idx, req->clear_tail,
MAX_FLOWS);
req->flow_idx =
CIRC_ADD(req->clear_tail,
delta_psn(psn, fstate->resp_ib_psn),
MAX_FLOWS);
qpriv->pending_tid_w_segs +=
				delta_psn(psn, fstate->resp_ib_psn);
			/*
			 * When flow_idx == setup_head, we've gotten a duplicate
			 * request for a segment, which has not been allocated
			 * yet. In that case, don't adjust this request.
			 * However, we still want to go through the loop below
			 * to adjust all subsequent requests.
			 */
			if (CIRC_CNT(req->setup_head, req->flow_idx,
MAX_FLOWS)) {
req->cur_seg = delta_psn(psn, e->psn);
req->state = TID_REQUEST_RESEND_ACTIVE;
}
}
		for (i = prev + 1; ; i++) {
			/*
			 * Look at everything up to and including
			 * s_tail_ack_queue
			 */
			if (i > rvt_size_atomic(&dev->rdi))
				i = 0;
			if (i == qp->r_head_ack_queue)
				break;
e = &qp->s_ack_queue[i];
req = ack_to_tid_req(e);
trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn,
						   e->lpsn, req);
			if (e->opcode != TID_OP(WRITE_REQ) ||
req->cur_seg == req->comp_seg ||
req->state == TID_REQUEST_INIT ||
			    req->state == TID_REQUEST_INIT_RESEND) {
				if (req->state == TID_REQUEST_INIT)
					req->state = TID_REQUEST_INIT_RESEND;
				continue;
}
qpriv->pending_tid_w_segs -=
CIRC_CNT(req->flow_idx,
req->clear_tail,
MAX_FLOWS);
req->flow_idx = req->clear_tail;
req->state = TID_REQUEST_RESEND;
req->cur_seg = req->comp_seg;
}
qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
	}
	/* Re-process old requests. */
	if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
qp->s_acked_ack_queue = prev;
	qp->s_tail_ack_queue = prev;
	/*
	 * Since the qp->s_tail_ack_queue is modified, the
	 * qp->s_ack_state must be changed to re-initialize
	 * qp->s_ack_rdma_sge; otherwise, we will end up in the
	 * wrong memory region.
*/
qp->s_ack_state = OP(ACKNOWLEDGE);
schedule:
	/*
	 * It's possible to receive a retry psn that is earlier than an RNR NAK
	 * psn. In this case, the RNR NAK state should be cleared.
	 */
	if (qpriv->rnr_nak_state) {
qp->s_nak_state = 0;
qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
qp->r_psn = e->lpsn + 1;
hfi1_tid_write_alloc_resources(qp, true);
}