// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
 * Copyright(c) 2020 - 2023 Cornelis Networks, Inc.
 * Copyright(c) 2015 - 2018 Intel Corporation.
 */

#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "hfi.h"
#include "sdma.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

static unsigned initial_pkt_count = 8;

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req);
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen);
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen);
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 len);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret);
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent);
static void activate_packet_queue(struct iowait *wait, int reason);

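/*
 * defer_packet_queue() - iowait sleep callback for user SDMA.
 *
 * Invoked when the SDMA engine cannot accept a txreq (e.g. its descriptor
 * ring is full). If the engine has made progress since the snapshotted
 * sequence number, return -EAGAIN so the caller retries immediately.
 * Otherwise mark the packet queue deferred, park it on the engine's
 * dmawait list and return -EBUSY; activate_packet_queue() wakes it later.
 */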
static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);

	write_seqlock(&sde->waitlock);
	trace_hfi1_usdma_defer(pq, sde, &pq->busy);
	if (sdma_progress(sde, seq, txreq))
		goto eagain;
	/*
	 * We are assuming that if the list is enqueued somewhere, it
	 * is to the dmawait list since that is the only place where
	 * it is supposed to be enqueued.
	 */
	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
	if (list_empty(&pq->busy.list)) {
		pq->busy.lock = &sde->waitlock;
		iowait_get_priority(&pq->busy);
		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
	}
	write_sequnlock(&sde->waitlock);
	return -EBUSY;
eagain:
	write_sequnlock(&sde->waitlock);
	return -EAGAIN;
}

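/*
 * activate_packet_queue() - iowait wakeup callback.
 *
 * Called once the SDMA engine has room again: flip the queue back to
 * SDMA_PKT_Q_ACTIVE and wake any sender blocked in
 * hfi1_user_sdma_process_request().
 */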
static void activate_packet_queue(struct iowait *wait, int reason)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);

	trace_hfi1_usdma_activate(pq, wait, reason);
	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
	wake_up(&wait->wait_dma);
}

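/**
 * hfi1_user_sdma_alloc_queues() - Allocate the per-context user SDMA queues
 * @uctxt: valid user context
 * @fd: valid file data for this open of the context
 *
 * Allocates and initializes the packet queue (pq) and completion queue (cq)
 * used for user SDMA on this context/subcontext, including the request
 * array, the in-use bitmap and the txreq slab cache. On success the queues
 * are published through @fd. Returns 0 or a negative errno.
 */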
int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
				struct hfi1_filedata *fd)
{
	int ret = -ENOMEM;
	char buf[64];
	struct hfi1_devdata *dd;
	struct hfi1_user_sdma_comp_q *cq;
	struct hfi1_user_sdma_pkt_q *pq;

	if (!uctxt || !fd)
		return -EBADF;

	if (!hfi1_sdma_comp_ring_size)
		return -EINVAL;

	dd = uctxt->dd;

	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
	if (!pq)
		return -ENOMEM;
	pq->dd = dd;
	pq->ctxt = uctxt->ctxt;
	pq->subctxt = fd->subctxt;
	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
	atomic_set(&pq->n_reqs, 0);
	init_waitqueue_head(&pq->wait);
	atomic_set(&pq->n_locked, 0);

	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
		    activate_packet_queue, NULL, NULL);
	pq->reqidx = 0;

	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
			   sizeof(*pq->reqs),
			   GFP_KERNEL);
	if (!pq->reqs)
		goto pq_reqs_nomem;

	pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL);
	if (!pq->req_in_use)
		goto pq_reqs_no_in_use;

	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
		 fd->subctxt);
	pq->txreq_cache = kmem_cache_create(buf,
					    sizeof(struct user_sdma_txreq),
					    L1_CACHE_BYTES,
					    SLAB_HWCACHE_ALIGN,
					    NULL);
	if (!pq->txreq_cache) {
		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
			   uctxt->ctxt);
		goto pq_txreq_nomem;
	}

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		goto cq_nomem;

	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
				 * hfi1_sdma_comp_ring_size));
	if (!cq->comps)
		goto cq_comps_nomem;

	cq->nentries = hfi1_sdma_comp_ring_size;

	ret = hfi1_init_system_pinning(pq);
	if (ret)
		goto pq_mmu_fail;

	rcu_assign_pointer(fd->pq, pq);
	fd->cq = cq;

	return 0;

pq_mmu_fail:
	vfree(cq->comps);
cq_comps_nomem:
	kfree(cq);
cq_nomem:
	kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
	bitmap_free(pq->req_in_use);
pq_reqs_no_in_use:
	kfree(pq->reqs);
pq_reqs_nomem:
	kfree(pq);

	return ret;
}

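/*
 * flush_pq_iowait() - take the packet queue off an engine's wait list.
 *
 * If the pq is still parked on an SDMA engine's dmawait list (see
 * defer_packet_queue()), remove it under the engine's waitlock so the
 * queue can be retried or torn down safely.
 */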
static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
{
	unsigned long flags;
	seqlock_t *lock = pq->busy.lock;

	if (!lock)
		return;
	write_seqlock_irqsave(lock, flags);
	if (!list_empty(&pq->busy.list)) {
		list_del_init(&pq->busy.list);
		pq->busy.lock = NULL;
	}
	write_sequnlock_irqrestore(lock, flags);
}

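/**
 * hfi1_user_sdma_free_queues() - Tear down the per-context user SDMA queues
 * @fd: valid file data for this open of the context
 * @uctxt: valid user context
 *
 * Unpublishes the packet queue, waits for in-flight requests to drain,
 * then frees the pq, its caches and the completion queue. Counterpart of
 * hfi1_user_sdma_alloc_queues().
 */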
int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
			       struct hfi1_ctxtdata *uctxt)
{
	struct hfi1_user_sdma_pkt_q *pq;

	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);

	spin_lock(&fd->pq_rcu_lock);
	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
				    lockdep_is_held(&fd->pq_rcu_lock));
	if (pq) {
		rcu_assign_pointer(fd->pq, NULL);
		spin_unlock(&fd->pq_rcu_lock);
		synchronize_srcu(&fd->pq_srcu);
		/* at this point there can be no more new requests */
		iowait_sdma_drain(&pq->busy);
		/* Wait until all requests have been freed. */
		wait_event_interruptible(
			pq->wait,
			!atomic_read(&pq->n_reqs));
		kfree(pq->reqs);
		hfi1_free_system_pinning(pq);
		bitmap_free(pq->req_in_use);
		kmem_cache_destroy(pq->txreq_cache);
		flush_pq_iowait(pq);
		kfree(pq);
	} else {
		spin_unlock(&fd->pq_rcu_lock);
	}
	if (fd->cq) {
		vfree(fd->cq->comps);
		kfree(fd->cq);
		fd->cq = NULL;
	}
	return 0;
}

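/*
 * dlid_to_selector() - map a DLID to a small, stable selector value.
 *
 * DLIDs are hashed into a 256-entry table; the first time a bucket is
 * seen it is assigned the next selector (0..127, wrapping). The selector
 * is later combined with the context/subcontext to pick an SDMA engine,
 * so traffic to the same DLID tends to use the same engine. For example,
 * DLIDs 0x0001 and 0x0101 hash to different buckets (0x01 vs 0x00) and
 * therefore normally receive different selectors.
 */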
static u8 dlid_to_selector(u16 dlid)
{
	static u8 mapping[256];
	static int initialized;
	static u8 next;
	int hash;

	if (!initialized) {
		memset(mapping, 0xFF, 256);
		initialized = 1;
	}

	hash = ((dlid >> 8) ^ dlid) & 0xFF;
	if (mapping[hash] == 0xFF) {
		mapping[hash] = next;
		next = (next + 1) & 0x7F;
	}

	return mapping[hash];
}

/**
 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 * @fd: valid file descriptor
 * @iovec: array of io vectors to process
 * @dim: overall iovec array size
 * @count: number of io vector array entries processed
 */
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
				   struct iovec *iovec, unsigned long dim,
				   unsigned long *count)
{
	int ret = 0, i;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq =
		srcu_dereference(fd->pq, &fd->pq_srcu);
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
	u16 pkey;
	u32 slid;
	u16 dlid;
	u32 selector;

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
		   SDMA,
		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
		   dd->unit, uctxt->ctxt, fd->subctxt,
		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
		return -EINVAL;
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
		return -EFAULT;
	}

	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				     (u16 *)&info);
	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid comp index",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/*
	 * Sanity check the header io vector count.  Need at least 1 vector
	 * (header) and cannot be larger than the actual io vector count.
	 */
	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
			  req_iovcnt(info.ctrl), dim);
		return -EINVAL;
	}

	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/* Try to claim the request. */
	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
	/*
	 * All safety checks have been done and this request has been claimed.
	 */
	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
					     info.comp_idx);
	req = pq->reqs + info.comp_idx;
	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
	req->data_len  = 0;
	req->pq = pq;
	req->cq = cq;
	req->ahg_idx = -1;
	req->iov_idx = 0;
	req->sent = 0;
	req->seqnum = 0;
	req->seqcomp = 0;
	req->seqsubmitted = 0;
	req->tids = NULL;
	req->has_error = 0;
	INIT_LIST_HEAD(&req->txps);

	memcpy(&req->info, &info, sizeof(info));

	/* The request is initialized, count it */
	atomic_inc(&pq->n_reqs);

	if (req_opcode(info.ctrl) == EXPECTED) {
		/* expected must have TID info and at least one data vector */
		if (req->data_iovs < 2) {
			SDMA_DBG(req,
				 "Not enough vectors for expected request");
			ret = -EINVAL;
			goto free_req;
		}
		req->data_iovs--;
	}

	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
		ret = -EINVAL;
		goto free_req;
	}

	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	     USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}
	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC lookup.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

	/* Check the P_KEY for requests from user space */
	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
	slid = be16_to_cpu(req->hdr.lrh[3]);
	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * Also check the BTH.lnh. If it says the next header is a GRH, then
	 * the RXE parsing will be off and will land in the middle of the KDETH
	 * or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
		 KDETH_OM_LARGE : KDETH_OM_SMALL);
	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
					       info.comp_idx, req->tidoffset);
	idx++;

	/* Save all the IO vector structures */
	for (i = 0; i < req->data_iovs; i++) {
		req->iovs[i].offset = 0;
		INIT_LIST_HEAD(&req->iovs[i].list);
		memcpy(&req->iovs[i].iov,
		       iovec + idx++,
		       sizeof(req->iovs[i].iov));
		if (req->iovs[i].iov.iov_len == 0) {
			ret = -EINVAL;
			goto free_req;
		}
		req->data_len += req->iovs[i].iov.iov_len;
	}
	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
					 info.comp_idx, req->data_len);
	if (pcount > req->info.npkts)
		pcount = req->info.npkts;
	/*
	 * Copy any TID info
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * set up. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
		u32 *tmp;

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}

		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
		tmp = memdup_user(iovec[idx].iov_base,
				  ntids * sizeof(*req->tids));
		if (IS_ERR(tmp)) {
			ret = PTR_ERR(tmp);
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			goto free_req;
		}
		req->tids = tmp;
		req->n_tids = ntids;
		req->tididx = 0;
		idx++;
	}

	dlid = be16_to_cpu(req->hdr.lrh[1]);
	selector = dlid_to_selector(dlid);
	selector += uctxt->ctxt + fd->subctxt;
	req->sde = sdma_select_user_engine(dd, selector, vl);

	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
		req->ahg_idx = sdma_ahg_alloc(req->sde);

	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
	pq->state = SDMA_PKT_Q_ACTIVE;

	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
	while (req->seqsubmitted != req->info.npkts) {
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
			int we_ret;

			if (ret != -EBUSY)
				goto free_req;
			we_ret = wait_event_interruptible_timeout(
				pq->busy.wait_dma,
				pq->state == SDMA_PKT_Q_ACTIVE,
				msecs_to_jiffies(
					SDMA_IOWAIT_TIMEOUT));
			trace_hfi1_usdma_we(pq, we_ret);
			if (we_ret <= 0)
				flush_pq_iowait(pq);
		}
	}
	*count += idx;
	return 0;
free_req:
	/*
	 * If seqsubmitted == npkts, the completion routine controls the
	 * final state.  If seqsubmitted < npkts, wait for any outstanding
	 * packets to finish before cleaning up.
	 */
	if (req->seqsubmitted < req->info.npkts) {
		if (req->seqsubmitted)
			wait_event(pq->busy.wait_dma,
				   (req->seqcomp == req->seqsubmitted - 1));
		user_sdma_free_request(req);
		pq_update(pq);
		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
	}
	return ret;
}

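/*
 * compute_data_length() - payload size for the next packet of a request.
 *
 * Roughly: the first packet's length comes from the LRH in the header
 * template, expected (TID) packets are limited by the space left in the
 * current TID pair, and everything else is capped at the fragment size
 * and the data remaining in the request. See the block comment below for
 * the details.
 */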
static inline u32 compute_data_length(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx)
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
	 * The minimum representable packet data length in a header is 4 bytes,
	 * therefore, when the data length request is less than 4 bytes, there's
	 * only one packet, and the packet data length is equal to that of the
	 * request data length.
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
		if (req->data_len < sizeof(u32))
			len = req->data_len;
		else
			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
			       (sizeof(tx->hdr) - 4));
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			PAGE_SIZE;
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one. */
		if (unlikely(!len) && ++req->tididx < req->n_tids &&
		    req->tids[req->tididx]) {
			tidlen = EXP_TID_GET(req->tids[req->tididx],
					     LEN) * PAGE_SIZE;
			req->tidoffset = 0;
			len = min_t(u32, tidlen, req->info.fragsize);
		}
		/*
		 * Since the TID pairs map entire pages, make sure that we
		 * are not going to try to send more data than we have
		 * remaining.
		 */
		len = min(len, req->data_len - req->sent);
	} else {
		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
	}
	trace_hfi1_sdma_user_compute_length(req->pq->dd,
					    req->pq->ctxt,
					    req->pq->subctxt,
					    req->info.comp_idx,
					    len);
	return len;
}

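/*
 * pad_len() - round a payload length up to the next 4-byte boundary,
 * e.g. pad_len(5) == 8 and pad_len(8) == 8. The padded length is what
 * goes on the wire, since the LRH/PBC lengths are expressed in dwords.
 */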
static inline u32 pad_len(u32 len)
{
	if (len & (sizeof(u32) - 1))
		len += sizeof(u32) - (len & (sizeof(u32) - 1));
	return len;
}

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}

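/*
 * user_sdma_txadd_ahg() - build the first packet of an AHG request.
 *
 * The first packet carries the full header: copy the template into the
 * txreq, patch the PBC length if it disagrees with the LRH, validate the
 * template and initialize the txreq in AHG-copy mode so the hardware can
 * derive the headers of the following packets from it.
 */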
static int user_sdma_txadd_ahg(struct user_sdma_request *req,
			       struct user_sdma_txreq *tx,
			       u32 datalen)
{
	int ret;
	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	/*
	 * Copy the request header into the tx header
	 * because the HW needs a cacheline-aligned
	 * address.
	 * This copy could be optimized out if the hdr
	 * member of user_sdma_request were also
	 * cacheline aligned.
	 */
	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
	}
	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
	if (ret)
		return ret;
	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
			      sizeof(tx->hdr) + datalen, req->ahg_idx,
			      0, NULL, 0, user_sdma_txreq_cb);
	if (ret)
		return ret;
	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
	if (ret)
		sdma_txclean(pq->dd, &tx->txreq);
	return ret;
}

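/*
 * user_sdma_send_pkts() - build and submit up to maxpkts packets.
 *
 * For each packet: allocate a txreq from the pq cache, compute the
 * payload length, set up the header (directly or via AHG), attach the
 * user pages and queue it on req->txps. The accumulated list is then
 * handed to sdma_send_txlist(). Returns 0 on success, typically -EBUSY
 * when the engine ring is full (the caller waits and retries), or
 * another negative errno on error.
 */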
static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
{
	int ret = 0;
	u16 count;
	unsigned npkts = 0;
	struct user_sdma_txreq *tx = NULL;
	struct hfi1_user_sdma_pkt_q *pq = NULL;
	struct user_sdma_iovec *iovec = NULL;

	if (!req->pq)
		return -EINVAL;

	pq = req->pq;

	/* If tx completion has reported an error, we are done. */
	if (READ_ONCE(req->has_error))
		return -EFAULT;

	/*
	 * Check if we might have sent the entire request already
	 */
	if (unlikely(req->seqnum == req->info.npkts)) {
		if (!list_empty(&req->txps))
			goto dosend;
		return ret;
	}

	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
		maxpkts = req->info.npkts - req->seqnum;

	while (npkts < maxpkts) {
		u32 datalen = 0;

		/*
		 * Check whether any of the completions have come back
		 * with errors. If so, we are not going to process any
		 * more packets from this request.
		 */
		if (READ_ONCE(req->has_error))
			return -EFAULT;

		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
		if (!tx)
			return -ENOMEM;

		tx->flags = 0;
		tx->req = req;
		INIT_LIST_HEAD(&tx->list);

		/*
		 * For the last packet set the ACK request
		 * and disable header suppression.
		 */
		if (req->seqnum == req->info.npkts - 1)
			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
				      TXREQ_FLAGS_REQ_DISABLE_SH);

		/*
		 * Calculate the payload size - this is min of the fragment
		 * (MTU) size or the remaining bytes in the request but only
		 * if we have payload data.
		 */
		if (req->data_len) {
			iovec = &req->iovs[req->iov_idx];
			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
				if (++req->iov_idx == req->data_iovs) {
					ret = -EFAULT;
					goto free_tx;
				}
				iovec = &req->iovs[req->iov_idx];
				WARN_ON(iovec->offset);
			}

			datalen = compute_data_length(req, tx);

			/*
			 * Disable header suppression for payloads <= 8 DWs.
			 * If there is an uncorrectable error in the receive
			 * data FIFO when the received payload size is less
			 * than or equal to 8 DWs, then RxDmaDataFifoRdUncErr
			 * is not reported. Instead, RHF.EccErr is set if the
			 * header is not suppressed.
			 */
			if (!datalen) {
				SDMA_DBG(req,
					 "Request has data but pkt len is 0");
				ret = -EFAULT;
				goto free_tx;
			} else if (datalen <= 32) {
				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
			}
		}

		if (req->ahg_idx >= 0) {
			if (!req->seqnum) {
				ret = user_sdma_txadd_ahg(req, tx, datalen);
				if (ret)
					goto free_tx;
			} else {
				int changes;

				changes = set_txreq_header_ahg(req, tx,
							       datalen);
				if (changes < 0) {
					ret = changes;
					goto free_tx;
				}
			}
		} else {
			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
					  datalen, user_sdma_txreq_cb);
			if (ret)
				goto free_tx;
			/*
			 * Modify the header for this packet. This only needs
			 * to be done if we are not going to use AHG. Otherwise,
			 * the HW will do it based on the changes we gave it
			 * during sdma_txinit_ahg().
			 */
			ret = set_txreq_header(req, tx, datalen);
			if (ret)
				goto free_txreq;
		}

		req->koffset += datalen;
		if (req_opcode(req->info.ctrl) == EXPECTED)
			req->tidoffset += datalen;
		req->sent += datalen;
		while (datalen) {
			ret = hfi1_add_pages_to_sdma_packet(req, tx, iovec,
							    &datalen);
			if (ret)
				goto free_txreq;
			iovec = &req->iovs[req->iov_idx];
		}
		list_add_tail(&tx->txreq.list, &req->txps);
		/*
		 * It is important to increment this here as it is used to
		 * generate the BTH.PSN and, therefore, can't be bulk-updated
		 * outside of the loop.
		 */
		tx->seqnum = req->seqnum++;
		npkts++;
	}
dosend:
	ret = sdma_send_txlist(req->sde,
			       iowait_get_ib_work(&pq->busy),
			       &req->txps, &count);
	req->seqsubmitted += count;
	if (req->seqsubmitted == req->info.npkts) {
		/*
		 * The txreq has already been submitted to the HW queue
		 * so we can free the AHG entry now. Corruption will not
		 * happen due to the sequential manner in which
		 * descriptors are processed.
		 */
		if (req->ahg_idx >= 0)
			sdma_ahg_free(req->sde, req->ahg_idx);
	}
	return ret;

free_txreq:
	sdma_txclean(pq->dd, &tx->txreq);
free_tx:
	kmem_cache_free(pq->txreq_cache, tx);
	return ret;
}

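/*
 * check_header_template() - sanity check the user-supplied header.
 *
 * Run on the first packet of a request only: verify the fragment size,
 * packet length alignment and MTU, and for expected (TID) requests make
 * sure the KDETH offset/TIDCtrl/TID fields agree with the first entry of
 * the TID array.
 */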
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen)
{
	/*
	 * Perform safety checks for any type of packet:
	 *    - transfer size is a multiple of 64 bytes
	 *    - packet length is a multiple of 4 bytes
	 *    - packet length is not larger than MTU size
	 *
	 * These checks are only done for the first packet of the
	 * transfer since the header is "given" to us by user space.
	 * For the remainder of the packets we compute the values.
	 */
	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
		return -EINVAL;

	if (req_opcode(req->info.ctrl) == EXPECTED) {
		/*
		 * The header is checked only on the first packet. Furthermore,
		 * we ensure that at least one TID entry is copied when the
		 * request is submitted. Therefore, we don't have to verify that
		 * tididx points to something sane.
		 */
		u32 tidval = req->tids[req->tididx],
			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
			tididx = EXP_TID_GET(tidval, IDX),
			tidctrl = EXP_TID_GET(tidval, CTRL),
			tidoff;
		__le32 kval = hdr->kdeth.ver_tid_offset;

		tidoff = KDETH_GET(kval, OFFSET) *
			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			   KDETH_OM_LARGE : KDETH_OM_SMALL);
		/*
		 * Expected receive packets have the following
		 * additional checks:
		 *     - offset is not larger than the TID size
		 *     - TIDCtrl values match between header and TID array
		 *     - TID indexes match between header and TID array
		 */
		if ((tidoff + datalen > tidlen) ||
		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
		    KDETH_GET(kval, TID) != tididx)
			return -EINVAL;
	}
	return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
	u32 val = be32_to_cpu(bthpsn),
		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
			0xffffffull),
		psn = val & mask;
	if (expct)
		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
}

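/*
 * set_txreq_header() - build the complete header for one packet.
 *
 * Used on the non-AHG path: copy the header template, fix up the PBC/LRH
 * lengths, advance BTH.PSN and KDETH.Offset, and for expected packets
 * refresh the TID fields, then add the header to the txreq.
 */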
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &tx->hdr;
	u8 omfactor; /* KDETH.OM */
	u16 pbclen;
	int ret;
	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

	/* Copy the header template to the request before modification */
	memcpy(hdr, &req->hdr, sizeof(*hdr));

	/*
	 * Check if the PBC and LRH length are mismatched. If so
	 * adjust both in the header.
	 */
	pbclen = le16_to_cpu(hdr->pbc[0]);
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		hdr->pbc[0] = cpu_to_le16(pbclen);
		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
		/*
		 * Third packet
		 * This is the first packet in the sequence that has
		 * a "static" size that can be used for the rest of
		 * the packets (besides the last one).
		 */
		if (unlikely(req->seqnum == 2)) {
			/*
			 * From this point on the lengths in both the
			 * PBC and LRH are the same until the last
			 * packet.
			 * Adjust the template so we don't have to update
			 * every packet
			 */
			req->hdr.pbc[0] = hdr->pbc[0];
			req->hdr.lrh[2] = hdr->lrh[2];
		}
	}
	/*
	 * We only have to modify the header if this is not the
	 * first packet in the request. Otherwise, we use the
	 * header given to us.
	 */
	if (unlikely(!req->seqnum)) {
		ret = check_header_template(req, hdr, lrhlen, datalen);
		if (ret)
			return ret;
		goto done;
	}

	hdr->bth[2] = cpu_to_be32(
		set_pkt_bth_psn(hdr->bth[2],
				(req_opcode(req->info.ctrl) == EXPECTED),
				req->seqnum));

	/* Set ACK request on last packet */
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		hdr->bth[2] |= cpu_to_be32(1UL << 31);

	/* Set the new offset */
	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
	/* Expected packets have to fill in the new TID information */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		tidval = req->tids[req->tididx];
		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
			KDETH_OM_SMALL_SHIFT;
		/* Set KDETH.TIDCtrl based on value for this TID. */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
			  EXP_TID_GET(tidval, CTRL));
		/* Set KDETH.TID based on value for this TID */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
			  EXP_TID_GET(tidval, IDX));
		/* Clear KDETH.SH when DISABLE_SH flag is set */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
		/*
		 * Set the KDETH.OFFSET and KDETH.OM based on size of
		 * transfer.
		 */
		trace_hfi1_sdma_user_tid_info(
			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
			req->tidoffset, req->tidoffset >> omfactor,
			omfactor != KDETH_OM_SMALL_SHIFT);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
			  req->tidoffset >> omfactor);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
			  omfactor != KDETH_OM_SMALL_SHIFT);
	}
done:
	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				    req->info.comp_idx, hdr, tidval);
	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}

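/*
 * set_txreq_header_ahg() - describe per-packet header changes for AHG.
 *
 * Instead of copying a full header, build an array of AHG field updates
 * (PBC/LRH lengths, BTH.PSN/A, KDETH offset and TID fields) that the
 * hardware applies to the header of the first packet. Returns the number
 * of AHG entries used, or a negative errno.
 */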
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 datalen)
{
	u32 ahg[AHG_KDETH_ARRAY_SIZE];
	int idx = 0;
	u8 omfactor; /* KDETH.OM */
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &req->hdr;
	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
	size_t array_size = ARRAY_SIZE(ahg);

	if (PBC2LRH(pbclen) != lrhlen) {
		/* PBC.PbcLengthDWs */
		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
		if (idx < 0)
			return idx;
		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
				     (__force u16)cpu_to_be16(lrhlen >> 2));
		if (idx < 0)
			return idx;
	}

	/*
	 * Do the common updates
	 */
	/* BTH.PSN and BTH.A */
	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		val32 |= 1UL << 31;
	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
			     (__force u16)cpu_to_be16(val32 >> 16));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
			     (__force u16)cpu_to_be16(val32 & 0xffff));
	if (idx < 0)
		return idx;
	/* KDETH.Offset */
	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
			     (__force u16)cpu_to_le16(req->koffset >> 16));
	if (idx < 0)
		return idx;
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		__le16 val;

		tidval = req->tids[req->tididx];

		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx])
				return -EINVAL;
			tidval = req->tids[req->tididx];
		}
		omfactor = ((EXP_TID_GET(tidval, LEN) *
				  PAGE_SIZE) >=
				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
				 KDETH_OM_SMALL_SHIFT;
		/* KDETH.OM and KDETH.OFFSET (TID) */
		idx = ahg_header_set(
				ahg, idx, array_size, 7, 0, 16,
				((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
				((req->tidoffset >> omfactor)
				& 0x7fff)));
		if (idx < 0)
			return idx;
		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
				   (EXP_TID_GET(tidval, IDX) & 0x3ff));

		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		} else {
			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					     AHG_KDETH_INTR_SHIFT));
		}

		idx = ahg_header_set(ahg, idx, array_size,
				     7, 16, 14, (__force u16)val);
		if (idx < 0)
			return idx;
	}

	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
					req->info.comp_idx, req->sde->this_idx,
					req->ahg_idx, ahg, idx, tidval);
	sdma_txinit_ahg(&tx->txreq,
			SDMA_TXREQ_F_USE_AHG,
			datalen, req->ahg_idx, idx,
			ahg, sizeof(req->hdr),
			user_sdma_txreq_cb);

	return idx;
}

/**
 * user_sdma_txreq_cb() - SDMA tx request completion callback.
 * @txreq: valid sdma tx request
 * @status: success/failure of request
 *
 * Called when the SDMA progress state machine gets notification that
 * the SDMA descriptors for this tx request have been processed by the
 * DMA engine. Called in interrupt context.
 * Only do work on completed sequences.
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);
	struct user_sdma_request *req;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	enum hfi1_sdma_comp_state state = COMPLETE;

	if (!tx->req)
		return;

	req = tx->req;
	pq = req->pq;
	cq = req->cq;

	if (status != SDMA_TXREQ_S_OK) {
		SDMA_DBG(req, "SDMA completion with error %d",
			 status);
		WRITE_ONCE(req->has_error, 1);
		state = ERROR;
	}

	req->seqcomp = tx->seqnum;
	kmem_cache_free(pq->txreq_cache, tx);

	/* sequence isn't complete?  We are done */
	if (req->seqcomp != req->info.npkts - 1)
		return;

	user_sdma_free_request(req);
	set_comp_state(pq, cq, req->info.comp_idx, state, status);
	pq_update(pq);
}

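/*
 * pq_update() - drop one request from the packet queue's count and wake
 * anyone (e.g. hfi1_user_sdma_free_queues()) waiting for it to drain.
 */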
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
	if (atomic_dec_and_test(&pq->n_reqs))
		wake_up(&pq->wait);
}

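/*
 * user_sdma_free_request() - release a request's remaining resources:
 * any txreqs still on the req->txps list, the copied TID array, and the
 * request's slot in the in-use bitmap.
 */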
static void user_sdma_free_request(struct user_sdma_request *req)
{
	if (!list_empty(&req->txps)) {
		struct sdma_txreq *t, *p;

		list_for_each_entry_safe(t, p, &req->txps, list) {
			struct user_sdma_txreq *tx =
				container_of(t, struct user_sdma_txreq, txreq);
			list_del_init(&t->list);
			sdma_txclean(req->pq->dd, t);
			kmem_cache_free(req->pq->txreq_cache, tx);
		}
	}

	kfree(req->tids);
	clear_bit(req->info.comp_idx, req->pq->req_in_use);
}

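/*
 * set_comp_state() - publish the final state of a request to user space.
 *
 * The completion ring is shared with user space (it is allocated with
 * vmalloc_user()); write the error code (if any) before the status so a
 * reader that sees ERROR also sees a valid errcode.
 */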
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret)
{
	if (state == ERROR)
		cq->comps[idx].errcode = -ret;
	smp_wmb(); /* make sure errcode is visible first */
	cq->comps[idx].status = state;
	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
					idx, state, ret);
}