// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>

#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>

#include "siw.h"
#include "siw_verbs.h"
#include "siw_mem.h"

/*
 * siw_rx_umem()
 *
 * Receive data of @len into target referenced by @dest_addr.
 *
 * @srx:	Receive Context
 * @umem:	siw representation of target memory
 * @dest_addr:	user virtual address
 * @len:	number of bytes to place
 */
static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
		       u64 dest_addr, int len)
{
	int copied = 0;

	while (len) {
		struct page *p;
		int pg_off, bytes, rv;
		void *dest;

		p = siw_get_upage(umem, dest_addr);
		if (unlikely(!p)) {
			pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
				__func__, qp_id(rx_qp(srx)),
				(void *)(uintptr_t)dest_addr,
				(void *)(uintptr_t)umem->fp_addr);
			/* siw internal error */
			srx->skb_copied += copied;
			srx->skb_new -= copied;

			return -EFAULT;
		}
		pg_off = dest_addr & ~PAGE_MASK;
		bytes = min(len, (int)PAGE_SIZE - pg_off);

		siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes);

		dest = kmap_atomic(p);
		rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
				   bytes);

		if (unlikely(rv)) {
			kunmap_atomic(dest);
			srx->skb_copied += copied;
			srx->skb_new -= copied;

			pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
				qp_id(rx_qp(srx)), __func__, len, p, rv);

			return -EFAULT;
		}
		if (srx->mpa_crc_hd) {
			if (rdma_is_kernel_res(&rx_qp(srx)->base_qp.res)) {
				crypto_shash_update(srx->mpa_crc_hd,
					(u8 *)(dest + pg_off), bytes);
				kunmap_atomic(dest);
			} else {
				kunmap_atomic(dest);
				/*
				 * Do CRC on original, not target buffer.
				 * Some user land applications may
				 * concurrently write the target buffer,
				 * which would yield a broken CRC.
				 * Walking the skb twice is very inefficient.
				 * Folding the CRC into skb_copy_bits()
				 * would be much better, but is currently
				 * not supported.
				 */
				siw_crc_skb(srx, bytes);
			}
		} else {
			kunmap_atomic(dest);
		}
		srx->skb_offset += bytes;
		copied += bytes;
		len -= bytes;
		dest_addr += bytes;
		pg_off = 0;
	}
	srx->skb_copied += copied;
	srx->skb_new -= copied;

	return copied;
}

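/*
 * siw_rx_kva()
 *
 * Receive data of @len into target referenced by kernel virtual
 * address @kva. Returns the number of bytes placed, or a negative
 * value if copying from the socket buffer failed.
 */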
static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
{
	int rv;

	siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);

	rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
	if (unlikely(rv)) {
		pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
			qp_id(rx_qp(srx)), __func__, len, kva, rv);

		return rv;
	}
	if (srx->mpa_crc_hd)
		crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);

	srx->skb_offset += len;
	srx->skb_copied += len;
	srx->skb_new -= len;

	return len;
}

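/*
 * siw_rx_pbl()
 *
 * Receive data of @len into memory referenced by a physical buffer
 * list. Resolves one PBL chunk at a time via siw_pbl_get_buffer()
 * and copies through siw_rx_kva(). May place fewer than @len bytes
 * if address resolution or copying fails.
 */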
static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
		      struct siw_mem *mem, u64 addr, int len)
{
	struct siw_pbl *pbl = mem->pbl;
	u64 offset = addr - mem->va;
	int copied = 0;

	while (len) {
		int bytes;
		dma_addr_t buf_addr =
			siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx);
		if (!buf_addr)
			break;

		bytes = min(bytes, len);
		if (siw_rx_kva(srx, (void *)(uintptr_t)buf_addr, bytes) ==
		    bytes) {
			copied += bytes;
			offset += bytes;
			len -= bytes;
		} else {
			break;
		}
	}
	return copied;
}

/*
 * siw_rresp_check_ntoh()
 *
 * Check incoming RRESP fragment header against expected
 * header values and update expected values for potential next
 * fragment.
 *
 * NOTE: This function must be called only if a RRESP DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
 */
static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
				struct siw_rx_fpdu *frx)
{
	struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
	struct siw_wqe *wqe = &frx->wqe_active;
	enum ddp_ecode ecode;

	u32 sink_stag = be32_to_cpu(rresp->sink_stag);
	u64 sink_to = be64_to_cpu(rresp->sink_to);

	if (frx->first_ddp_seg) {
		srx->ddp_stag = wqe->sqe.sge[0].lkey;
		srx->ddp_to = wqe->sqe.sge[0].laddr;
		frx->pbl_idx = 0;
	}
	/* Below checks extend beyond the semantics of DDP, and
	 * into RDMAP:
	 * We check if the read response matches exactly the
	 * read request which was sent to the remote peer to
	 * trigger this read response. RFC5040/5041 do not
	 * always have a proper error code for the detected
	 * error cases. We choose 'base or bounds error' for
	 * cases where the inbound STag is valid, but offset
	 * or length do not match our response receive state.
	 */
	if (unlikely(srx->ddp_stag != sink_stag)) {
		pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
			qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
		ecode = DDP_ECODE_T_INVALID_STAG;
		goto error;
	}
	if (unlikely(srx->ddp_to != sink_to)) {
		pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
			qp_id(rx_qp(srx)), (unsigned long long)sink_to,
			(unsigned long long)srx->ddp_to);
		ecode = DDP_ECODE_T_BASE_BOUNDS;
		goto error;
	}
	if (unlikely(!frx->more_ddp_segs &&
		     (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
		pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
			qp_id(rx_qp(srx)),
			wqe->processed + srx->fpdu_part_rem, wqe->bytes);
		ecode = DDP_ECODE_T_BASE_BOUNDS;
		goto error;
	}
	return 0;
error:
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
	return -EINVAL;
}

/*
 * siw_write_check_ntoh()
 *
 * Check incoming WRITE fragment header against expected
 * header values and update expected values for potential next
 * fragment
 *
 * NOTE: This function must be called only if a WRITE DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
 */
static int siw_write_check_ntoh(struct siw_rx_stream *srx,
				struct siw_rx_fpdu *frx)
{
	struct iwarp_rdma_write *write = &srx->hdr.rwrite;
	enum ddp_ecode ecode;

	u32 sink_stag = be32_to_cpu(write->sink_stag);
	u64 sink_to = be64_to_cpu(write->sink_to);

	if (frx->first_ddp_seg) {
		srx->ddp_stag = sink_stag;
		srx->ddp_to = sink_to;
		frx->pbl_idx = 0;
	} else {
		if (unlikely(srx->ddp_stag != sink_stag)) {
			pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
				qp_id(rx_qp(srx)), sink_stag,
				srx->ddp_stag);
			ecode = DDP_ECODE_T_INVALID_STAG;
			goto error;
		}
		if (unlikely(srx->ddp_to != sink_to)) {
			pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
				qp_id(rx_qp(srx)),
				(unsigned long long)sink_to,
				(unsigned long long)srx->ddp_to);
			ecode = DDP_ECODE_T_BASE_BOUNDS;
			goto error;
		}
	}
	return 0;
error:
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
	return -EINVAL;
}

/*
 * siw_send_check_ntoh()
 *
 * Check incoming SEND fragment header against expected
 * header values and update expected MSN if no next
 * fragment expected
 *
 * NOTE: This function must be called only if a SEND DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
 */
static int siw_send_check_ntoh(struct siw_rx_stream *srx,
			       struct siw_rx_fpdu *frx)
{
	struct iwarp_send_inv *send = &srx->hdr.send_inv;
	struct siw_wqe *wqe = &frx->wqe_active;
	enum ddp_ecode ecode;

	u32 ddp_msn = be32_to_cpu(send->ddp_msn);
	u32 ddp_mo = be32_to_cpu(send->ddp_mo);
	u32 ddp_qn = be32_to_cpu(send->ddp_qn);

	if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
		pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
			qp_id(rx_qp(srx)), ddp_qn);
		ecode = DDP_ECODE_UT_INVALID_QN;
		goto error;
	}
	if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
		pr_warn("siw: [QP %u]: send msn: %u != %u\n",
			qp_id(rx_qp(srx)), ddp_msn,
			srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
		ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
		goto error;
	}
	if (unlikely(ddp_mo != wqe->processed)) {
		pr_warn("siw: [QP %u], send mo: %u != %u\n",
			qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
		ecode = DDP_ECODE_UT_INVALID_MO;
		goto error;
	}
	if (frx->first_ddp_seg) {
		/* initialize user memory write position */
		frx->sge_idx = 0;
		frx->sge_off = 0;
		frx->pbl_idx = 0;

		/* only valid for SEND_INV and SEND_SE_INV operations */
		srx->inval_stag = be32_to_cpu(send->inval_stag);
	}
	if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
		siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
			   wqe->bytes, wqe->processed, srx->fpdu_part_rem);
		wqe->wc_status = SIW_WC_LOC_LEN_ERR;
		ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
		goto error;
	}
	return 0;
error:
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
	return -EINVAL;
}

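/*
 * siw_rqe_get()
 *
 * Fetch the next valid receive queue element from the QP's receive
 * queue or, if attached, from the shared receive queue. The RQE
 * content is copied into the rx context's work queue element, so the
 * queue slot can be reused by the application immediately. For an
 * armed SRQ, also tests the SRQ limit and signals
 * IB_EVENT_SRQ_LIMIT_REACHED if it got reached.
 */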
static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
{
	struct siw_rqe *rqe;
	struct siw_srq *srq;
	struct siw_wqe *wqe = NULL;
	bool srq_event = false;
	unsigned long flags;

	srq = qp->srq;
	if (srq) {
		spin_lock_irqsave(&srq->lock, flags);
		if (unlikely(!srq->num_rqe))
			goto out;

		rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
	} else {
		if (unlikely(!qp->recvq))
			goto out;

		rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
	}
	if (likely(rqe->flags == SIW_WQE_VALID)) {
		int num_sge = rqe->num_sge;

		if (likely(num_sge <= SIW_MAX_SGE)) {
			int i = 0;

			wqe = rx_wqe(&qp->rx_untagged);
			rx_type(wqe) = SIW_OP_RECEIVE;
			wqe->wr_status = SIW_WR_INPROGRESS;
			wqe->bytes = 0;
			wqe->processed = 0;

			wqe->rqe.id = rqe->id;
			wqe->rqe.num_sge = num_sge;

			while (i < num_sge) {
				wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
				wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
				wqe->rqe.sge[i].length = rqe->sge[i].length;
				wqe->bytes += wqe->rqe.sge[i].length;
				wqe->mem[i] = NULL;
				i++;
			}
			/* can be re-used by appl */
			smp_store_mb(rqe->flags, 0);
		} else {
			siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
			if (srq)
				spin_unlock_irqrestore(&srq->lock, flags);
			return NULL;
		}
		if (!srq) {
			qp->rq_get++;
		} else {
			if (srq->armed) {
				/* Test SRQ limit */
				u32 off = (srq->rq_get + srq->limit) %
					  srq->num_rqe;
				struct siw_rqe *rqe2 = &srq->recvq[off];

				if (!(rqe2->flags & SIW_WQE_VALID)) {
					srq->armed = false;
					srq_event = true;
				}
			}
			srq->rq_get++;
		}
	}
out:
	if (srq) {
		spin_unlock_irqrestore(&srq->lock, flags);
		if (srq_event)
			siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
	}
	return wqe;
}

/*
 * siw_proc_send:
 *
 * Process one incoming SEND and place data into memory referenced by
 * receive wqe.
 *
 * Function supports partially received sends (suspending/resuming
 * current receive wqe processing)
 *
 * return value:
 *	0:       reached the end of a DDP segment
 *	-EAGAIN: to be called again to finish the DDP segment
 */
int siw_proc_send(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_untagged;
	struct siw_wqe *wqe;
	u32 data_bytes; /* all data bytes available */
	u32 rcvd_bytes; /* sum of data bytes rcvd */
	int rv = 0;

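	/*
	 * The first DDP segment of a SEND consumes a fresh receive
	 * queue element; subsequent segments resume the receive WQE
	 * already in progress.
	 */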
	if (frx->first_ddp_seg) {
		wqe = siw_rqe_get(qp);
		if (unlikely(!wqe)) {
			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_UNTAGGED_BUF,
					   DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
			return -ENOENT;
		}
	} else {
		wqe = rx_wqe(frx);
	}
	if (srx->state == SIW_GET_DATA_START) {
		rv = siw_send_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
		if (!srx->fpdu_part_rem) /* zero length SEND */
			return 0;
	}
	data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
	rcvd_bytes = 0;

	/* A zero length SEND will skip below loop */
	while (data_bytes) {
		struct ib_pd *pd;
		struct siw_mem **mem, *mem_p;
		struct siw_sge *sge;
		u32 sge_bytes; /* data bytes avail for SGE */

		sge = &wqe->rqe.sge[frx->sge_idx];

		if (!sge->length) {
			/* just skip empty sge's */
			frx->sge_idx++;
			frx->sge_off = 0;
			frx->pbl_idx = 0;
			continue;
		}
		sge_bytes = min(data_bytes, sge->length - frx->sge_off);
		mem = &wqe->mem[frx->sge_idx];

		/*
		 * check with QP's PD if no SRQ present, SRQ's PD otherwise
		 */
		pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;

		rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
				   frx->sge_off, sge_bytes);
		if (unlikely(rv)) {
			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_CATASTROPHIC,
					   DDP_ECODE_CATASTROPHIC, 0);

			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
			break;
		}
		mem_p = *mem;
		if (mem_p->mem_obj == NULL)
			rv = siw_rx_kva(srx,
				(void *)(uintptr_t)(sge->laddr + frx->sge_off),
				sge_bytes);
		else if (!mem_p->is_pbl)
			rv = siw_rx_umem(srx, mem_p->umem,
					 sge->laddr + frx->sge_off, sge_bytes);
		else
			rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
					sge->laddr + frx->sge_off, sge_bytes);

		if (unlikely(rv != sge_bytes)) {
			wqe->processed += rcvd_bytes;

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_CATASTROPHIC,
					   DDP_ECODE_CATASTROPHIC, 0);
			return -EINVAL;
		}
		frx->sge_off += rv;

		if (frx->sge_off == sge->length) {
			frx->sge_idx++;
			frx->sge_off = 0;
			frx->pbl_idx = 0;
		}
		data_bytes -= rv;
		rcvd_bytes += rv;

		srx->fpdu_part_rem -= rv;
		srx->fpdu_part_rcvd += rv;
	}
	wqe->processed += rcvd_bytes;

	if (!srx->fpdu_part_rem)
		return 0;

	return (rv < 0) ? rv : -EAGAIN;
}

/*
 * siw_proc_write:
 *
 * Place incoming WRITE after referencing and checking target buffer
 *
 * Function supports partially received WRITEs (suspending/resuming
 * current receive processing)
 *
 * return value:
 *	0:       reached the end of a DDP segment
 *	-EAGAIN: to be called again to finish the DDP segment
 */
int siw_proc_write(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_tagged;
	struct siw_mem *mem;
	int bytes, rv;

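	/*
	 * A WRITE places data directly into target memory named by the
	 * tagged DDP header; no receive queue element is consumed. The
	 * target memory object is resolved from the STag on the first
	 * DDP segment and kept referenced in the rx context.
	 */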
	if (srx->state == SIW_GET_DATA_START) {
		if (!srx->fpdu_part_rem) /* zero length WRITE */
			return 0;

		rv = siw_write_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
	}
	bytes = min(srx->fpdu_part_rem, srx->skb_new);

	if (frx->first_ddp_seg) {
		struct siw_wqe *wqe = rx_wqe(frx);

		rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
		if (unlikely(!rx_mem(frx))) {
			siw_dbg_qp(qp,
				   "sink stag not found/invalid, stag 0x%08x\n",
				   srx->ddp_stag);

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_TAGGED_BUF,
					   DDP_ECODE_T_INVALID_STAG, 0);
			return -EINVAL;
		}
		wqe->rqe.num_sge = 1;
		rx_type(wqe) = SIW_OP_WRITE;
		wqe->wr_status = SIW_WR_INPROGRESS;
	}
	mem = rx_mem(frx);

	/*
	 * Check if application re-registered memory with different
	 * key field of STag.
	 */
	if (unlikely(mem->stag != srx->ddp_stag)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_TAGGED_BUF,
				   DDP_ECODE_T_INVALID_STAG, 0);
		return -EINVAL;
	}
	rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
			   IB_ACCESS_REMOTE_WRITE, bytes);
	if (unlikely(rv)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
				   0);

		siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);

		return -EINVAL;
	}

	if (mem->mem_obj == NULL)
		rv = siw_rx_kva(srx,
			(void *)(uintptr_t)(srx->ddp_to + srx->fpdu_part_rcvd),
			bytes);
	else if (!mem->is_pbl)
		rv = siw_rx_umem(srx, mem->umem,
				 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
	else
		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
				srx->ddp_to + srx->fpdu_part_rcvd, bytes);

	if (unlikely(rv != bytes)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_CATASTROPHIC,
				   DDP_ECODE_CATASTROPHIC, 0);
		return -EINVAL;
	}
	srx->fpdu_part_rem -= rv;
	srx->fpdu_part_rcvd += rv;

	if (!srx->fpdu_part_rem) {
		srx->ddp_to += srx->fpdu_part_rcvd;
		return 0;
	}
	return -EAGAIN;
}

/*
 * Inbound RREQ's cannot carry user data.
 */
int siw_proc_rreq(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;

	if (!srx->fpdu_part_rem)
		return 0;

	pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
		be16_to_cpu(srx->hdr.ctrl.mpa_len));

	return -EPROTO;
}

/*
 * siw_init_rresp:
 *
 * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
 * Put it at the tail of the IRQ, if there is another WQE currently in
 * transmit processing. If not, make it the current WQE to be processed
 * and schedule transmit processing.
 *
 * Can be called from softirq context and from process
 * context (RREAD socket loopback case!)
 *
 * return value:
 *	0:      success,
 *		failure code otherwise
 */

static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
{
	struct siw_wqe *tx_work = tx_wqe(qp);
	struct siw_sqe *resp;

	uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
		 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
	uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
		 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
		 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
		 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);

	int run_sq = 1, rv = 0;
	unsigned long flags;

	if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_UNTAGGED_BUF,
				   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
		return -EPROTO;
	}
	spin_lock_irqsave(&qp->sq_lock, flags);

	if (unlikely(!qp->attrs.irq_size)) {
		run_sq = 0;
		goto error_irq;
	}
	if (tx_work->wr_status == SIW_WR_IDLE) {
		/*
		 * immediately schedule READ response w/o
		 * consuming IRQ entry: IRQ must be empty.
		 */
		tx_work->processed = 0;
		tx_work->mem[0] = NULL;
		tx_work->wr_status = SIW_WR_QUEUED;
		resp = &tx_work->sqe;
	} else {
		resp = irq_alloc_free(qp);
		run_sq = 0;
	}
	if (likely(resp)) {
		resp->opcode = SIW_OP_READ_RESPONSE;

		resp->sge[0].length = length;
		resp->sge[0].laddr = laddr;
		resp->sge[0].lkey = lkey;

		/* Keep aside message sequence number for potential
		 * error reporting during Read Response generation.
		 */
		resp->sge[1].length = msn;

		resp->raddr = raddr;
		resp->rkey = rkey;
		resp->num_sge = length ? 1 : 0;

		/* RRESP now valid as current TX wqe or placed into IRQ */
		smp_store_mb(resp->flags, SIW_WQE_VALID);
	} else {
error_irq:
		pr_warn("siw: [QP %u]: IRQ exceeded or null, size %d\n",
			qp_id(qp), qp->attrs.irq_size);

		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
				   RDMAP_ETYPE_REMOTE_OPERATION,
				   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
		rv = -EPROTO;
	}

	spin_unlock_irqrestore(&qp->sq_lock, flags);

	if (run_sq)
		rv = siw_sq_start(qp);

	return rv;
}

/*
 * Only called at start of Read.Response processing.
 * Transfer pending Read from tip of ORQ into current rx wqe,
 * but keep ORQ entry valid until Read.Response processing done.
 * No queue locking needed.
 */
static int siw_orqe_start_rx(struct siw_qp *qp)
{
	struct siw_sqe *orqe;
	struct siw_wqe *wqe = NULL;

	if (unlikely(!qp->attrs.orq_size))
		return -EPROTO;

	/* make sure ORQ indices are current */
	smp_mb();

	orqe = orq_get_current(qp);
	if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
		/* RRESP is a TAGGED RDMAP operation */
		wqe = rx_wqe(&qp->rx_tagged);
		wqe->sqe.id = orqe->id;
		wqe->sqe.opcode = orqe->opcode;
		wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
		wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
		wqe->sqe.sge[0].length = orqe->sge[0].length;
		wqe->sqe.flags = orqe->flags;
		wqe->sqe.num_sge = 1;
		wqe->bytes = orqe->sge[0].length;
		wqe->processed = 0;
		wqe->mem[0] = NULL;
		/* make sure WQE is completely written before valid */
		smp_wmb();
		wqe->wr_status = SIW_WR_INPROGRESS;

		return 0;
	}
	return -EPROTO;
}

/*
 * siw_proc_rresp:
 *
 * Place incoming RRESP data into memory referenced by RREQ WQE
 * which is at the tip of the ORQ
 *
 * Function supports partially received RRESP's (suspending/resuming
 * current receive processing)
 */
int siw_proc_rresp(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_tagged;
	struct siw_wqe *wqe = rx_wqe(frx);
	struct siw_mem **mem, *mem_p;
	struct siw_sge *sge;
	int bytes, rv;

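	/*
	 * The first DDP segment fetches the matching Read.Request from
	 * the ORQ tip and validates the RRESP header against it; later
	 * segments resume the in-progress WQE.
	 */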
	if (frx->first_ddp_seg) {
		if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
			pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
				qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
			rv = -EPROTO;
			goto error_term;
		}
		/*
		 * fetch pending RREQ from orq
		 */
		rv = siw_orqe_start_rx(qp);
		if (rv) {
			pr_warn("siw: [QP %u]: ORQ empty, size %d\n",
				qp_id(qp), qp->attrs.orq_size);
			goto error_term;
		}
		rv = siw_rresp_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
	} else {
		if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
			pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
				qp_id(qp), wqe->wr_status);
			rv = -EPROTO;
			goto error_term;
		}
	}
	if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
		return 0;

	sge = wqe->sqe.sge; /* there is only one */
	mem = &wqe->mem[0];

	if (!(*mem)) {
		/*
		 * check target memory which resolves memory on first fragment
		 */
		rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
				   wqe->bytes);
		if (unlikely(rv)) {
			siw_dbg_qp(qp, "target mem check: %d\n", rv);
			wqe->wc_status = SIW_WC_LOC_PROT_ERR;

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_TAGGED_BUF,
					   siw_tagged_error(-rv), 0);

			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);

			return -EINVAL;
		}
	}
	mem_p = *mem;

	bytes = min(srx->fpdu_part_rem, srx->skb_new);

	if (mem_p->mem_obj == NULL)
		rv = siw_rx_kva(srx,
			(void *)(uintptr_t)(sge->laddr + wqe->processed),
			bytes);
	else if (!mem_p->is_pbl)
		rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
				 bytes);
	else
		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
				sge->laddr + wqe->processed, bytes);
	if (rv != bytes) {
		wqe->wc_status = SIW_WC_GENERAL_ERR;
		rv = -EINVAL;
		goto error_term;
	}
	srx->fpdu_part_rem -= rv;
	srx->fpdu_part_rcvd += rv;
	wqe->processed += rv;

	if (!srx->fpdu_part_rem) {
		srx->ddp_to += srx->fpdu_part_rcvd;
		return 0;
	}
	return -EAGAIN;

error_term:
	siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
			   DDP_ECODE_CATASTROPHIC, 0);
	return rv;
}

int siw_proc_terminate(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct sk_buff *skb = srx->skb;
	struct iwarp_terminate *term = &srx->hdr.terminate;
	union iwarp_hdr term_info;
	u8 *infop = (u8 *)&term_info;
	enum rdma_opcode op;
	u16 to_copy = sizeof(struct iwarp_ctrl);

	pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
		__rdmap_term_layer(term), __rdmap_term_etype(term),
		__rdmap_term_ecode(term));

	if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
	    be32_to_cpu(term->ddp_msn) !=
		    qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
	    be32_to_cpu(term->ddp_mo) != 0) {
		pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
			be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
			be32_to_cpu(term->ddp_mo));
		return -ECONNRESET;
	}
	/*
	 * Receive remaining pieces of TERM if indicated
	 */
	if (!term->flag_m)
		return -ECONNRESET;

	/* Do not bother to reassemble a network-fragmented
	 * TERM message
	 */
	if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
		return -ECONNRESET;

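	/*
	 * A TERMINATE may carry a copy of the header of the offending
	 * FPDU. Peek at its control word first to learn the full
	 * header length to consume.
	 */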
	memset(infop, 0, sizeof(term_info));

	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);

	op = __rdmap_get_opcode(&term_info.ctrl);
	if (op >= RDMAP_TERMINATE)
		goto out;

	infop += to_copy;
	srx->skb_offset += to_copy;
	srx->skb_new -= to_copy;
	srx->skb_copied += to_copy;
	srx->fpdu_part_rcvd += to_copy;
	srx->fpdu_part_rem -= to_copy;

	to_copy = iwarp_pktinfo[op].hdr_len - to_copy;

	/* Again, no network fragmented TERM's */
	if (to_copy + MPA_CRC_SIZE > srx->skb_new)
		return -ECONNRESET;

	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);

	if (term->flag_r) {
		siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
			   op, be16_to_cpu(term_info.ctrl.mpa_len),
			   term->flag_m ? "valid" : "invalid");
	} else if (term->flag_d) {
		siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
			   op, be16_to_cpu(term_info.ctrl.mpa_len),
			   term->flag_m ? "valid" : "invalid");
	}
out:
	srx->skb_new -= to_copy;
	srx->skb_offset += to_copy;
	srx->skb_copied += to_copy;
	srx->fpdu_part_rcvd += to_copy;
	srx->fpdu_part_rem -= to_copy;

	return -ECONNRESET;
}

static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
{
	struct sk_buff *skb = srx->skb;
	int avail = min(srx->skb_new, srx->fpdu_part_rem);
	u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
	__wsum crc_in, crc_own = 0;

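	/*
	 * Receive the trailer (up to three pad bytes plus 4-byte CRC)
	 * in one pass: @tbuf is positioned so that the CRC lands at
	 * srx->trailer.crc, with any pad bytes directly before it.
	 */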
	siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
		   srx->fpdu_part_rem, srx->skb_new, srx->pad);

	skb_copy_bits(skb, srx->skb_offset, tbuf, avail);

	srx->skb_new -= avail;
	srx->skb_offset += avail;
	srx->skb_copied += avail;
	srx->fpdu_part_rem -= avail;

	if (srx->fpdu_part_rem)
		return -EAGAIN;

	if (!srx->mpa_crc_hd)
		return 0;

	if (srx->pad)
		crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
	/*
	 * CRC32 is computed, transmitted and received directly in NBO,
	 * so there's never a reason to convert byte order.
	 */
	crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
	crc_in = (__force __wsum)srx->trailer.crc;

	if (unlikely(crc_in != crc_own)) {
		pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
			crc_in, crc_own, qp->rx_stream.rdmap_op);

		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
				   LLP_ETYPE_MPA,
				   LLP_ECODE_RECEIVED_CRC, 0);
		return -EINVAL;
	}
	return 0;
}

#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)

static int siw_get_hdr(struct siw_rx_stream *srx)
{
	struct sk_buff *skb = srx->skb;
	struct siw_qp *qp = rx_qp(srx);
	struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
	struct siw_rx_fpdu *frx;
	u8 opcode;
	int bytes;

	if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
		/*
		 * copy a minimum sized (tagged) DDP frame control part
		 */
		bytes = min_t(int, srx->skb_new,
			      MIN_DDP_HDR - srx->fpdu_part_rcvd);

		skb_copy_bits(skb, srx->skb_offset,
			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);

		srx->fpdu_part_rcvd += bytes;

		srx->skb_new -= bytes;
		srx->skb_offset += bytes;
		srx->skb_copied += bytes;

		if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
			return -EAGAIN;

		if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
			enum ddp_etype etype;
			enum ddp_ecode ecode;

			pr_warn("siw: received ddp version unsupported %d\n",
				__ddp_get_version(c_hdr));

			if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
				etype = DDP_ETYPE_TAGGED_BUF;
				ecode = DDP_ECODE_T_VERSION;
			} else {
				etype = DDP_ETYPE_UNTAGGED_BUF;
				ecode = DDP_ECODE_UT_VERSION;
			}
			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
					   etype, ecode, 0);
			return -EINVAL;
		}
		if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
			pr_warn("siw: received rdmap version unsupported %d\n",
				__rdmap_get_version(c_hdr));

			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
					   RDMAP_ETYPE_REMOTE_OPERATION,
					   RDMAP_ECODE_VERSION, 0);
			return -EINVAL;
		}
		opcode = __rdmap_get_opcode(c_hdr);

		if (opcode > RDMAP_TERMINATE) {
			pr_warn("siw: received unknown packet type %u\n",
				opcode);

			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
					   RDMAP_ETYPE_REMOTE_OPERATION,
					   RDMAP_ECODE_OPCODE, 0);
			return -EINVAL;
		}
		siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
	} else {
		opcode = __rdmap_get_opcode(c_hdr);
	}
	set_rx_fpdu_context(qp, opcode);
	frx = qp->rx_fpdu;

	/*
	 * Figure out len of current hdr: variable length of
	 * iwarp hdr may force us to copy hdr information in
	 * two steps. Only tagged DDP messages are already
	 * completely received.
	 */
	if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
		int hdrlen = iwarp_pktinfo[opcode].hdr_len;

		bytes = min_t(int, hdrlen - MIN_DDP_HDR, srx->skb_new);

		skb_copy_bits(skb, srx->skb_offset,
			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);

		srx->fpdu_part_rcvd += bytes;

		srx->skb_new -= bytes;
		srx->skb_offset += bytes;
		srx->skb_copied += bytes;

		if (srx->fpdu_part_rcvd < hdrlen)
			return -EAGAIN;
	}

	/*
	 * DDP/RDMAP header receive completed. Check if the current
	 * DDP segment starts a new RDMAP message or continues a previously
	 * started RDMAP message.
	 *
	 * Alternating reception of DDP segments (or FPDUs) from incomplete
	 * tagged and untagged RDMAP messages is supported, as long as
	 * the current tagged or untagged message gets eventually completed
	 * w/o intersection from another message of the same type
	 * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
	 * but not by a READ RESPONSE etc.
	 */
	if (srx->mpa_crc_hd) {
		/*
		 * Restart CRC computation
		 */
		crypto_shash_init(srx->mpa_crc_hd);
		crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
				    srx->fpdu_part_rcvd);
	}
	if (frx->more_ddp_segs) {
		frx->first_ddp_seg = 0;
		if (frx->prev_rdmap_op != opcode) {
			pr_warn("siw: packet intersection: %u : %u\n",
				frx->prev_rdmap_op, opcode);
			/*
			 * The last inbound RDMA operation of same type
			 * (tagged or untagged) is left unfinished.
			 * To complete it in error, make it the current
			 * operation again, even with the header already
			 * overwritten. For error handling, only the opcode
			 * and current rx context are relevant.
			 */
			set_rx_fpdu_context(qp, frx->prev_rdmap_op);
			__rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
			return -EPROTO;
		}
	} else {
		frx->prev_rdmap_op = opcode;
		frx->first_ddp_seg = 1;
	}
	frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;

	return 0;
}

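/*
 * siw_check_tx_fence()
 *
 * Called after Read.Response processing completed. Frees the consumed
 * ORQ entry and, if SQ processing was fenced (waiting either for ORQ
 * space or for all outstanding reads to complete), resumes
 * transmission.
 */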
static int siw_check_tx_fence(struct siw_qp *qp)
{
	struct siw_wqe *tx_waiting = tx_wqe(qp);
	struct siw_sqe *rreq;
	int resume_tx = 0, rv = 0;
	unsigned long flags;

	spin_lock_irqsave(&qp->orq_lock, flags);

	/* free current orq entry */
	rreq = orq_get_current(qp);
	WRITE_ONCE(rreq->flags, 0);

	qp->orq_get++;

	if (qp->tx_ctx.orq_fence) {
		if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
			pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
				qp_id(qp), tx_waiting->wr_status);
			rv = -EPROTO;
			goto out;
		}
		/* resume SQ processing, if possible */
		if (tx_waiting->sqe.opcode == SIW_OP_READ ||
		    tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {

			/* SQ processing was stopped because of a full ORQ */
			rreq = orq_get_free(qp);
			if (unlikely(!rreq)) {
				pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
				rv = -EPROTO;
				goto out;
			}
			siw_read_to_orq(rreq, &tx_waiting->sqe);

			qp->orq_put++;
			qp->tx_ctx.orq_fence = 0;
			resume_tx = 1;

		} else if (siw_orq_empty(qp)) {
			/*
			 * SQ processing was stopped by fenced work request.
			 * Resume since all previous Read's are now completed.
			 */
			qp->tx_ctx.orq_fence = 0;
			resume_tx = 1;
		}
	}
out:
	spin_unlock_irqrestore(&qp->orq_lock, flags);

	if (resume_tx)
		rv = siw_sq_start(qp);

	return rv;
}

/*
 * siw_rdmap_complete()
 *
 * Complete processing of an RDMA message after receiving all
 * DDP segments, or abort processing after encountering an error
 * case.
 *
 *   o SENDs + RRESPs need completion processing,
 *   o RREQs need READ RESPONSE initialization,
 *   o WRITEs need memory dereferencing.
 *
 * TODO: Failed WRITEs need local error to be surfaced.
 */
static int siw_rdmap_complete(struct siw_qp *qp, int error)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
	enum siw_wc_status wc_status = wqe->wc_status;
	u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
	int rv = 0;

	switch (opcode) {
	case RDMAP_SEND_SE:
	case RDMAP_SEND_SE_INVAL:
		wqe->rqe.flags |= SIW_WQE_SOLICITED;
		fallthrough;

	case RDMAP_SEND:
	case RDMAP_SEND_INVAL:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;

		if (error != 0 && wc_status == SIW_WC_SUCCESS)
			wc_status = SIW_WC_GENERAL_ERR;
		/*
		 * Handle STag invalidation request
		 */
		if (wc_status == SIW_WC_SUCCESS &&
		    (opcode == RDMAP_SEND_INVAL ||
		     opcode == RDMAP_SEND_SE_INVAL)) {
			rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
			if (rv) {
				siw_init_terminate(
					qp, TERM_ERROR_LAYER_RDMAP,
					rv == -EACCES ?
						RDMAP_ETYPE_REMOTE_PROTECTION :
						RDMAP_ETYPE_REMOTE_OPERATION,
					RDMAP_ECODE_CANNOT_INVALIDATE, 0);

				wc_status = SIW_WC_REM_INV_REQ_ERR;
			}
			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
					      rv ? 0 : srx->inval_stag,
					      wc_status);
		} else {
			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
					      0, wc_status);
		}
		siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
		break;

	case RDMAP_RDMA_READ_RESP:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		if (error != 0) {
			if ((srx->state == SIW_GET_HDR &&
			     qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
				/* possible RREQ in ORQ left untouched */
				break;

			if (wc_status == SIW_WC_SUCCESS)
				wc_status = SIW_WC_GENERAL_ERR;
		} else if (rdma_is_kernel_res(&qp->base_qp.res) &&
			   rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
			/*
			 * Handle any STag invalidation request
			 */
			rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
			if (rv) {
				siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
						   RDMAP_ETYPE_CATASTROPHIC,
						   RDMAP_ECODE_UNSPECIFIED, 0);

				if (wc_status == SIW_WC_SUCCESS) {
					wc_status = SIW_WC_GENERAL_ERR;
					error = rv;
				}
			}
		}
		/*
		 * All errors turn the WQE into a signalled one.
		 */
		if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
			rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
					      wc_status);
		siw_wqe_put_mem(wqe, SIW_OP_READ);

		if (!error) {
			rv = siw_check_tx_fence(qp);
		} else {
			/* Disable current ORQ element */
			if (qp->attrs.orq_size)
				WRITE_ONCE(orq_get_current(qp)->flags, 0);
		}
		break;

	case RDMAP_RDMA_READ_REQ:
		if (!error) {
			rv = siw_init_rresp(qp, srx);
			srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
		}
		break;

	case RDMAP_RDMA_WRITE:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		/*
		 * Free References from memory object if
		 * attached to receive context (inbound WRITE).
		 * While a zero-length WRITE is allowed,
		 * no memory reference got created.
		 */
		if (rx_mem(&qp->rx_tagged)) {
			siw_mem_put(rx_mem(&qp->rx_tagged));
			rx_mem(&qp->rx_tagged) = NULL;
		}
		break;

	default:
		break;
	}
	wqe->wr_status = SIW_WR_IDLE;

	return rv;
}

/*
 * siw_tcp_rx_data()
 *
 * Main routine to consume inbound TCP payload
 *
 * @rd_desc:	read descriptor
 * @skb:	socket buffer
 * @off:	offset in skb
 * @len:	skb->len - offset : payload in skb
 */
int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
		    unsigned int off, size_t len)
{
	struct siw_qp *qp = rd_desc->arg.data;
	struct siw_rx_stream *srx = &qp->rx_stream;
	int rv;

	srx->skb = skb;
	srx->skb_new = skb->len - off;
	srx->skb_offset = off;
	srx->skb_copied = 0;

	siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);

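	/*
	 * Run the receive state machine until the skb payload is
	 * consumed or processing must stop: header reception, data
	 * placement and trailer/CRC verification advance in turn,
	 * suspending on a fragmented FPDU until more data arrives.
	 */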
	while (srx->skb_new) {
		int run_completion = 1;

		if (unlikely(srx->rx_suspend)) {
			/* Do not process any more data */
			srx->skb_copied += srx->skb_new;
			break;
		}
		switch (srx->state) {
		case SIW_GET_HDR:
			rv = siw_get_hdr(srx);
			if (!rv) {
				srx->fpdu_part_rem =
					be16_to_cpu(srx->hdr.ctrl.mpa_len) -
					srx->fpdu_part_rcvd + MPA_HDR_SIZE;

				if (srx->fpdu_part_rem)
					srx->pad = -srx->fpdu_part_rem & 0x3;
				else
					srx->pad = 0;

				srx->state = SIW_GET_DATA_START;
				srx->fpdu_part_rcvd = 0;
			}
			break;

		case SIW_GET_DATA_MORE:
			/*
			 * Another data fragment of the same DDP segment.
			 * Setting first_ddp_seg = 0 avoids repeating
			 * initializations that shall occur only once per
			 * DDP segment.
			 */
			qp->rx_fpdu->first_ddp_seg = 0;
			fallthrough;

		case SIW_GET_DATA_START:
			/*
			 * Headers will be checked by the opcode-specific
			 * data receive function below.
			 */
			rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
			if (!rv) {
				int mpa_len =
					be16_to_cpu(srx->hdr.ctrl.mpa_len)
					+ MPA_HDR_SIZE;

				srx->fpdu_part_rem = (-mpa_len & 0x3)
						      + MPA_CRC_SIZE;
				srx->fpdu_part_rcvd = 0;
				srx->state = SIW_GET_TRAILER;
			} else {
				if (unlikely(rv == -ECONNRESET))
					run_completion = 0;
				else
					srx->state = SIW_GET_DATA_MORE;
			}
			break;

		case SIW_GET_TRAILER:
			/*
			 * read CRC + any padding
			 */
			rv = siw_get_trailer(qp, srx);
			if (likely(!rv)) {
				/*
				 * FPDU completed.
				 * complete RDMAP message if last fragment
				 */
				srx->state = SIW_GET_HDR;
				srx->fpdu_part_rcvd = 0;

				if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
				      DDP_FLAG_LAST))
					/* more frags */
					break;

				rv = siw_rdmap_complete(qp, 0);
				run_completion = 0;
			}
			break;

		default:
			pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
			rv = -EPROTO;
			run_completion = 0;
		}
		if (unlikely(rv != 0 && rv != -EAGAIN)) {
			if ((srx->state > SIW_GET_HDR ||
			     qp->rx_fpdu->more_ddp_segs) && run_completion)
				siw_rdmap_complete(qp, rv);

			siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
				   srx->state);

			siw_qp_cm_drop(qp, 1);

			break;
		}
		if (rv) {
			siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
				   srx->state, srx->fpdu_part_rem);
			break;
		}
	}
	return srx->skb_copied;
}