/*
 * Copyright(c) 2020 Cornelis Networks, Inc.
 * Copyright(c) 2015-2018 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <asm/page.h>
#include <linux/string.h>

#include "mmu_rb.h"
#include "user_exp_rcv.h"
#include "trace.h"

static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd);
static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages);
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode);
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq);
static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
				 const struct mmu_notifier_range *range,
				 unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
			    struct tid_group *grp,
			    unsigned int start, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo);
static void __clear_tid_node(struct hfi1_filedata *fd,
			     struct tid_rb_node *node);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);

static const struct mmu_interval_notifier_ops tid_mn_ops = {
	.invalidate = tid_rb_invalidate,
};
static const struct mmu_interval_notifier_ops tid_cover_ops = {
	.invalidate = tid_cover_invalidate,
};
/*
 * Initialize context and file private data needed for Expected
 * receive caching. This needs to be done after the context has
 * been configured with the eager/expected RcvEntry counts.
 */
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
			   struct hfi1_ctxtdata *uctxt)
{
	int ret = 0;

	fd->entry_to_rb = kcalloc(uctxt->expected_count,
				  sizeof(struct rb_node *),
				  GFP_KERNEL);
	if (!fd->entry_to_rb)
		return -ENOMEM;

	if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
		fd->invalid_tid_idx = 0;
		fd->invalid_tids = kcalloc(uctxt->expected_count,
					   sizeof(*fd->invalid_tids),
					   GFP_KERNEL);
		if (!fd->invalid_tids) {
			kfree(fd->entry_to_rb);
			fd->entry_to_rb = NULL;
			return -ENOMEM;
		}
		fd->use_mn = true;
	}

	/*
	 * PSM does not have a good way to separate, count, and
	 * effectively enforce a limit on RcvArray entries used by
	 * subctxts (when context sharing is used) when TID caching
	 * is enabled. To help with that, we calculate a per-process
	 * RcvArray entry share and enforce that.
	 * If TID caching is not in use, PSM deals with usage on its
	 * own. In that case, we allow any subctxt to take all of the
	 * entries.
	 *
	 * Make sure that we set the tid counts only after successful
	 * init.
	 */
	spin_lock(&fd->tid_lock);
	if (uctxt->subctxt_cnt && fd->use_mn) {
		u16 remainder;

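		/*
		 * Split expected_count evenly among the subcontexts and
		 * hand one extra entry to the lowest-numbered ones so
		 * the remainder is not lost.
		 */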
		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
		remainder = uctxt->expected_count % uctxt->subctxt_cnt;
		if (remainder && fd->subctxt < remainder)
			fd->tid_limit++;
	} else {
		fd->tid_limit = uctxt->expected_count;
	}
	spin_unlock(&fd->tid_lock);

	return ret;
}

void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	mutex_lock(&uctxt->exp_mutex);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
		unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
		unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
	mutex_unlock(&uctxt->exp_mutex);

	kfree(fd->invalid_tids);
	fd->invalid_tids = NULL;

	kfree(fd->entry_to_rb);
	fd->entry_to_rb = NULL;
}

/**
 * unpin_rcv_pages - release pinned receive buffer pages
 * @fd: per-file driver data
 * @tidbuf: unmapped buffer descriptor; consulted when @mapped is false
 * @node: mapped TID node; consulted when @mapped is true
 * @idx: index of the first page to unpin
 * @npages: number of pages to unpin
 * @mapped: true if the pages have been DMA mapped, false otherwise
 *
 * If the pages have been DMA mapped (indicated by @mapped), their info is
 * taken from the struct tid_rb_node; otherwise it is taken from the
 * struct tid_user_buf.
 */
static void unpin_rcv_pages(struct hfi1_filedata *fd,
			    struct tid_user_buf *tidbuf,
			    struct tid_rb_node *node,
			    unsigned int idx,
			    unsigned int npages,
			    bool mapped)
{
	struct page **pages;
	struct hfi1_devdata *dd = fd->uctxt->dd;
	struct mm_struct *mm;

	if (mapped) {
		pci_unmap_single(dd->pcidev, node->dma_addr,
				 node->npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
		pages = &node->pages[idx];
		mm = mm_from_tid_node(node);
	} else {
		pages = &tidbuf->pages[idx];
		mm = current->mm;
	}
	hfi1_release_user_pages(mm, pages, npages, mapped);
	fd->tid_n_pinned -= npages;
}

/**
 * pin_rcv_pages - pin the pages of the user's expected receive buffer
 * @fd: per-file driver data
 * @tidbuf: buffer descriptor holding the virtual address and page count
 *
 * Return: the number of pages pinned on success, 0 or a negative errno
 * otherwise.
 */
static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
{
	int pinned;
	unsigned int npages = tidbuf->npages;
	unsigned long vaddr = tidbuf->vaddr;
	struct page **pages = NULL;
	struct hfi1_devdata *dd = fd->uctxt->dd;

	if (npages > fd->uctxt->expected_count) {
		dd_dev_err(dd, "Expected buffer too big\n");
		return -EINVAL;
	}

	/* Allocate the array of struct page pointers needed for pinning */
	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/*
	 * Pin all the pages of the user buffer. If we can't pin all the
	 * pages, accept the amount pinned so far and program only that.
	 * User space knows how to deal with partially programmed buffers.
	 */
	if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
		kfree(pages);
		return -ENOMEM;
	}

	pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
	if (pinned <= 0) {
		kfree(pages);
		return pinned;
	}
	tidbuf->pages = pages;
	fd->tid_n_pinned += pinned;
	return pinned;
}

/*
 * RcvArray entry allocation for Expected Receives is done by the
 * following algorithm:
 *
 * The context keeps 3 lists of groups of RcvArray entries:
 *   1. List of empty groups - tid_group_list
 *      This list is created during user context creation and
 *      contains elements which describe sets (of 8) of empty
 *      RcvArray entries.
 *   2. List of partially used groups - tid_used_list
 *      This list contains sets of RcvArray entries which are
 *      not completely used up. Another mapping request could
 *      use some or all of the remaining entries.
 *   3. List of full groups - tid_full_list
 *      This is the list where sets that are completely used
 *      up go.
 *
 * An attempt to optimize the usage of RcvArray entries is
 * made by finding all sets of physically contiguous pages in a
 * user's buffer.
 * These physically contiguous sets are further split into
 * sizes supported by the receive engine of the HFI. The
 * resulting sets of pages are stored in struct tid_pageset,
 * which describes the sets as:
 *    * .count - number of pages in this set
 *    * .idx - starting index into struct page ** array
 *                    of this set
 *
 * From this point on, the algorithm deals with the page sets
 * described above. The number of pagesets is divided by the
 * RcvArray group size to produce the number of full groups
 * needed.
 *
 * Groups from the 3 lists are manipulated using the following
 * rules:
 *   1. For each set of 8 pagesets, a complete group from
 *      tid_group_list is taken, programmed, and moved to
 *      the tid_full_list.
 *   2. For all remaining pagesets:
 *      2.1 If the tid_used_list is empty and the tid_group_list
 *          is empty, stop processing pagesets and return only
 *          what has been programmed up to this point.
 *      2.2 If the tid_used_list is empty and the tid_group_list
 *          is not empty, move a group from tid_group_list to
 *          tid_used_list.
 *      2.3 For each group in tid_used_list, program as much as
 *          can fit into the group. If the group becomes fully
 *          used, move it to tid_full_list.
 */
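/*
 * For example (assuming a group size of 8), a buffer that breaks down
 * into 19 pagesets consumes two complete groups from tid_group_list,
 * with the remaining 3 pagesets placed into a group taken from (or
 * moved to) tid_used_list.
 */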
int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0, need_group = 0, pinned;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	unsigned int ngroups, pageidx = 0, pageset_count,
		tididx = 0, mapped, mapped_pages = 0;
	u32 *tidlist = NULL;
	struct tid_user_buf *tidbuf;
	unsigned long mmu_seq = 0;

	if (!PAGE_ALIGNED(tinfo->vaddr))
		return -EINVAL;
	if (tinfo->length == 0)
		return -EINVAL;

	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
	if (!tidbuf)
		return -ENOMEM;

	mutex_init(&tidbuf->cover_mutex);
	tidbuf->vaddr = tinfo->vaddr;
	tidbuf->length = tinfo->length;
	tidbuf->npages = num_user_pages(tidbuf->vaddr, tidbuf->length);
	tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
				GFP_KERNEL);
	if (!tidbuf->psets) {
		ret = -ENOMEM;
		goto fail_release_mem;
	}

	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&tidbuf->notifier, current->mm,
			tidbuf->vaddr, tidbuf->npages * PAGE_SIZE,
			&tid_cover_ops);
		if (ret)
			goto fail_release_mem;
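		/*
		 * Record the notifier sequence; an invalidation that
		 * races with the pinning and programming below is
		 * caught by mmu_interval_read_retry() before the TIDs
		 * are handed back to user space.
		 */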
		mmu_seq = mmu_interval_read_begin(&tidbuf->notifier);
	}

	pinned = pin_rcv_pages(fd, tidbuf);
	if (pinned <= 0) {
		ret = (pinned < 0) ? pinned : -ENOSPC;
		goto fail_unpin;
	}

	/* Find sets of physically contiguous pages */
	tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);

	/* Reserve the number of expected tids to be used. */
	spin_lock(&fd->tid_lock);
	if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
		pageset_count = fd->tid_limit - fd->tid_used;
	else
		pageset_count = tidbuf->n_psets;
	fd->tid_used += pageset_count;
	spin_unlock(&fd->tid_lock);

	if (!pageset_count) {
		ret = -ENOSPC;
		goto fail_unreserve;
	}

	ngroups = pageset_count / dd->rcv_entries.group_size;
	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
	if (!tidlist) {
		ret = -ENOMEM;
		goto fail_unreserve;
	}

	tididx = 0;

	/*
	 * From this point on, we are going to be using shared (between master
	 * and subcontexts) context resources. We need to take the lock.
	 */
	mutex_lock(&uctxt->exp_mutex);
	/*
	 * The first step is to program the RcvArray entries which are complete
	 * groups.
	 */
	while (ngroups && uctxt->tid_group_list.count) {
		struct tid_group *grp =
			tid_group_pop(&uctxt->tid_group_list);

		ret = program_rcvarray(fd, tidbuf, grp,
				       pageidx, dd->rcv_entries.group_size,
				       tidlist, &tididx, &mapped);
		/*
		 * If there was a failure to program the RcvArray
		 * entries for the entire group, reset the grp fields
		 * and add the grp back to the free group list.
		 */
		if (ret <= 0) {
			tid_group_add_tail(grp, &uctxt->tid_group_list);
			hfi1_cdbg(TID,
				  "Failed to program RcvArray group %d", ret);
			goto unlock;
		}

		tid_group_add_tail(grp, &uctxt->tid_full_list);
		ngroups--;
		pageidx += ret;
		mapped_pages += mapped;
	}

	while (pageidx < pageset_count) {
		struct tid_group *grp, *ptr;
		/*
		 * If we don't have any partially used tid groups, check
		 * if we have empty groups. If so, take one from there and
		 * put in the partially used list.
		 */
		if (!uctxt->tid_used_list.count || need_group) {
			if (!uctxt->tid_group_list.count)
				goto unlock;

			grp = tid_group_pop(&uctxt->tid_group_list);
			tid_group_add_tail(grp, &uctxt->tid_used_list);
			need_group = 0;
		}
		/*
		 * There is an optimization opportunity here - instead of
		 * fitting as many page sets as we can, check for a group
		 * later on in the list that could fit all of them.
		 */
		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
					 list) {
			unsigned use = min_t(unsigned, pageset_count - pageidx,
					     grp->size - grp->used);

			ret = program_rcvarray(fd, tidbuf, grp,
					       pageidx, use, tidlist,
					       &tididx, &mapped);
			if (ret < 0) {
				hfi1_cdbg(TID,
					  "Failed to program RcvArray entries %d",
					  ret);
				goto unlock;
			} else if (ret > 0) {
				if (grp->used == grp->size)
					tid_group_move(grp,
						       &uctxt->tid_used_list,
						       &uctxt->tid_full_list);
				pageidx += ret;
				mapped_pages += mapped;
				need_group = 0;
				/* Check if we are done so we break out early */
				if (pageidx >= pageset_count)
					break;
			} else if (WARN_ON(ret == 0)) {
				/*
				 * If ret is 0, we did not program any entries
				 * into this group, which can only happen if
				 * we've screwed up the accounting somewhere.
				 * Warn and try to continue.
				 */
				need_group = 1;
			}
		}
	}
unlock:
	mutex_unlock(&uctxt->exp_mutex);
	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
		  mapped_pages, ret);

	/* fail if nothing was programmed; report -ENOSPC if no error was set */
	if (tididx == 0) {
		if (ret >= 0)
			ret = -ENOSPC;
		goto fail_unreserve;
	}

	/* adjust reserved tid_used to actual count */
	spin_lock(&fd->tid_lock);
	fd->tid_used -= pageset_count - tididx;
	spin_unlock(&fd->tid_lock);

	/* unpin all pages not covered by a TID */
	unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages,
			false);

	if (fd->use_mn) {
		/* check for an invalidate during setup */
		bool fail = false;

		mutex_lock(&tidbuf->cover_mutex);
		fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq);
		mutex_unlock(&tidbuf->cover_mutex);

		if (fail) {
			ret = -EBUSY;
			goto fail_unprogram;
		}
	}

	tinfo->tidcnt = tididx;
	tinfo->length = mapped_pages * PAGE_SIZE;

	if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
			 tidlist, sizeof(tidlist[0]) * tididx)) {
		ret = -EFAULT;
		goto fail_unprogram;
	}

	if (fd->use_mn)
		mmu_interval_notifier_remove(&tidbuf->notifier);
	kfree(tidbuf->pages);
	kfree(tidbuf->psets);
	kfree(tidbuf);
	kfree(tidlist);
	return 0;

fail_unprogram:
	/* unprogram, unmap, and unpin all allocated TIDs */
	tinfo->tidlist = (unsigned long)tidlist;
	hfi1_user_exp_rcv_clear(fd, tinfo);
	tinfo->tidlist = 0;
	pinned = 0;		/* nothing left to unpin */
	pageset_count = 0;	/* nothing left reserved */
fail_unreserve:
	spin_lock(&fd->tid_lock);
	fd->tid_used -= pageset_count;
	spin_unlock(&fd->tid_lock);
fail_unpin:
	if (fd->use_mn)
		mmu_interval_notifier_remove(&tidbuf->notifier);
	if (pinned > 0)
		unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false);
fail_release_mem:
	kfree(tidbuf->pages);
	kfree(tidbuf->psets);
	kfree(tidbuf);
	kfree(tidlist);
	return ret;
}

int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	u32 *tidinfo;
	unsigned tididx;

	if (unlikely(tinfo->tidcnt > fd->tid_used))
		return -EINVAL;

	tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist),
			      sizeof(tidinfo[0]) * tinfo->tidcnt);
	if (IS_ERR(tidinfo))
		return PTR_ERR(tidinfo);

	mutex_lock(&uctxt->exp_mutex);
	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
		ret = unprogram_rcvarray(fd, tidinfo[tididx]);
		if (ret) {
			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
				  ret);
			break;
		}
	}
	spin_lock(&fd->tid_lock);
	fd->tid_used -= tididx;
	spin_unlock(&fd->tid_lock);
	tinfo->tidcnt = tididx;
	mutex_unlock(&uctxt->exp_mutex);

	kfree(tidinfo);
	return ret;
}

int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
			      struct hfi1_tid_info *tinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	unsigned long *ev = uctxt->dd->events +
		(uctxt_offset(uctxt) + fd->subctxt);
	u32 *array;
	int ret = 0;

	/*
	 * copy_to_user() can sleep.  Doing the copy while holding
	 * invalid_lock would block the MMU notifier on that lock for a
	 * long time, so copy the data into a local buffer and drop the
	 * lock before copying to user space.
	 */
	array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
	if (!array)
		return -EFAULT;

	spin_lock(&fd->invalid_lock);
	if (fd->invalid_tid_idx) {
		memcpy(array, fd->invalid_tids, sizeof(*array) *
		       fd->invalid_tid_idx);
		memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
		       fd->invalid_tid_idx);
		tinfo->tidcnt = fd->invalid_tid_idx;
		fd->invalid_tid_idx = 0;
		/*
		 * Reset the user flag while still holding the lock.
		 * Otherwise, PSM can miss events.
		 */
		clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
	} else {
		tinfo->tidcnt = 0;
	}
	spin_unlock(&fd->invalid_lock);

	if (tinfo->tidcnt) {
		if (copy_to_user((void __user *)tinfo->tidlist,
				 array, sizeof(*array) * tinfo->tidcnt))
			ret = -EFAULT;
	}
	kfree(array);

	return ret;
}

static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
{
	unsigned pagecount, pageidx, setcount = 0, i;
	unsigned long pfn, this_pfn;
	struct page **pages = tidbuf->pages;
	struct tid_pageset *list = tidbuf->psets;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
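	/*
	 * The loop below runs one extra iteration (i == npages) with a
	 * sentinel pfn of 0 so that the final contiguous run is flushed
	 * through the same splitting code as every other run.
	 */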
	pfn = page_to_pfn(pages[0]);
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;

		/*
		 * If the pfns are not sequential, the pages are not
		 * physically contiguous.
		 */
		if (this_pfn != ++pfn) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down
			 * into sizes supported by the HW.
			 * There are two main constraints:
			 *     1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *        If the total set size is bigger than that
			 *        program only a MAX_EXPECTED_BUFFER chunk.
			 *     2. The buffer size has to be a power of two. If
			 *        it is not, round down to the closest power
			 *        of 2 and program that size.
			 */
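			/*
			 * For example, a run of 13 contiguous pages
			 * (assuming it fits under MAX_EXPECTED_BUFFER)
			 * is split into sets of 8, 4, and 1 pages.
			 */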
			while (pagecount) {
				int maxpages = pagecount;
				u32 bufsize = pagecount * PAGE_SIZE;

				if (bufsize > MAX_EXPECTED_BUFFER)
					maxpages =
						MAX_EXPECTED_BUFFER >>
						PAGE_SHIFT;
				else if (!is_power_of_2(bufsize))
					maxpages =
						rounddown_pow_of_two(bufsize) >>
						PAGE_SHIFT;

				list[setcount].idx = pageidx;
				list[setcount].count = maxpages;
				pagecount -= maxpages;
				pageidx += maxpages;
				setcount++;
			}
			pageidx = i;
			pagecount = 1;
			pfn = this_pfn;
		} else {
			pagecount++;
		}
	}
	return setcount;
}

/**
 * program_rcvarray() - program an RcvArray group with receive buffers
 * @fd: filedata pointer
 * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
 *	  virtual address, buffer length, page pointers, pagesets (array of
 *	  struct tid_pageset holding information on physically contiguous
 *	  chunks from the user buffer), and other fields.
 * @grp: RcvArray group
 * @start: starting index into sets array
 * @count: number of struct tid_pageset's to program
 * @tidlist: the array of u32 elements where the information about the
 *           programmed RcvArray entries is to be encoded.
 * @tididx: starting offset into tidlist
 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 *           entries.
 *
 * This function will program up to 'count' number of RcvArray entries from the
 * group 'grp'. To make best use of write-combining writes, the function will
 * perform writes to the RcvArray entries it is not programming; these writes
 * are ignored by the HW. Each programmed RcvArray entry gets a physically
 * contiguous buffer chunk from the user's virtual buffer.
 *
 * Return:
 * -EINVAL if the requested count is larger than the size of the group,
 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 * number of RcvArray entries programmed.
 */
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
			    struct tid_group *grp,
			    unsigned int start, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	u16 idx;
	u32 tidinfo = 0, rcventry, useidx = 0;
	int mapped = 0;

	/* Count should never be larger than the group size */
	if (count > grp->size)
		return -EINVAL;

	/* Find the first unused entry in the group */
	for (idx = 0; idx < grp->size; idx++) {
		if (!(grp->map & (1 << idx))) {
			useidx = idx;
			break;
		}
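		/*
		 * Entries already in use still get a fill write, which
		 * the HW ignores, purely to keep the write-combining
		 * stream contiguous.
		 */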
		rcv_array_wc_fill(dd, grp->base + idx);
	}

	idx = 0;
	while (idx < count) {
		u16 npages, pageidx, setidx = start + idx;
		int ret = 0;

		/*
		 * If this entry in the group is used, move to the next one.
		 * If we go past the end of the group, exit the loop.
		 */
		if (useidx >= grp->size) {
			break;
		} else if (grp->map & (1 << useidx)) {
			rcv_array_wc_fill(dd, grp->base + useidx);
			useidx++;
			continue;
		}

		rcventry = grp->base + useidx;
		npages = tbuf->psets[setidx].count;
		pageidx = tbuf->psets[setidx].idx;

		ret = set_rcvarray_entry(fd, tbuf,
					 rcventry, grp, pageidx,
					 npages);
		if (ret)
			return ret;
		mapped += npages;

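		/*
		 * Encode the TID pair index, pair half, and page count
		 * into the u32 handed back to user space.
		 */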
		tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
			EXP_TID_SET(LEN, npages);
		tidlist[(*tididx)++] = tidinfo;
		grp->used++;
		grp->map |= 1 << useidx++;
		idx++;
	}

	/* Fill the rest of the group with "blank" writes */
	for (; useidx < grp->size; useidx++)
		rcv_array_wc_fill(dd, grp->base + useidx);
	*pmapped = mapped;
	return idx;
}

static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages)
{
	int ret;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct tid_rb_node *node;
	struct hfi1_devdata *dd = uctxt->dd;
	dma_addr_t phys;
	struct page **pages = tbuf->pages + pageidx;

	/*
	 * Allocate the node first so we can handle a potential
	 * failure before we've programmed anything.
	 */
	node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
		       GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	phys = pci_map_single(dd->pcidev,
			      __va(page_to_phys(pages[0])),
			      npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
			   phys);
		kfree(node);
		return -EFAULT;
	}

	node->fdata = fd;
	mutex_init(&node->invalidate_mutex);
	node->phys = page_to_phys(pages[0]);
	node->npages = npages;
	node->rcventry = rcventry;
	node->dma_addr = phys;
	node->grp = grp;
	node->freed = false;
	memcpy(node->pages, pages, sizeof(struct page *) * npages);

	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&node->notifier, current->mm,
			tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
			&tid_mn_ops);
		if (ret)
			goto out_unmap;
	}
	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;

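	/*
	 * The TID length is passed as ilog2(npages) + 1, i.e. a
	 * power-of-two page-count encoding where 1 means a single page.
	 */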
	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
			       node->notifier.interval_tree.start, node->phys,
			       phys);
	return 0;

out_unmap:
	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
		  node->rcventry, node->notifier.interval_tree.start,
		  node->phys, ret);
	pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
			 PCI_DMA_FROMDEVICE);
	kfree(node);
	return -EFAULT;
}

static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	struct tid_rb_node *node;
	u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
	u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;

	if (tididx >= uctxt->expected_count) {
		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
			   tididx, uctxt->ctxt);
		return -EINVAL;
	}

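	/*
	 * tidctrl is a two-bit mask selecting the half of the TID pair:
	 * 1 addresses the even entry, 2 the odd one.  A value of 3
	 * names both halves and cannot identify a single entry.
	 */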
	if (tidctrl == 0x3)
		return -EINVAL;

	rcventry = tididx + (tidctrl - 1);

	node = fd->entry_to_rb[rcventry];
	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
		return -EBADF;

	if (fd->use_mn)
		mmu_interval_notifier_remove(&node->notifier);
	cacheless_tid_rb_remove(fd, node);

	return 0;
}

static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;

	mutex_lock(&node->invalidate_mutex);
	if (node->freed)
		goto done;
	node->freed = true;

	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
				 node->npages,
				 node->notifier.interval_tree.start, node->phys,
				 node->dma_addr);

	/* Make sure device has seen the write before pages are unpinned */
	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);

	unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
done:
	mutex_unlock(&node->invalidate_mutex);
}

static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	__clear_tid_node(fd, node);

	node->grp->used--;
	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));

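	/*
	 * If the group was completely used before this release, it now
	 * belongs on the partially used list; if this was its last used
	 * entry, it goes back to the free list.
	 */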
	if (node->grp->used == node->grp->size - 1)
		tid_group_move(node->grp, &uctxt->tid_full_list,
			       &uctxt->tid_used_list);
	else if (!node->grp->used)
		tid_group_move(node->grp, &uctxt->tid_used_list,
			       &uctxt->tid_group_list);
	kfree(node);
}

/*
 * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 * clearing nodes in the non-cached case.
 */
static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd)
{
	struct tid_group *grp, *ptr;
	int i;

	list_for_each_entry_safe(grp, ptr, &set->list, list) {
		list_del_init(&grp->list);

		for (i = 0; i < grp->size; i++) {
			if (grp->map & (1 << i)) {
				u16 rcventry = grp->base + i;
				struct tid_rb_node *node;

				node = fd->entry_to_rb[rcventry -
							  uctxt->expected_base];
				if (!node || node->rcventry != rcventry)
					continue;

				if (fd->use_mn)
					mmu_interval_notifier_remove(
						&node->notifier);
				cacheless_tid_rb_remove(fd, node);
			}
		}
	}
}

static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq)
{
	struct tid_rb_node *node =
		container_of(mni, struct tid_rb_node, notifier);
	struct hfi1_filedata *fdata = node->fdata;
	struct hfi1_ctxtdata *uctxt = fdata->uctxt;

	if (node->freed)
		return true;

	/* take action only if unmapping */
	if (range->event != MMU_NOTIFY_UNMAP)
		return true;

	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
				 node->notifier.interval_tree.start,
				 node->rcventry, node->npages, node->dma_addr);

	/* clear the hardware rcvarray entry */
	__clear_tid_node(fdata, node);

	spin_lock(&fdata->invalid_lock);
	if (fdata->invalid_tid_idx < uctxt->expected_count) {
		fdata->invalid_tids[fdata->invalid_tid_idx] =
			rcventry2tidinfo(node->rcventry - uctxt->expected_base);
		fdata->invalid_tids[fdata->invalid_tid_idx] |=
			EXP_TID_SET(LEN, node->npages);
		if (!fdata->invalid_tid_idx) {
			unsigned long *ev;

			/*
			 * hfi1_set_uevent_bits() sets a user event flag
			 * for all processes. Because calling into the
			 * driver to process TID cache invalidations is
			 * expensive and TID cache invalidations are
			 * handled on a per-process basis, we can
			 * optimize this to set the flag only for the
			 * process in question.
			 */
			ev = uctxt->dd->events +
				(uctxt_offset(uctxt) + fdata->subctxt);
			set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
		}
		fdata->invalid_tid_idx++;
	}
	spin_unlock(&fdata->invalid_lock);
	return true;
}

static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
				 const struct mmu_notifier_range *range,
				 unsigned long cur_seq)
{
	struct tid_user_buf *tidbuf =
		container_of(mni, struct tid_user_buf, notifier);

	/* take action only if unmapping */
	if (range->event == MMU_NOTIFY_UNMAP) {
		mutex_lock(&tidbuf->cover_mutex);
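		/*
		 * Bump the notifier sequence so a concurrent
		 * hfi1_user_exp_rcv_setup() sees the invalidation via
		 * mmu_interval_read_retry() and backs out.
		 */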
		mmu_interval_set_seq(mni, cur_seq);
		mutex_unlock(&tidbuf->cover_mutex);
	}

	return true;
}

static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode)
{
	u32 base = fdata->uctxt->expected_base;

	fdata->entry_to_rb[tnode->rcventry - base] = NULL;
	clear_tid_node(fdata, tnode);
}
