/*
 * Copyright(c) 2020 Cornelis Networks, Inc.
 * Copyright(c) 2015-2018 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <asm/page.h>
#include <linux/string.h>

#include "mmu_rb.h"
#include "user_exp_rcv.h"
#include "trace.h"

static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd);
static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages);
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode);
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq);
static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
				 const struct mmu_notifier_range *range,
				 unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
			    struct tid_group *grp,
			    unsigned int start, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo);
static void __clear_tid_node(struct hfi1_filedata *fd,
			     struct tid_rb_node *node);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);

static const struct mmu_interval_notifier_ops tid_mn_ops = {
	.invalidate = tid_rb_invalidate,
};
static const struct mmu_interval_notifier_ops tid_cover_ops = {
	.invalidate = tid_cover_invalidate,
};

/*
 * Initialize context and file private data needed for Expected
 * receive caching. This needs to be done after the context has
 * been configured with the eager/expected RcvEntry counts.
 */
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
			   struct hfi1_ctxtdata *uctxt)
{
	int ret = 0;

	fd->entry_to_rb = kcalloc(uctxt->expected_count,
				  sizeof(struct rb_node *),
				  GFP_KERNEL);
	if (!fd->entry_to_rb)
		return -ENOMEM;

	if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
		fd->invalid_tid_idx = 0;
		fd->invalid_tids = kcalloc(uctxt->expected_count,
					   sizeof(*fd->invalid_tids),
					   GFP_KERNEL);
		if (!fd->invalid_tids) {
			kfree(fd->entry_to_rb);
			fd->entry_to_rb = NULL;
			return -ENOMEM;
		}
		fd->use_mn = true;
	}

	/*
	 * PSM does not have a good way to separate, count, and
	 * effectively enforce a limit on RcvArray entries used by
	 * subctxts (when context sharing is used) when TID caching
	 * is enabled. To help with that, we calculate a per-process
	 * RcvArray entry share and enforce that.
	 * If TID caching is not in use, PSM deals with usage on its
	 * own. In that case, we allow any subctxt to take all of the
	 * entries.
	 *
	 * Make sure that we set the tid counts only after successful
	 * init.
	 */
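	/*
	 * Worked example (editorial, illustrative only): with
	 * expected_count = 2048 and subctxt_cnt = 3, the division below
	 * gives a base share of 682 entries and a remainder of 2, so
	 * subctxts 0 and 1 get 683 entries each and subctxt 2 gets 682.
	 */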
	spin_lock(&fd->tid_lock);
	if (uctxt->subctxt_cnt && fd->use_mn) {
		u16 remainder;

		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
		remainder = uctxt->expected_count % uctxt->subctxt_cnt;
		if (remainder && fd->subctxt < remainder)
			fd->tid_limit++;
	} else {
		fd->tid_limit = uctxt->expected_count;
	}
	spin_unlock(&fd->tid_lock);

	return ret;
}

void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	mutex_lock(&uctxt->exp_mutex);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
		unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
		unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
	mutex_unlock(&uctxt->exp_mutex);

	kfree(fd->invalid_tids);
	fd->invalid_tids = NULL;

	kfree(fd->entry_to_rb);
	fd->entry_to_rb = NULL;
}

/**
 * unpin_rcv_pages() - release pinned receive buffer pages
 * @fd: filedata pointer
 * @tidbuf: unmapped page information (used when @mapped is false)
 * @node: mapped page information (used when @mapped is true)
 * @idx: index of the first page to unpin
 * @npages: number of pages to unpin
 * @mapped: true if the pages have been DMA mapped, false otherwise
 *
 * If the pages have been DMA mapped (indicated by the @mapped parameter),
 * their info will be passed via a struct tid_rb_node. If they haven't been
 * mapped, their info will be passed via a struct tid_user_buf.
 */
static void unpin_rcv_pages(struct hfi1_filedata *fd,
			    struct tid_user_buf *tidbuf,
			    struct tid_rb_node *node,
			    unsigned int idx,
			    unsigned int npages,
			    bool mapped)
{
	struct page **pages;
	struct hfi1_devdata *dd = fd->uctxt->dd;
	struct mm_struct *mm;

	if (mapped) {
		pci_unmap_single(dd->pcidev, node->dma_addr,
				 node->npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
		pages = &node->pages[idx];
		mm = mm_from_tid_node(node);
	} else {
		pages = &tidbuf->pages[idx];
		mm = current->mm;
	}
	hfi1_release_user_pages(mm, pages, npages, mapped);
	fd->tid_n_pinned -= npages;
}

/**
 * pin_rcv_pages() - pin receive buffer pages
 * @fd: filedata pointer
 * @tidbuf: expected receive buffer state (user virtual address, length,
 *          page count); on success, the pinned pages array is stored here
 */
static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
{
	int pinned;
	unsigned int npages = tidbuf->npages;
	unsigned long vaddr = tidbuf->vaddr;
	struct page **pages = NULL;
	struct hfi1_devdata *dd = fd->uctxt->dd;

	if (npages > fd->uctxt->expected_count) {
		dd_dev_err(dd, "Expected buffer too big\n");
		return -EINVAL;
	}

	/* Allocate the array of struct page pointers needed for pinning */
	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/*
	 * Pin all the pages of the user buffer. If we can't pin all the
	 * pages, accept the amount pinned so far and program only that.
	 * User space knows how to deal with partially programmed buffers.
	 */
	if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
		kfree(pages);
		return -ENOMEM;
	}

	pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
	if (pinned <= 0) {
		kfree(pages);
		return pinned;
	}
	tidbuf->pages = pages;
	fd->tid_n_pinned += pinned;
	return pinned;
}

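/*
 * Note: pin_rcv_pages() may legitimately pin fewer pages than requested;
 * the setup path below programs only what was actually pinned and user
 * space is expected to handle a partially programmed buffer.
 * fd->tid_n_pinned carries the running pin count that hfi1_can_pin_pages()
 * checks against the pinned-memory limit.
 */
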
/*
 * RcvArray entry allocation for Expected Receives is done by the
 * following algorithm:
 *
 * The context keeps 3 lists of groups of RcvArray entries:
 *   1. List of empty groups - tid_group_list
 *      This list is created during user context creation and
 *      contains elements which describe sets (of 8) of empty
 *      RcvArray entries.
 *   2. List of partially used groups - tid_used_list
 *      This list contains sets of RcvArray entries which are
 *      not completely used up. Another mapping request could
 *      use some or all of the remaining entries.
 *   3. List of full groups - tid_full_list
 *      This is the list where sets that are completely used
 *      up go.
 *
 * An attempt to optimize the usage of RcvArray entries is
 * made by finding all sets of physically contiguous pages in a
 * user's buffer.
 * These physically contiguous sets are further split into
 * sizes supported by the receive engine of the HFI. The
 * resulting sets of pages are stored in struct tid_pageset,
 * which describes the sets as:
 *    * .count - number of pages in this set
 *    * .idx - starting index into struct page ** array
 *             of this set
 *
 * From this point on, the algorithm deals with the page sets
 * described above. The number of pagesets is divided by the
 * RcvArray group size to produce the number of full groups
 * needed.
 *
 * Groups from the 3 lists are manipulated using the following
 * rules:
 *   1. For each set of 8 pagesets, a complete group from
 *      tid_group_list is taken, programmed, and moved to
 *      the tid_full_list list.
 *   2. For all remaining pagesets:
 *      2.1 If the tid_used_list is empty and the tid_group_list
 *          is empty, stop processing pagesets and return only
 *          what has been programmed up to this point.
 *      2.2 If the tid_used_list is empty and the tid_group_list
 *          is not empty, move a group from tid_group_list to
 *          tid_used_list.
 *      2.3 For each group in tid_used_list, program as much as
 *          can fit into the group. If the group becomes fully
 *          used, move it to tid_full_list.
 */
int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0, need_group = 0, pinned;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	unsigned int ngroups, pageidx = 0, pageset_count,
		tididx = 0, mapped, mapped_pages = 0;
	u32 *tidlist = NULL;
	struct tid_user_buf *tidbuf;
	unsigned long mmu_seq = 0;

	if (!PAGE_ALIGNED(tinfo->vaddr))
		return -EINVAL;
	if (tinfo->length == 0)
		return -EINVAL;

	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
	if (!tidbuf)
		return -ENOMEM;

	mutex_init(&tidbuf->cover_mutex);
	tidbuf->vaddr = tinfo->vaddr;
	tidbuf->length = tinfo->length;
	tidbuf->npages = num_user_pages(tidbuf->vaddr, tidbuf->length);
	tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
				GFP_KERNEL);
	if (!tidbuf->psets) {
		ret = -ENOMEM;
		goto fail_release_mem;
	}

	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&tidbuf->notifier, current->mm,
			tidbuf->vaddr, tidbuf->npages * PAGE_SIZE,
			&tid_cover_ops);
		if (ret)
			goto fail_release_mem;
		mmu_seq = mmu_interval_read_begin(&tidbuf->notifier);
	}

	pinned = pin_rcv_pages(fd, tidbuf);
	if (pinned <= 0) {
		ret = (pinned < 0) ? pinned : -ENOSPC;
		goto fail_unpin;
	}

	/* Find sets of physically contiguous pages */
	tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);

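	/*
	 * Worked example (editorial, illustrative only): if tid_limit is
	 * 683 and 680 entries are already in use, a request that produced
	 * 10 pagesets is trimmed to pageset_count = 3 by the reservation
	 * below; any surplus reservation is returned once the actual
	 * programmed count is known.
	 */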
	/* Reserve the number of expected tids to be used. */
	spin_lock(&fd->tid_lock);
	if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
		pageset_count = fd->tid_limit - fd->tid_used;
	else
		pageset_count = tidbuf->n_psets;
	fd->tid_used += pageset_count;
	spin_unlock(&fd->tid_lock);

	if (!pageset_count) {
		ret = -ENOSPC;
		goto fail_unreserve;
	}

	ngroups = pageset_count / dd->rcv_entries.group_size;
	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
	if (!tidlist) {
		ret = -ENOMEM;
		goto fail_unreserve;
	}

	tididx = 0;

	/*
	 * From this point on, we are going to be using shared (between master
	 * and subcontexts) context resources. We need to take the lock.
	 */
	mutex_lock(&uctxt->exp_mutex);
	/*
	 * The first step is to program the RcvArray entries which are complete
	 * groups.
	 */
	while (ngroups && uctxt->tid_group_list.count) {
		struct tid_group *grp =
			tid_group_pop(&uctxt->tid_group_list);

		ret = program_rcvarray(fd, tidbuf, grp,
				       pageidx, dd->rcv_entries.group_size,
				       tidlist, &tididx, &mapped);
		/*
		 * If there was a failure to program the RcvArray
		 * entries for the entire group, reset the grp fields
		 * and add the grp back to the free group list.
		 */
		if (ret <= 0) {
			tid_group_add_tail(grp, &uctxt->tid_group_list);
			hfi1_cdbg(TID,
				  "Failed to program RcvArray group %d", ret);
			goto unlock;
		}

		tid_group_add_tail(grp, &uctxt->tid_full_list);
		ngroups--;
		pageidx += ret;
		mapped_pages += mapped;
	}

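	/*
	 * Worked example (editorial, illustrative only): with a group
	 * size of 8 and pageset_count = 21, the loop above programs two
	 * complete groups (16 pagesets); the loop below places the
	 * remaining 5 pagesets into partially used groups, pulling a
	 * fresh group from tid_group_list whenever tid_used_list is
	 * empty.
	 */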
	while (pageidx < pageset_count) {
		struct tid_group *grp, *ptr;
		/*
		 * If we don't have any partially used tid groups, check
		 * if we have empty groups. If so, take one from there and
		 * put in the partially used list.
		 */
		if (!uctxt->tid_used_list.count || need_group) {
			if (!uctxt->tid_group_list.count)
				goto unlock;

			grp = tid_group_pop(&uctxt->tid_group_list);
			tid_group_add_tail(grp, &uctxt->tid_used_list);
			need_group = 0;
		}
		/*
		 * There is an optimization opportunity here - instead of
		 * fitting as many page sets as we can, check for a group
		 * later on in the list that could fit all of them.
		 */
		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
					 list) {
			unsigned use = min_t(unsigned, pageset_count - pageidx,
					     grp->size - grp->used);

			ret = program_rcvarray(fd, tidbuf, grp,
					       pageidx, use, tidlist,
					       &tididx, &mapped);
			if (ret < 0) {
				hfi1_cdbg(TID,
					  "Failed to program RcvArray entries %d",
					  ret);
				goto unlock;
			} else if (ret > 0) {
				if (grp->used == grp->size)
					tid_group_move(grp,
						       &uctxt->tid_used_list,
						       &uctxt->tid_full_list);
				pageidx += ret;
				mapped_pages += mapped;
				need_group = 0;
				/* Check if we are done so we break out early */
				if (pageidx >= pageset_count)
					break;
			} else if (WARN_ON(ret == 0)) {
				/*
				 * If ret is 0, we did not program any entries
				 * into this group, which can only happen if
				 * we've screwed up the accounting somewhere.
				 * Warn and try to continue.
				 */
				need_group = 1;
			}
		}
	}
unlock:
	mutex_unlock(&uctxt->exp_mutex);
	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
		  mapped_pages, ret);

	/* fail if nothing was programmed, set error if none provided */
	if (tididx == 0) {
		if (ret >= 0)
			ret = -ENOSPC;
		goto fail_unreserve;
	}

	/* adjust reserved tid_used to actual count */
	spin_lock(&fd->tid_lock);
	fd->tid_used -= pageset_count - tididx;
	spin_unlock(&fd->tid_lock);

	/* unpin all pages not covered by a TID */
	unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages,
			false);

	if (fd->use_mn) {
		/* check for an invalidate during setup */
		bool fail = false;

		mutex_lock(&tidbuf->cover_mutex);
		fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq);
		mutex_unlock(&tidbuf->cover_mutex);

		if (fail) {
			ret = -EBUSY;
			goto fail_unprogram;
		}
	}

	tinfo->tidcnt = tididx;
	tinfo->length = mapped_pages * PAGE_SIZE;

	if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
			 tidlist, sizeof(tidlist[0]) * tididx)) {
		ret = -EFAULT;
		goto fail_unprogram;
	}

	if (fd->use_mn)
		mmu_interval_notifier_remove(&tidbuf->notifier);
	kfree(tidbuf->pages);
	kfree(tidbuf->psets);
	kfree(tidbuf);
	kfree(tidlist);
	return 0;

fail_unprogram:
	/* unprogram, unmap, and unpin all allocated TIDs */
	tinfo->tidlist = (unsigned long)tidlist;
	hfi1_user_exp_rcv_clear(fd, tinfo);
	tinfo->tidlist = 0;
	pinned = 0;		/* nothing left to unpin */
	pageset_count = 0;	/* nothing left reserved */
fail_unreserve:
	spin_lock(&fd->tid_lock);
	fd->tid_used -= pageset_count;
	spin_unlock(&fd->tid_lock);
fail_unpin:
	if (fd->use_mn)
		mmu_interval_notifier_remove(&tidbuf->notifier);
	if (pinned > 0)
		unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false);
fail_release_mem:
	kfree(tidbuf->pages);
	kfree(tidbuf->psets);
	kfree(tidbuf);
	kfree(tidlist);
	return ret;
}

int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	u32 *tidinfo;
	unsigned tididx;

	if (unlikely(tinfo->tidcnt > fd->tid_used))
		return -EINVAL;

	tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist),
			      sizeof(tidinfo[0]) * tinfo->tidcnt);
	if (IS_ERR(tidinfo))
		return PTR_ERR(tidinfo);

	mutex_lock(&uctxt->exp_mutex);
	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
		ret = unprogram_rcvarray(fd, tidinfo[tididx]);
		if (ret) {
			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
				  ret);
			break;
		}
	}
	spin_lock(&fd->tid_lock);
	fd->tid_used -= tididx;
	spin_unlock(&fd->tid_lock);
	tinfo->tidcnt = tididx;
	mutex_unlock(&uctxt->exp_mutex);

	kfree(tidinfo);
	return ret;
}

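/*
 * Note: hfi1_user_exp_rcv_invalid() below is the read side of the TID
 * invalidation path. tid_rb_invalidate() (the MMU interval notifier
 * callback) queues invalidated TID info into fd->invalid_tids and sets
 * the TID_MMU_NOTIFY event bit; this handler snapshots that list under
 * invalid_lock, clears the event bit, and then copies the snapshot out
 * to user space.
 */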
int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
			      struct hfi1_tid_info *tinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	unsigned long *ev = uctxt->dd->events +
		(uctxt_offset(uctxt) + fd->subctxt);
	u32 *array;
	int ret = 0;

	/*
	 * copy_to_user() can sleep, which will leave the invalid_lock
	 * locked and cause the MMU notifier to be blocked on the lock
	 * for a long time.
	 * Copy the data to a local buffer so we can release the lock.
	 */
	array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
	if (!array)
		return -EFAULT;

	spin_lock(&fd->invalid_lock);
	if (fd->invalid_tid_idx) {
		memcpy(array, fd->invalid_tids, sizeof(*array) *
		       fd->invalid_tid_idx);
		memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
		       fd->invalid_tid_idx);
		tinfo->tidcnt = fd->invalid_tid_idx;
		fd->invalid_tid_idx = 0;
		/*
		 * Reset the user flag while still holding the lock.
		 * Otherwise, PSM can miss events.
		 */
		clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
	} else {
		tinfo->tidcnt = 0;
	}
	spin_unlock(&fd->invalid_lock);

	if (tinfo->tidcnt) {
		if (copy_to_user((void __user *)tinfo->tidlist,
				 array, sizeof(*array) * tinfo->tidcnt))
			ret = -EFAULT;
	}
	kfree(array);

	return ret;
}

static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
{
	unsigned pagecount, pageidx, setcount = 0, i;
	unsigned long pfn, this_pfn;
	struct page **pages = tidbuf->pages;
	struct tid_pageset *list = tidbuf->psets;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
	pfn = page_to_pfn(pages[0]);
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;

		/*
		 * If the pfn's are not sequential, pages are not physically
		 * contiguous.
		 */
		if (this_pfn != ++pfn) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down
			 * into sizes supported by the HW.
			 * There are two main constraints:
			 * 1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *    If the total set size is bigger than that
			 *    program only a MAX_EXPECTED_BUFFER chunk.
			 * 2. The buffer size has to be a power of two. If
			 *    it is not, round down to the closest power of
			 *    2 and program that size.
			 */
			while (pagecount) {
				int maxpages = pagecount;
				u32 bufsize = pagecount * PAGE_SIZE;

				if (bufsize > MAX_EXPECTED_BUFFER)
					maxpages =
						MAX_EXPECTED_BUFFER >>
						PAGE_SHIFT;
				else if (!is_power_of_2(bufsize))
					maxpages =
						rounddown_pow_of_two(bufsize) >>
						PAGE_SHIFT;

				list[setcount].idx = pageidx;
				list[setcount].count = maxpages;
				pagecount -= maxpages;
				pageidx += maxpages;
				setcount++;
			}
			pageidx = i;
			pagecount = 1;
			pfn = this_pfn;
		} else {
			pagecount++;
		}
	}
	return setcount;
}

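/*
 * Worked example for find_phys_blocks() (editorial, illustrative only,
 * assuming 4 KiB pages and a sufficiently large MAX_EXPECTED_BUFFER): a
 * physically contiguous run of 13 pages is emitted as pagesets of 8, 4
 * and 1 pages, since each pageset must be a power-of-two size no larger
 * than MAX_EXPECTED_BUFFER; a following non-contiguous page starts a
 * new run.
 */
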
/**
 * program_rcvarray() - program an RcvArray group with receive buffers
 * @fd: filedata pointer
 * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
 *        virtual address, buffer length, page pointers, pagesets (array of
 *        struct tid_pageset holding information on physically contiguous
 *        chunks from the user buffer), and other fields.
 * @grp: RcvArray group
 * @start: starting index into sets array
 * @count: number of struct tid_pageset's to program
 * @tidlist: the array of u32 elements where the information about the
 *           programmed RcvArray entries is to be encoded.
 * @tididx: starting offset into tidlist
 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 *           entries.
 *
 * This function will program up to 'count' number of RcvArray entries from the
 * group 'grp'. To make best use of write-combining writes, the function will
 * perform writes to the unused RcvArray entries which will be ignored by the
 * HW. Each RcvArray entry will be programmed with a physically contiguous
 * buffer chunk from the user's virtual buffer.
 *
 * Return:
 * -EINVAL if the requested count is larger than the size of the group,
 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 * number of RcvArray entries programmed.
 */
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
			    struct tid_group *grp,
			    unsigned int start, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	u16 idx;
	u32 tidinfo = 0, rcventry, useidx = 0;
	int mapped = 0;

	/* Count should never be larger than the group size */
	if (count > grp->size)
		return -EINVAL;

	/* Find the first unused entry in the group */
	for (idx = 0; idx < grp->size; idx++) {
		if (!(grp->map & (1 << idx))) {
			useidx = idx;
			break;
		}
		rcv_array_wc_fill(dd, grp->base + idx);
	}

	idx = 0;
	while (idx < count) {
		u16 npages, pageidx, setidx = start + idx;
		int ret = 0;

		/*
		 * If this entry in the group is used, move to the next one.
		 * If we go past the end of the group, exit the loop.
		 */
		if (useidx >= grp->size) {
			break;
		} else if (grp->map & (1 << useidx)) {
			rcv_array_wc_fill(dd, grp->base + useidx);
			useidx++;
			continue;
		}

		rcventry = grp->base + useidx;
		npages = tbuf->psets[setidx].count;
		pageidx = tbuf->psets[setidx].idx;

		ret = set_rcvarray_entry(fd, tbuf,
					 rcventry, grp, pageidx,
					 npages);
		if (ret)
			return ret;
		mapped += npages;

		tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
			EXP_TID_SET(LEN, npages);
		tidlist[(*tididx)++] = tidinfo;
		grp->used++;
		grp->map |= 1 << useidx++;
		idx++;
	}

	/* Fill the rest of the group with "blank" writes */
	for (; useidx < grp->size; useidx++)
		rcv_array_wc_fill(dd, grp->base + useidx);
	*pmapped = mapped;
	return idx;
}

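/*
 * Illustrative note (editorial): each u32 written to tidlist by
 * program_rcvarray() packs a control value (CTRL), an entry-pair index
 * (IDX) and a page count (LEN) via rcventry2tidinfo() and EXP_TID_SET().
 * unprogram_rcvarray() later reverses this encoding: for example,
 * CTRL = 2 with IDX = 5 selects RcvArray entry expected_base + 10 + (2 - 1),
 * i.e. the odd entry of pair 5, while CTRL = 3 is rejected as invalid.
 */
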
static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages)
{
	int ret;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct tid_rb_node *node;
	struct hfi1_devdata *dd = uctxt->dd;
	dma_addr_t phys;
	struct page **pages = tbuf->pages + pageidx;

	/*
	 * Allocate the node first so we can handle a potential
	 * failure before we've programmed anything.
	 */
	node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
		       GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	phys = pci_map_single(dd->pcidev,
			      __va(page_to_phys(pages[0])),
			      npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
			   phys);
		kfree(node);
		return -EFAULT;
	}

	node->fdata = fd;
	mutex_init(&node->invalidate_mutex);
	node->phys = page_to_phys(pages[0]);
	node->npages = npages;
	node->rcventry = rcventry;
	node->dma_addr = phys;
	node->grp = grp;
	node->freed = false;
	memcpy(node->pages, pages, sizeof(struct page *) * npages);

	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&node->notifier, current->mm,
			tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
			&tid_mn_ops);
		if (ret)
			goto out_unmap;
	}
	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;

	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
			       node->notifier.interval_tree.start, node->phys,
			       phys);
	return 0;

out_unmap:
	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
		  node->rcventry, node->notifier.interval_tree.start,
		  node->phys, ret);
	pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
			 PCI_DMA_FROMDEVICE);
	kfree(node);
	return -EFAULT;
}

static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	struct tid_rb_node *node;
	u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
	u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;

	if (tididx >= uctxt->expected_count) {
		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
			   tididx, uctxt->ctxt);
		return -EINVAL;
	}

	if (tidctrl == 0x3)
		return -EINVAL;

	rcventry = tididx + (tidctrl - 1);

	node = fd->entry_to_rb[rcventry];
	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
		return -EBADF;

	if (fd->use_mn)
		mmu_interval_notifier_remove(&node->notifier);
	cacheless_tid_rb_remove(fd, node);

	return 0;
}

static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;

	mutex_lock(&node->invalidate_mutex);
	if (node->freed)
		goto done;
	node->freed = true;

	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
				 node->npages,
				 node->notifier.interval_tree.start, node->phys,
				 node->dma_addr);

	/* Make sure device has seen the write before pages are unpinned */
	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);

	unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
done:
	mutex_unlock(&node->invalidate_mutex);
}

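/*
 * Note: clear_tid_node() below also maintains the group lists. Freeing an
 * entry from a previously full group (used == size - 1 after the
 * decrement) moves the group from tid_full_list back to tid_used_list;
 * freeing the last used entry of a group moves it from tid_used_list back
 * to tid_group_list.
 */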
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	__clear_tid_node(fd, node);

	node->grp->used--;
	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));

	if (node->grp->used == node->grp->size - 1)
		tid_group_move(node->grp, &uctxt->tid_full_list,
			       &uctxt->tid_used_list);
	else if (!node->grp->used)
		tid_group_move(node->grp, &uctxt->tid_used_list,
			       &uctxt->tid_group_list);
	kfree(node);
}

/*
 * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 * clearing nodes in the non-cached case.
 */
static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd)
{
	struct tid_group *grp, *ptr;
	int i;

	list_for_each_entry_safe(grp, ptr, &set->list, list) {
		list_del_init(&grp->list);

		for (i = 0; i < grp->size; i++) {
			if (grp->map & (1 << i)) {
				u16 rcventry = grp->base + i;
				struct tid_rb_node *node;

				node = fd->entry_to_rb[rcventry -
						       uctxt->expected_base];
				if (!node || node->rcventry != rcventry)
					continue;

				if (fd->use_mn)
					mmu_interval_notifier_remove(
						&node->notifier);
				cacheless_tid_rb_remove(fd, node);
			}
		}
	}
}

static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq)
{
	struct tid_rb_node *node =
		container_of(mni, struct tid_rb_node, notifier);
	struct hfi1_filedata *fdata = node->fdata;
	struct hfi1_ctxtdata *uctxt = fdata->uctxt;

	if (node->freed)
		return true;

	/* take action only if unmapping */
	if (range->event != MMU_NOTIFY_UNMAP)
		return true;

	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
				 node->notifier.interval_tree.start,
				 node->rcventry, node->npages, node->dma_addr);

	/* clear the hardware rcvarray entry */
	__clear_tid_node(fdata, node);

	spin_lock(&fdata->invalid_lock);
	if (fdata->invalid_tid_idx < uctxt->expected_count) {
		fdata->invalid_tids[fdata->invalid_tid_idx] =
			rcventry2tidinfo(node->rcventry - uctxt->expected_base);
		fdata->invalid_tids[fdata->invalid_tid_idx] |=
			EXP_TID_SET(LEN, node->npages);
		if (!fdata->invalid_tid_idx) {
			unsigned long *ev;

			/*
			 * hfi1_set_uevent_bits() sets a user event flag
			 * for all processes. Because calling into the
			 * driver to process TID cache invalidations is
			 * expensive and TID cache invalidations are
			 * handled on a per-process basis, we can
			 * optimize this to set the flag only for the
			 * process in question.
			 */
			ev = uctxt->dd->events +
				(uctxt_offset(uctxt) + fdata->subctxt);
			set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
		}
		fdata->invalid_tid_idx++;
	}
	spin_unlock(&fdata->invalid_lock);
	return true;
}

static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
				 const struct mmu_notifier_range *range,
				 unsigned long cur_seq)
{
	struct tid_user_buf *tidbuf =
		container_of(mni, struct tid_user_buf, notifier);

	/* take action only if unmapping */
	if (range->event == MMU_NOTIFY_UNMAP) {
		mutex_lock(&tidbuf->cover_mutex);
		mmu_interval_set_seq(mni, cur_seq);
		mutex_unlock(&tidbuf->cover_mutex);
	}

	return true;
}

static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode)
{
	u32 base = fdata->uctxt->expected_base;

	fdata->entry_to_rb[tnode->rcventry - base] = NULL;
	clear_tid_node(fdata, tnode);
}