1/* 2 * Copyright(c) 2020 - Cornelis Networks, Inc. 3 * Copyright(c) 2015 - 2018 Intel Corporation. 4 * 5 * This file is provided under a dual BSD/GPLv2 license. When using or 6 * redistributing this file, you may do so under either license. 7 * 8 * GPL LICENSE SUMMARY 9 * 10 * This program is free software; you can redistribute it and/or modify 11 * it under the terms of version 2 of the GNU General Public License as 12 * published by the Free Software Foundation. 13 * 14 * This program is distributed in the hope that it will be useful, but 15 * WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * General Public License for more details. 18 * 19 * BSD LICENSE 20 * 21 * Redistribution and use in source and binary forms, with or without 22 * modification, are permitted provided that the following conditions 23 * are met: 24 * 25 * - Redistributions of source code must retain the above copyright 26 * notice, this list of conditions and the following disclaimer. 27 * - Redistributions in binary form must reproduce the above copyright 28 * notice, this list of conditions and the following disclaimer in 29 * the documentation and/or other materials provided with the 30 * distribution. 31 * - Neither the name of Intel Corporation nor the names of its 32 * contributors may be used to endorse or promote products derived 33 * from this software without specific prior written permission. 34 * 35 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 36 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 37 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 38 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 39 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 41 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 42 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 43 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 44 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 45 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 46 * 47 */ 48#include <linux/mm.h> 49#include <linux/types.h> 50#include <linux/device.h> 51#include <linux/dmapool.h> 52#include <linux/slab.h> 53#include <linux/list.h> 54#include <linux/highmem.h> 55#include <linux/io.h> 56#include <linux/uio.h> 57#include <linux/rbtree.h> 58#include <linux/spinlock.h> 59#include <linux/delay.h> 60#include <linux/kthread.h> 61#include <linux/mmu_context.h> 62#include <linux/module.h> 63#include <linux/vmalloc.h> 64#include <linux/string.h> 65 66#include "hfi.h" 67#include "sdma.h" 68#include "user_sdma.h" 69#include "verbs.h" /* for the headers */ 70#include "common.h" /* for struct hfi1_tid_info */ 71#include "trace.h" 72 73static uint hfi1_sdma_comp_ring_size = 128; 74module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO); 75MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128"); 76 77static unsigned initial_pkt_count = 8; 78 79static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts); 80static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status); 81static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq); 82static void user_sdma_free_request(struct user_sdma_request *req); 83static int check_header_template(struct user_sdma_request *req, 84 struct hfi1_pkt_header *hdr, u32 lrhlen, 85 u32 datalen); 86static int set_txreq_header(struct user_sdma_request *req, 87 struct user_sdma_txreq *tx, u32 datalen); 88static int set_txreq_header_ahg(struct user_sdma_request *req, 89 struct user_sdma_txreq *tx, u32 len); 90static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, 91 struct hfi1_user_sdma_comp_q *cq, 92 u16 idx, enum hfi1_sdma_comp_state state, 93 int ret); 94static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags); 95static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); 96 97static int defer_packet_queue( 98 struct sdma_engine *sde, 99 struct iowait_work *wait, 100 struct sdma_txreq *txreq, 101 uint seq, 102 bool pkts_sent); 103static void activate_packet_queue(struct iowait *wait, int reason); 104static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, 105 unsigned long len); 106static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, 107 void *arg2, bool *stop); 108static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode); 109 110static struct mmu_rb_ops sdma_rb_ops = { 111 .filter = sdma_rb_filter, 112 .evict = sdma_rb_evict, 113 .remove = sdma_rb_remove, 114}; 115 116static int add_system_pages_to_sdma_packet(struct user_sdma_request *req, 117 struct user_sdma_txreq *tx, 118 struct user_sdma_iovec *iovec, 119 u32 *pkt_remaining); 120 121static int defer_packet_queue( 122 struct sdma_engine *sde, 123 struct iowait_work *wait, 124 struct sdma_txreq *txreq, 125 uint seq, 126 bool pkts_sent) 127{ 128 struct hfi1_user_sdma_pkt_q *pq = 129 container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy); 130 131 write_seqlock(&sde->waitlock); 132 trace_hfi1_usdma_defer(pq, sde, &pq->busy); 133 if (sdma_progress(sde, seq, txreq)) 134 goto eagain; 135 /* 136 * We are assuming that if the list is enqueued somewhere, it 137 * is to the dmawait list since that is the only place where 138 * it is supposed to be enqueued. 139 */ 140 xchg(&pq->state, SDMA_PKT_Q_DEFERRED); 141 if (list_empty(&pq->busy.list)) { 142 pq->busy.lock = &sde->waitlock; 143 iowait_get_priority(&pq->busy); 144 iowait_queue(pkts_sent, &pq->busy, &sde->dmawait); 145 } 146 write_sequnlock(&sde->waitlock); 147 return -EBUSY; 148eagain: 149 write_sequnlock(&sde->waitlock); 150 return -EAGAIN; 151} 152 153static void activate_packet_queue(struct iowait *wait, int reason) 154{ 155 struct hfi1_user_sdma_pkt_q *pq = 156 container_of(wait, struct hfi1_user_sdma_pkt_q, busy); 157 158 trace_hfi1_usdma_activate(pq, wait, reason); 159 xchg(&pq->state, SDMA_PKT_Q_ACTIVE); 160 wake_up(&wait->wait_dma); 161}; 162 163int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, 164 struct hfi1_filedata *fd) 165{ 166 int ret = -ENOMEM; 167 char buf[64]; 168 struct hfi1_devdata *dd; 169 struct hfi1_user_sdma_comp_q *cq; 170 struct hfi1_user_sdma_pkt_q *pq; 171 172 if (!uctxt || !fd) 173 return -EBADF; 174 175 if (!hfi1_sdma_comp_ring_size) 176 return -EINVAL; 177 178 dd = uctxt->dd; 179 180 pq = kzalloc(sizeof(*pq), GFP_KERNEL); 181 if (!pq) 182 return -ENOMEM; 183 pq->dd = dd; 184 pq->ctxt = uctxt->ctxt; 185 pq->subctxt = fd->subctxt; 186 pq->n_max_reqs = hfi1_sdma_comp_ring_size; 187 atomic_set(&pq->n_reqs, 0); 188 init_waitqueue_head(&pq->wait); 189 atomic_set(&pq->n_locked, 0); 190 191 iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue, 192 activate_packet_queue, NULL, NULL); 193 pq->reqidx = 0; 194 195 pq->reqs = kcalloc(hfi1_sdma_comp_ring_size, 196 sizeof(*pq->reqs), 197 GFP_KERNEL); 198 if (!pq->reqs) 199 goto pq_reqs_nomem; 200 201 pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL); 202 if (!pq->req_in_use) 203 goto pq_reqs_no_in_use; 204 205 snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt, 206 fd->subctxt); 207 pq->txreq_cache = kmem_cache_create(buf, 208 sizeof(struct user_sdma_txreq), 209 L1_CACHE_BYTES, 210 SLAB_HWCACHE_ALIGN, 211 NULL); 212 if (!pq->txreq_cache) { 213 dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n", 214 uctxt->ctxt); 215 goto pq_txreq_nomem; 216 } 217 218 cq = kzalloc(sizeof(*cq), GFP_KERNEL); 219 if (!cq) 220 goto cq_nomem; 221 222 cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps) 223 * hfi1_sdma_comp_ring_size)); 224 if (!cq->comps) 225 goto cq_comps_nomem; 226 227 cq->nentries = hfi1_sdma_comp_ring_size; 228 229 ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq, 230 &pq->handler); 231 if (ret) { 232 dd_dev_err(dd, "Failed to register with MMU %d", ret); 233 goto pq_mmu_fail; 234 } 235 236 rcu_assign_pointer(fd->pq, pq); 237 fd->cq = cq; 238 239 return 0; 240 241pq_mmu_fail: 242 vfree(cq->comps); 243cq_comps_nomem: 244 kfree(cq); 245cq_nomem: 246 kmem_cache_destroy(pq->txreq_cache); 247pq_txreq_nomem: 248 bitmap_free(pq->req_in_use); 249pq_reqs_no_in_use: 250 kfree(pq->reqs); 251pq_reqs_nomem: 252 kfree(pq); 253 254 return ret; 255} 256 257static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq) 258{ 259 unsigned long flags; 260 seqlock_t *lock = pq->busy.lock; 261 262 if (!lock) 263 return; 264 write_seqlock_irqsave(lock, flags); 265 if (!list_empty(&pq->busy.list)) { 266 list_del_init(&pq->busy.list); 267 pq->busy.lock = NULL; 268 } 269 write_sequnlock_irqrestore(lock, flags); 270} 271 272int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, 273 struct hfi1_ctxtdata *uctxt) 274{ 275 struct hfi1_user_sdma_pkt_q *pq; 276 277 trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt); 278 279 spin_lock(&fd->pq_rcu_lock); 280 pq = srcu_dereference_check(fd->pq, &fd->pq_srcu, 281 lockdep_is_held(&fd->pq_rcu_lock)); 282 if (pq) { 283 rcu_assign_pointer(fd->pq, NULL); 284 spin_unlock(&fd->pq_rcu_lock); 285 synchronize_srcu(&fd->pq_srcu); 286 /* at this point there can be no more new requests */ 287 iowait_sdma_drain(&pq->busy); 288 /* Wait until all requests have been freed. */ 289 wait_event_interruptible( 290 pq->wait, 291 !atomic_read(&pq->n_reqs)); 292 kfree(pq->reqs); 293 if (pq->handler) 294 hfi1_mmu_rb_unregister(pq->handler); 295 bitmap_free(pq->req_in_use); 296 kmem_cache_destroy(pq->txreq_cache); 297 flush_pq_iowait(pq); 298 kfree(pq); 299 } else { 300 spin_unlock(&fd->pq_rcu_lock); 301 } 302 if (fd->cq) { 303 vfree(fd->cq->comps); 304 kfree(fd->cq); 305 fd->cq = NULL; 306 } 307 return 0; 308} 309 310static u8 dlid_to_selector(u16 dlid) 311{ 312 static u8 mapping[256]; 313 static int initialized; 314 static u8 next; 315 int hash; 316 317 if (!initialized) { 318 memset(mapping, 0xFF, 256); 319 initialized = 1; 320 } 321 322 hash = ((dlid >> 8) ^ dlid) & 0xFF; 323 if (mapping[hash] == 0xFF) { 324 mapping[hash] = next; 325 next = (next + 1) & 0x7F; 326 } 327 328 return mapping[hash]; 329} 330 331/** 332 * hfi1_user_sdma_process_request() - Process and start a user sdma request 333 * @fd: valid file descriptor 334 * @iovec: array of io vectors to process 335 * @dim: overall iovec array size 336 * @count: number of io vector array entries processed 337 */ 338int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, 339 struct iovec *iovec, unsigned long dim, 340 unsigned long *count) 341{ 342 int ret = 0, i; 343 struct hfi1_ctxtdata *uctxt = fd->uctxt; 344 struct hfi1_user_sdma_pkt_q *pq = 345 srcu_dereference(fd->pq, &fd->pq_srcu); 346 struct hfi1_user_sdma_comp_q *cq = fd->cq; 347 struct hfi1_devdata *dd = pq->dd; 348 unsigned long idx = 0; 349 u8 pcount = initial_pkt_count; 350 struct sdma_req_info info; 351 struct user_sdma_request *req; 352 u8 opcode, sc, vl; 353 u16 pkey; 354 u32 slid; 355 u16 dlid; 356 u32 selector; 357 358 if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) { 359 hfi1_cdbg( 360 SDMA, 361 "[%u:%u:%u] First vector not big enough for header %lu/%lu", 362 dd->unit, uctxt->ctxt, fd->subctxt, 363 iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr)); 364 return -EINVAL; 365 } 366 ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info)); 367 if (ret) { 368 hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)", 369 dd->unit, uctxt->ctxt, fd->subctxt, ret); 370 return -EFAULT; 371 } 372 373 trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt, 374 (u16 *)&info); 375 if (info.comp_idx >= hfi1_sdma_comp_ring_size) { 376 hfi1_cdbg(SDMA, 377 "[%u:%u:%u:%u] Invalid comp index", 378 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); 379 return -EINVAL; 380 } 381 382 /* 383 * Sanity check the header io vector count. Need at least 1 vector 384 * (header) and cannot be larger than the actual io vector count. 385 */ 386 if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) { 387 hfi1_cdbg(SDMA, 388 "[%u:%u:%u:%u] Invalid iov count %d, dim %ld", 389 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx, 390 req_iovcnt(info.ctrl), dim); 391 return -EINVAL; 392 } 393 394 if (!info.fragsize) { 395 hfi1_cdbg(SDMA, 396 "[%u:%u:%u:%u] Request does not specify fragsize", 397 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); 398 return -EINVAL; 399 } 400 401 /* Try to claim the request. */ 402 if (test_and_set_bit(info.comp_idx, pq->req_in_use)) { 403 hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use", 404 dd->unit, uctxt->ctxt, fd->subctxt, 405 info.comp_idx); 406 return -EBADSLT; 407 } 408 /* 409 * All safety checks have been done and this request has been claimed. 410 */ 411 trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt, 412 info.comp_idx); 413 req = pq->reqs + info.comp_idx; 414 req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */ 415 req->data_len = 0; 416 req->pq = pq; 417 req->cq = cq; 418 req->ahg_idx = -1; 419 req->iov_idx = 0; 420 req->sent = 0; 421 req->seqnum = 0; 422 req->seqcomp = 0; 423 req->seqsubmitted = 0; 424 req->tids = NULL; 425 req->has_error = 0; 426 INIT_LIST_HEAD(&req->txps); 427 428 memcpy(&req->info, &info, sizeof(info)); 429 430 /* The request is initialized, count it */ 431 atomic_inc(&pq->n_reqs); 432 433 if (req_opcode(info.ctrl) == EXPECTED) { 434 /* expected must have a TID info and at least one data vector */ 435 if (req->data_iovs < 2) { 436 SDMA_DBG(req, 437 "Not enough vectors for expected request"); 438 ret = -EINVAL; 439 goto free_req; 440 } 441 req->data_iovs--; 442 } 443 444 if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) { 445 SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs, 446 MAX_VECTORS_PER_REQ); 447 ret = -EINVAL; 448 goto free_req; 449 } 450 451 /* Copy the header from the user buffer */ 452 ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info), 453 sizeof(req->hdr)); 454 if (ret) { 455 SDMA_DBG(req, "Failed to copy header template (%d)", ret); 456 ret = -EFAULT; 457 goto free_req; 458 } 459 460 /* If Static rate control is not enabled, sanitize the header. */ 461 if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL)) 462 req->hdr.pbc[2] = 0; 463 464 /* Validate the opcode. Do not trust packets from user space blindly. */ 465 opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff; 466 if ((opcode & USER_OPCODE_CHECK_MASK) != 467 USER_OPCODE_CHECK_VAL) { 468 SDMA_DBG(req, "Invalid opcode (%d)", opcode); 469 ret = -EINVAL; 470 goto free_req; 471 } 472 /* 473 * Validate the vl. Do not trust packets from user space blindly. 474 * VL comes from PBC, SC comes from LRH, and the VL needs to 475 * match the SC look up. 476 */ 477 vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF; 478 sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) | 479 (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4)); 480 if (vl >= dd->pport->vls_operational || 481 vl != sc_to_vlt(dd, sc)) { 482 SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl); 483 ret = -EINVAL; 484 goto free_req; 485 } 486 487 /* Checking P_KEY for requests from user-space */ 488 pkey = (u16)be32_to_cpu(req->hdr.bth[0]); 489 slid = be16_to_cpu(req->hdr.lrh[3]); 490 if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) { 491 ret = -EINVAL; 492 goto free_req; 493 } 494 495 /* 496 * Also should check the BTH.lnh. If it says the next header is GRH then 497 * the RXE parsing will be off and will land in the middle of the KDETH 498 * or miss it entirely. 499 */ 500 if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) { 501 SDMA_DBG(req, "User tried to pass in a GRH"); 502 ret = -EINVAL; 503 goto free_req; 504 } 505 506 req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]); 507 /* 508 * Calculate the initial TID offset based on the values of 509 * KDETH.OFFSET and KDETH.OM that are passed in. 510 */ 511 req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) * 512 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? 513 KDETH_OM_LARGE : KDETH_OM_SMALL); 514 trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt, 515 info.comp_idx, req->tidoffset); 516 idx++; 517 518 /* Save all the IO vector structures */ 519 for (i = 0; i < req->data_iovs; i++) { 520 req->iovs[i].offset = 0; 521 INIT_LIST_HEAD(&req->iovs[i].list); 522 memcpy(&req->iovs[i].iov, 523 iovec + idx++, 524 sizeof(req->iovs[i].iov)); 525 if (req->iovs[i].iov.iov_len == 0) { 526 ret = -EINVAL; 527 goto free_req; 528 } 529 req->data_len += req->iovs[i].iov.iov_len; 530 } 531 trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt, 532 info.comp_idx, req->data_len); 533 if (pcount > req->info.npkts) 534 pcount = req->info.npkts; 535 /* 536 * Copy any TID info 537 * User space will provide the TID info only when the 538 * request type is EXPECTED. This is true even if there is 539 * only one packet in the request and the header is already 540 * setup. The reason for the singular TID case is that the 541 * driver needs to perform safety checks. 542 */ 543 if (req_opcode(req->info.ctrl) == EXPECTED) { 544 u16 ntids = iovec[idx].iov_len / sizeof(*req->tids); 545 u32 *tmp; 546 547 if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) { 548 ret = -EINVAL; 549 goto free_req; 550 } 551 552 /* 553 * We have to copy all of the tids because they may vary 554 * in size and, therefore, the TID count might not be 555 * equal to the pkt count. However, there is no way to 556 * tell at this point. 557 */ 558 tmp = memdup_user(iovec[idx].iov_base, 559 ntids * sizeof(*req->tids)); 560 if (IS_ERR(tmp)) { 561 ret = PTR_ERR(tmp); 562 SDMA_DBG(req, "Failed to copy %d TIDs (%d)", 563 ntids, ret); 564 goto free_req; 565 } 566 req->tids = tmp; 567 req->n_tids = ntids; 568 req->tididx = 0; 569 idx++; 570 } 571 572 dlid = be16_to_cpu(req->hdr.lrh[1]); 573 selector = dlid_to_selector(dlid); 574 selector += uctxt->ctxt + fd->subctxt; 575 req->sde = sdma_select_user_engine(dd, selector, vl); 576 577 if (!req->sde || !sdma_running(req->sde)) { 578 ret = -ECOMM; 579 goto free_req; 580 } 581 582 /* We don't need an AHG entry if the request contains only one packet */ 583 if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) 584 req->ahg_idx = sdma_ahg_alloc(req->sde); 585 586 set_comp_state(pq, cq, info.comp_idx, QUEUED, 0); 587 pq->state = SDMA_PKT_Q_ACTIVE; 588 589 /* 590 * This is a somewhat blocking send implementation. 591 * The driver will block the caller until all packets of the 592 * request have been submitted to the SDMA engine. However, it 593 * will not wait for send completions. 594 */ 595 while (req->seqsubmitted != req->info.npkts) { 596 ret = user_sdma_send_pkts(req, pcount); 597 if (ret < 0) { 598 int we_ret; 599 600 if (ret != -EBUSY) 601 goto free_req; 602 we_ret = wait_event_interruptible_timeout( 603 pq->busy.wait_dma, 604 pq->state == SDMA_PKT_Q_ACTIVE, 605 msecs_to_jiffies( 606 SDMA_IOWAIT_TIMEOUT)); 607 trace_hfi1_usdma_we(pq, we_ret); 608 if (we_ret <= 0) 609 flush_pq_iowait(pq); 610 } 611 } 612 *count += idx; 613 return 0; 614free_req: 615 /* 616 * If the submitted seqsubmitted == npkts, the completion routine 617 * controls the final state. If sequbmitted < npkts, wait for any 618 * outstanding packets to finish before cleaning up. 619 */ 620 if (req->seqsubmitted < req->info.npkts) { 621 if (req->seqsubmitted) 622 wait_event(pq->busy.wait_dma, 623 (req->seqcomp == req->seqsubmitted - 1)); 624 user_sdma_free_request(req); 625 pq_update(pq); 626 set_comp_state(pq, cq, info.comp_idx, ERROR, ret); 627 } 628 return ret; 629} 630 631static inline u32 compute_data_length(struct user_sdma_request *req, 632 struct user_sdma_txreq *tx) 633{ 634 /* 635 * Determine the proper size of the packet data. 636 * The size of the data of the first packet is in the header 637 * template. However, it includes the header and ICRC, which need 638 * to be subtracted. 639 * The minimum representable packet data length in a header is 4 bytes, 640 * therefore, when the data length request is less than 4 bytes, there's 641 * only one packet, and the packet data length is equal to that of the 642 * request data length. 643 * The size of the remaining packets is the minimum of the frag 644 * size (MTU) or remaining data in the request. 645 */ 646 u32 len; 647 648 if (!req->seqnum) { 649 if (req->data_len < sizeof(u32)) 650 len = req->data_len; 651 else 652 len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - 653 (sizeof(tx->hdr) - 4)); 654 } else if (req_opcode(req->info.ctrl) == EXPECTED) { 655 u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) * 656 PAGE_SIZE; 657 /* 658 * Get the data length based on the remaining space in the 659 * TID pair. 660 */ 661 len = min(tidlen - req->tidoffset, (u32)req->info.fragsize); 662 /* If we've filled up the TID pair, move to the next one. */ 663 if (unlikely(!len) && ++req->tididx < req->n_tids && 664 req->tids[req->tididx]) { 665 tidlen = EXP_TID_GET(req->tids[req->tididx], 666 LEN) * PAGE_SIZE; 667 req->tidoffset = 0; 668 len = min_t(u32, tidlen, req->info.fragsize); 669 } 670 /* 671 * Since the TID pairs map entire pages, make sure that we 672 * are not going to try to send more data that we have 673 * remaining. 674 */ 675 len = min(len, req->data_len - req->sent); 676 } else { 677 len = min(req->data_len - req->sent, (u32)req->info.fragsize); 678 } 679 trace_hfi1_sdma_user_compute_length(req->pq->dd, 680 req->pq->ctxt, 681 req->pq->subctxt, 682 req->info.comp_idx, 683 len); 684 return len; 685} 686 687static inline u32 pad_len(u32 len) 688{ 689 if (len & (sizeof(u32) - 1)) 690 len += sizeof(u32) - (len & (sizeof(u32) - 1)); 691 return len; 692} 693 694static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) 695{ 696 /* (Size of complete header - size of PBC) + 4B ICRC + data length */ 697 return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len); 698} 699 700static int user_sdma_txadd_ahg(struct user_sdma_request *req, 701 struct user_sdma_txreq *tx, 702 u32 datalen) 703{ 704 int ret; 705 u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); 706 u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen)); 707 struct hfi1_user_sdma_pkt_q *pq = req->pq; 708 709 /* 710 * Copy the request header into the tx header 711 * because the HW needs a cacheline-aligned 712 * address. 713 * This copy can be optimized out if the hdr 714 * member of user_sdma_request were also 715 * cacheline aligned. 716 */ 717 memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr)); 718 if (PBC2LRH(pbclen) != lrhlen) { 719 pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); 720 tx->hdr.pbc[0] = cpu_to_le16(pbclen); 721 } 722 ret = check_header_template(req, &tx->hdr, lrhlen, datalen); 723 if (ret) 724 return ret; 725 ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY, 726 sizeof(tx->hdr) + datalen, req->ahg_idx, 727 0, NULL, 0, user_sdma_txreq_cb); 728 if (ret) 729 return ret; 730 ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr)); 731 if (ret) 732 sdma_txclean(pq->dd, &tx->txreq); 733 return ret; 734} 735 736static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts) 737{ 738 int ret = 0; 739 u16 count; 740 unsigned npkts = 0; 741 struct user_sdma_txreq *tx = NULL; 742 struct hfi1_user_sdma_pkt_q *pq = NULL; 743 struct user_sdma_iovec *iovec = NULL; 744 745 if (!req->pq) 746 return -EINVAL; 747 748 pq = req->pq; 749 750 /* If tx completion has reported an error, we are done. */ 751 if (READ_ONCE(req->has_error)) 752 return -EFAULT; 753 754 /* 755 * Check if we might have sent the entire request already 756 */ 757 if (unlikely(req->seqnum == req->info.npkts)) { 758 if (!list_empty(&req->txps)) 759 goto dosend; 760 return ret; 761 } 762 763 if (!maxpkts || maxpkts > req->info.npkts - req->seqnum) 764 maxpkts = req->info.npkts - req->seqnum; 765 766 while (npkts < maxpkts) { 767 u32 datalen = 0; 768 769 /* 770 * Check whether any of the completions have come back 771 * with errors. If so, we are not going to process any 772 * more packets from this request. 773 */ 774 if (READ_ONCE(req->has_error)) 775 return -EFAULT; 776 777 tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL); 778 if (!tx) 779 return -ENOMEM; 780 781 tx->flags = 0; 782 tx->req = req; 783 INIT_LIST_HEAD(&tx->list); 784 785 /* 786 * For the last packet set the ACK request 787 * and disable header suppression. 788 */ 789 if (req->seqnum == req->info.npkts - 1) 790 tx->flags |= (TXREQ_FLAGS_REQ_ACK | 791 TXREQ_FLAGS_REQ_DISABLE_SH); 792 793 /* 794 * Calculate the payload size - this is min of the fragment 795 * (MTU) size or the remaining bytes in the request but only 796 * if we have payload data. 797 */ 798 if (req->data_len) { 799 iovec = &req->iovs[req->iov_idx]; 800 if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) { 801 if (++req->iov_idx == req->data_iovs) { 802 ret = -EFAULT; 803 goto free_tx; 804 } 805 iovec = &req->iovs[req->iov_idx]; 806 WARN_ON(iovec->offset); 807 } 808 809 datalen = compute_data_length(req, tx); 810 811 /* 812 * Disable header suppression for the payload <= 8DWS. 813 * If there is an uncorrectable error in the receive 814 * data FIFO when the received payload size is less than 815 * or equal to 8DWS then the RxDmaDataFifoRdUncErr is 816 * not reported.There is set RHF.EccErr if the header 817 * is not suppressed. 818 */ 819 if (!datalen) { 820 SDMA_DBG(req, 821 "Request has data but pkt len is 0"); 822 ret = -EFAULT; 823 goto free_tx; 824 } else if (datalen <= 32) { 825 tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH; 826 } 827 } 828 829 if (req->ahg_idx >= 0) { 830 if (!req->seqnum) { 831 ret = user_sdma_txadd_ahg(req, tx, datalen); 832 if (ret) 833 goto free_tx; 834 } else { 835 int changes; 836 837 changes = set_txreq_header_ahg(req, tx, 838 datalen); 839 if (changes < 0) { 840 ret = changes; 841 goto free_tx; 842 } 843 } 844 } else { 845 ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) + 846 datalen, user_sdma_txreq_cb); 847 if (ret) 848 goto free_tx; 849 /* 850 * Modify the header for this packet. This only needs 851 * to be done if we are not going to use AHG. Otherwise, 852 * the HW will do it based on the changes we gave it 853 * during sdma_txinit_ahg(). 854 */ 855 ret = set_txreq_header(req, tx, datalen); 856 if (ret) 857 goto free_txreq; 858 } 859 860 req->koffset += datalen; 861 if (req_opcode(req->info.ctrl) == EXPECTED) 862 req->tidoffset += datalen; 863 req->sent += datalen; 864 while (datalen) { 865 ret = add_system_pages_to_sdma_packet(req, tx, iovec, 866 &datalen); 867 if (ret) 868 goto free_txreq; 869 iovec = &req->iovs[req->iov_idx]; 870 } 871 list_add_tail(&tx->txreq.list, &req->txps); 872 /* 873 * It is important to increment this here as it is used to 874 * generate the BTH.PSN and, therefore, can't be bulk-updated 875 * outside of the loop. 876 */ 877 tx->seqnum = req->seqnum++; 878 npkts++; 879 } 880dosend: 881 ret = sdma_send_txlist(req->sde, 882 iowait_get_ib_work(&pq->busy), 883 &req->txps, &count); 884 req->seqsubmitted += count; 885 if (req->seqsubmitted == req->info.npkts) { 886 /* 887 * The txreq has already been submitted to the HW queue 888 * so we can free the AHG entry now. Corruption will not 889 * happen due to the sequential manner in which 890 * descriptors are processed. 891 */ 892 if (req->ahg_idx >= 0) 893 sdma_ahg_free(req->sde, req->ahg_idx); 894 } 895 return ret; 896 897free_txreq: 898 sdma_txclean(pq->dd, &tx->txreq); 899free_tx: 900 kmem_cache_free(pq->txreq_cache, tx); 901 return ret; 902} 903 904static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) 905{ 906 struct evict_data evict_data; 907 struct mmu_rb_handler *handler = pq->handler; 908 909 evict_data.cleared = 0; 910 evict_data.target = npages; 911 hfi1_mmu_rb_evict(handler, &evict_data); 912 return evict_data.cleared; 913} 914 915static int check_header_template(struct user_sdma_request *req, 916 struct hfi1_pkt_header *hdr, u32 lrhlen, 917 u32 datalen) 918{ 919 /* 920 * Perform safety checks for any type of packet: 921 * - transfer size is multiple of 64bytes 922 * - packet length is multiple of 4 bytes 923 * - packet length is not larger than MTU size 924 * 925 * These checks are only done for the first packet of the 926 * transfer since the header is "given" to us by user space. 927 * For the remainder of the packets we compute the values. 928 */ 929 if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 || 930 lrhlen > get_lrh_len(*hdr, req->info.fragsize)) 931 return -EINVAL; 932 933 if (req_opcode(req->info.ctrl) == EXPECTED) { 934 /* 935 * The header is checked only on the first packet. Furthermore, 936 * we ensure that at least one TID entry is copied when the 937 * request is submitted. Therefore, we don't have to verify that 938 * tididx points to something sane. 939 */ 940 u32 tidval = req->tids[req->tididx], 941 tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE, 942 tididx = EXP_TID_GET(tidval, IDX), 943 tidctrl = EXP_TID_GET(tidval, CTRL), 944 tidoff; 945 __le32 kval = hdr->kdeth.ver_tid_offset; 946 947 tidoff = KDETH_GET(kval, OFFSET) * 948 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? 949 KDETH_OM_LARGE : KDETH_OM_SMALL); 950 /* 951 * Expected receive packets have the following 952 * additional checks: 953 * - offset is not larger than the TID size 954 * - TIDCtrl values match between header and TID array 955 * - TID indexes match between header and TID array 956 */ 957 if ((tidoff + datalen > tidlen) || 958 KDETH_GET(kval, TIDCTRL) != tidctrl || 959 KDETH_GET(kval, TID) != tididx) 960 return -EINVAL; 961 } 962 return 0; 963} 964 965/* 966 * Correctly set the BTH.PSN field based on type of 967 * transfer - eager packets can just increment the PSN but 968 * expected packets encode generation and sequence in the 969 * BTH.PSN field so just incrementing will result in errors. 970 */ 971static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) 972{ 973 u32 val = be32_to_cpu(bthpsn), 974 mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull : 975 0xffffffull), 976 psn = val & mask; 977 if (expct) 978 psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) | 979 ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK); 980 else 981 psn = psn + frags; 982 return psn & mask; 983} 984 985static int set_txreq_header(struct user_sdma_request *req, 986 struct user_sdma_txreq *tx, u32 datalen) 987{ 988 struct hfi1_user_sdma_pkt_q *pq = req->pq; 989 struct hfi1_pkt_header *hdr = &tx->hdr; 990 u8 omfactor; /* KDETH.OM */ 991 u16 pbclen; 992 int ret; 993 u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); 994 995 /* Copy the header template to the request before modification */ 996 memcpy(hdr, &req->hdr, sizeof(*hdr)); 997 998 /* 999 * Check if the PBC and LRH length are mismatched. If so 1000 * adjust both in the header. 1001 */ 1002 pbclen = le16_to_cpu(hdr->pbc[0]); 1003 if (PBC2LRH(pbclen) != lrhlen) { 1004 pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); 1005 hdr->pbc[0] = cpu_to_le16(pbclen); 1006 hdr->lrh[2] = cpu_to_be16(lrhlen >> 2); 1007 /* 1008 * Third packet 1009 * This is the first packet in the sequence that has 1010 * a "static" size that can be used for the rest of 1011 * the packets (besides the last one). 1012 */ 1013 if (unlikely(req->seqnum == 2)) { 1014 /* 1015 * From this point on the lengths in both the 1016 * PBC and LRH are the same until the last 1017 * packet. 1018 * Adjust the template so we don't have to update 1019 * every packet 1020 */ 1021 req->hdr.pbc[0] = hdr->pbc[0]; 1022 req->hdr.lrh[2] = hdr->lrh[2]; 1023 } 1024 } 1025 /* 1026 * We only have to modify the header if this is not the 1027 * first packet in the request. Otherwise, we use the 1028 * header given to us. 1029 */ 1030 if (unlikely(!req->seqnum)) { 1031 ret = check_header_template(req, hdr, lrhlen, datalen); 1032 if (ret) 1033 return ret; 1034 goto done; 1035 } 1036 1037 hdr->bth[2] = cpu_to_be32( 1038 set_pkt_bth_psn(hdr->bth[2], 1039 (req_opcode(req->info.ctrl) == EXPECTED), 1040 req->seqnum)); 1041 1042 /* Set ACK request on last packet */ 1043 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) 1044 hdr->bth[2] |= cpu_to_be32(1UL << 31); 1045 1046 /* Set the new offset */ 1047 hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset); 1048 /* Expected packets have to fill in the new TID information */ 1049 if (req_opcode(req->info.ctrl) == EXPECTED) { 1050 tidval = req->tids[req->tididx]; 1051 /* 1052 * If the offset puts us at the end of the current TID, 1053 * advance everything. 1054 */ 1055 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * 1056 PAGE_SIZE)) { 1057 req->tidoffset = 0; 1058 /* 1059 * Since we don't copy all the TIDs, all at once, 1060 * we have to check again. 1061 */ 1062 if (++req->tididx > req->n_tids - 1 || 1063 !req->tids[req->tididx]) { 1064 return -EINVAL; 1065 } 1066 tidval = req->tids[req->tididx]; 1067 } 1068 omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >= 1069 KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT : 1070 KDETH_OM_SMALL_SHIFT; 1071 /* Set KDETH.TIDCtrl based on value for this TID. */ 1072 KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL, 1073 EXP_TID_GET(tidval, CTRL)); 1074 /* Set KDETH.TID based on value for this TID */ 1075 KDETH_SET(hdr->kdeth.ver_tid_offset, TID, 1076 EXP_TID_GET(tidval, IDX)); 1077 /* Clear KDETH.SH when DISABLE_SH flag is set */ 1078 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) 1079 KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0); 1080 /* 1081 * Set the KDETH.OFFSET and KDETH.OM based on size of 1082 * transfer. 1083 */ 1084 trace_hfi1_sdma_user_tid_info( 1085 pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx, 1086 req->tidoffset, req->tidoffset >> omfactor, 1087 omfactor != KDETH_OM_SMALL_SHIFT); 1088 KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET, 1089 req->tidoffset >> omfactor); 1090 KDETH_SET(hdr->kdeth.ver_tid_offset, OM, 1091 omfactor != KDETH_OM_SMALL_SHIFT); 1092 } 1093done: 1094 trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt, 1095 req->info.comp_idx, hdr, tidval); 1096 return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr)); 1097} 1098 1099static int set_txreq_header_ahg(struct user_sdma_request *req, 1100 struct user_sdma_txreq *tx, u32 datalen) 1101{ 1102 u32 ahg[AHG_KDETH_ARRAY_SIZE]; 1103 int idx = 0; 1104 u8 omfactor; /* KDETH.OM */ 1105 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1106 struct hfi1_pkt_header *hdr = &req->hdr; 1107 u16 pbclen = le16_to_cpu(hdr->pbc[0]); 1108 u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); 1109 size_t array_size = ARRAY_SIZE(ahg); 1110 1111 if (PBC2LRH(pbclen) != lrhlen) { 1112 /* PBC.PbcLengthDWs */ 1113 idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12, 1114 (__force u16)cpu_to_le16(LRH2PBC(lrhlen))); 1115 if (idx < 0) 1116 return idx; 1117 /* LRH.PktLen (we need the full 16 bits due to byte swap) */ 1118 idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16, 1119 (__force u16)cpu_to_be16(lrhlen >> 2)); 1120 if (idx < 0) 1121 return idx; 1122 } 1123 1124 /* 1125 * Do the common updates 1126 */ 1127 /* BTH.PSN and BTH.A */ 1128 val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) & 1129 (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff); 1130 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) 1131 val32 |= 1UL << 31; 1132 idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16, 1133 (__force u16)cpu_to_be16(val32 >> 16)); 1134 if (idx < 0) 1135 return idx; 1136 idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16, 1137 (__force u16)cpu_to_be16(val32 & 0xffff)); 1138 if (idx < 0) 1139 return idx; 1140 /* KDETH.Offset */ 1141 idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16, 1142 (__force u16)cpu_to_le16(req->koffset & 0xffff)); 1143 if (idx < 0) 1144 return idx; 1145 idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16, 1146 (__force u16)cpu_to_le16(req->koffset >> 16)); 1147 if (idx < 0) 1148 return idx; 1149 if (req_opcode(req->info.ctrl) == EXPECTED) { 1150 __le16 val; 1151 1152 tidval = req->tids[req->tididx]; 1153 1154 /* 1155 * If the offset puts us at the end of the current TID, 1156 * advance everything. 1157 */ 1158 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * 1159 PAGE_SIZE)) { 1160 req->tidoffset = 0; 1161 /* 1162 * Since we don't copy all the TIDs, all at once, 1163 * we have to check again. 1164 */ 1165 if (++req->tididx > req->n_tids - 1 || 1166 !req->tids[req->tididx]) 1167 return -EINVAL; 1168 tidval = req->tids[req->tididx]; 1169 } 1170 omfactor = ((EXP_TID_GET(tidval, LEN) * 1171 PAGE_SIZE) >= 1172 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT : 1173 KDETH_OM_SMALL_SHIFT; 1174 /* KDETH.OM and KDETH.OFFSET (TID) */ 1175 idx = ahg_header_set( 1176 ahg, idx, array_size, 7, 0, 16, 1177 ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 | 1178 ((req->tidoffset >> omfactor) 1179 & 0x7fff))); 1180 if (idx < 0) 1181 return idx; 1182 /* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */ 1183 val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) | 1184 (EXP_TID_GET(tidval, IDX) & 0x3ff)); 1185 1186 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) { 1187 val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, 1188 INTR) << 1189 AHG_KDETH_INTR_SHIFT)); 1190 } else { 1191 val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ? 1192 cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) : 1193 cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, 1194 INTR) << 1195 AHG_KDETH_INTR_SHIFT)); 1196 } 1197 1198 idx = ahg_header_set(ahg, idx, array_size, 1199 7, 16, 14, (__force u16)val); 1200 if (idx < 0) 1201 return idx; 1202 } 1203 1204 trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt, 1205 req->info.comp_idx, req->sde->this_idx, 1206 req->ahg_idx, ahg, idx, tidval); 1207 sdma_txinit_ahg(&tx->txreq, 1208 SDMA_TXREQ_F_USE_AHG, 1209 datalen, req->ahg_idx, idx, 1210 ahg, sizeof(req->hdr), 1211 user_sdma_txreq_cb); 1212 1213 return idx; 1214} 1215 1216/** 1217 * user_sdma_txreq_cb() - SDMA tx request completion callback. 1218 * @txreq: valid sdma tx request 1219 * @status: success/failure of request 1220 * 1221 * Called when the SDMA progress state machine gets notification that 1222 * the SDMA descriptors for this tx request have been processed by the 1223 * DMA engine. Called in interrupt context. 1224 * Only do work on completed sequences. 1225 */ 1226static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) 1227{ 1228 struct user_sdma_txreq *tx = 1229 container_of(txreq, struct user_sdma_txreq, txreq); 1230 struct user_sdma_request *req; 1231 struct hfi1_user_sdma_pkt_q *pq; 1232 struct hfi1_user_sdma_comp_q *cq; 1233 enum hfi1_sdma_comp_state state = COMPLETE; 1234 1235 if (!tx->req) 1236 return; 1237 1238 req = tx->req; 1239 pq = req->pq; 1240 cq = req->cq; 1241 1242 if (status != SDMA_TXREQ_S_OK) { 1243 SDMA_DBG(req, "SDMA completion with error %d", 1244 status); 1245 WRITE_ONCE(req->has_error, 1); 1246 state = ERROR; 1247 } 1248 1249 req->seqcomp = tx->seqnum; 1250 kmem_cache_free(pq->txreq_cache, tx); 1251 1252 /* sequence isn't complete? We are done */ 1253 if (req->seqcomp != req->info.npkts - 1) 1254 return; 1255 1256 user_sdma_free_request(req); 1257 set_comp_state(pq, cq, req->info.comp_idx, state, status); 1258 pq_update(pq); 1259} 1260 1261static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) 1262{ 1263 if (atomic_dec_and_test(&pq->n_reqs)) 1264 wake_up(&pq->wait); 1265} 1266 1267static void user_sdma_free_request(struct user_sdma_request *req) 1268{ 1269 if (!list_empty(&req->txps)) { 1270 struct sdma_txreq *t, *p; 1271 1272 list_for_each_entry_safe(t, p, &req->txps, list) { 1273 struct user_sdma_txreq *tx = 1274 container_of(t, struct user_sdma_txreq, txreq); 1275 list_del_init(&t->list); 1276 sdma_txclean(req->pq->dd, t); 1277 kmem_cache_free(req->pq->txreq_cache, tx); 1278 } 1279 } 1280 1281 kfree(req->tids); 1282 clear_bit(req->info.comp_idx, req->pq->req_in_use); 1283} 1284 1285static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, 1286 struct hfi1_user_sdma_comp_q *cq, 1287 u16 idx, enum hfi1_sdma_comp_state state, 1288 int ret) 1289{ 1290 if (state == ERROR) 1291 cq->comps[idx].errcode = -ret; 1292 smp_wmb(); /* make sure errcode is visible first */ 1293 cq->comps[idx].status = state; 1294 trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt, 1295 idx, state, ret); 1296} 1297 1298static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, 1299 unsigned int start, unsigned int npages) 1300{ 1301 hfi1_release_user_pages(mm, pages + start, npages, false); 1302 kfree(pages); 1303} 1304 1305static void free_system_node(struct sdma_mmu_node *node) 1306{ 1307 if (node->npages) { 1308 unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0, 1309 node->npages); 1310 atomic_sub(node->npages, &node->pq->n_locked); 1311 } 1312 kfree(node); 1313} 1314 1315/* 1316 * kref_get()'s an additional kref on the returned rb_node to prevent rb_node 1317 * from being released until after rb_node is assigned to an SDMA descriptor 1318 * (struct sdma_desc) under add_system_iovec_to_sdma_packet(), even if the 1319 * virtual address range for rb_node is invalidated between now and then. 1320 */ 1321static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler, 1322 unsigned long start, 1323 unsigned long end) 1324{ 1325 struct mmu_rb_node *rb_node; 1326 unsigned long flags; 1327 1328 spin_lock_irqsave(&handler->lock, flags); 1329 rb_node = hfi1_mmu_rb_get_first(handler, start, (end - start)); 1330 if (!rb_node) { 1331 spin_unlock_irqrestore(&handler->lock, flags); 1332 return NULL; 1333 } 1334 1335 /* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */ 1336 kref_get(&rb_node->refcount); 1337 spin_unlock_irqrestore(&handler->lock, flags); 1338 1339 return container_of(rb_node, struct sdma_mmu_node, rb); 1340} 1341 1342static int pin_system_pages(struct user_sdma_request *req, 1343 uintptr_t start_address, size_t length, 1344 struct sdma_mmu_node *node, int npages) 1345{ 1346 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1347 int pinned, cleared; 1348 struct page **pages; 1349 1350 pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); 1351 if (!pages) 1352 return -ENOMEM; 1353 1354retry: 1355 if (!hfi1_can_pin_pages(pq->dd, current->mm, atomic_read(&pq->n_locked), 1356 npages)) { 1357 SDMA_DBG(req, "Evicting: nlocked %u npages %u", 1358 atomic_read(&pq->n_locked), npages); 1359 cleared = sdma_cache_evict(pq, npages); 1360 if (cleared >= npages) 1361 goto retry; 1362 } 1363 1364 SDMA_DBG(req, "Acquire user pages start_address %lx node->npages %u npages %u", 1365 start_address, node->npages, npages); 1366 pinned = hfi1_acquire_user_pages(current->mm, start_address, npages, 0, 1367 pages); 1368 1369 if (pinned < 0) { 1370 kfree(pages); 1371 SDMA_DBG(req, "pinned %d", pinned); 1372 return pinned; 1373 } 1374 if (pinned != npages) { 1375 unpin_vector_pages(current->mm, pages, node->npages, pinned); 1376 SDMA_DBG(req, "npages %u pinned %d", npages, pinned); 1377 return -EFAULT; 1378 } 1379 node->rb.addr = start_address; 1380 node->rb.len = length; 1381 node->pages = pages; 1382 node->npages = npages; 1383 atomic_add(pinned, &pq->n_locked); 1384 SDMA_DBG(req, "done. pinned %d", pinned); 1385 return 0; 1386} 1387 1388/* 1389 * kref refcount on *node_p will be 2 on successful addition: one kref from 1390 * kref_init() for mmu_rb_handler and one kref to prevent *node_p from being 1391 * released until after *node_p is assigned to an SDMA descriptor (struct 1392 * sdma_desc) under add_system_iovec_to_sdma_packet(), even if the virtual 1393 * address range for *node_p is invalidated between now and then. 1394 */ 1395static int add_system_pinning(struct user_sdma_request *req, 1396 struct sdma_mmu_node **node_p, 1397 unsigned long start, unsigned long len) 1398 1399{ 1400 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1401 struct sdma_mmu_node *node; 1402 int ret; 1403 1404 node = kzalloc(sizeof(*node), GFP_KERNEL); 1405 if (!node) 1406 return -ENOMEM; 1407 1408 /* First kref "moves" to mmu_rb_handler */ 1409 kref_init(&node->rb.refcount); 1410 1411 /* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */ 1412 kref_get(&node->rb.refcount); 1413 1414 node->pq = pq; 1415 ret = pin_system_pages(req, start, len, node, PFN_DOWN(len)); 1416 if (ret == 0) { 1417 ret = hfi1_mmu_rb_insert(pq->handler, &node->rb); 1418 if (ret) 1419 free_system_node(node); 1420 else 1421 *node_p = node; 1422 1423 return ret; 1424 } 1425 1426 kfree(node); 1427 return ret; 1428} 1429 1430static int get_system_cache_entry(struct user_sdma_request *req, 1431 struct sdma_mmu_node **node_p, 1432 size_t req_start, size_t req_len) 1433{ 1434 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1435 u64 start = ALIGN_DOWN(req_start, PAGE_SIZE); 1436 u64 end = PFN_ALIGN(req_start + req_len); 1437 struct mmu_rb_handler *handler = pq->handler; 1438 int ret; 1439 1440 if ((end - start) == 0) { 1441 SDMA_DBG(req, 1442 "Request for empty cache entry req_start %lx req_len %lx start %llx end %llx", 1443 req_start, req_len, start, end); 1444 return -EINVAL; 1445 } 1446 1447 SDMA_DBG(req, "req_start %lx req_len %lu", req_start, req_len); 1448 1449 while (1) { 1450 struct sdma_mmu_node *node = 1451 find_system_node(handler, start, end); 1452 u64 prepend_len = 0; 1453 1454 SDMA_DBG(req, "node %p start %llx end %llu", node, start, end); 1455 if (!node) { 1456 ret = add_system_pinning(req, node_p, start, 1457 end - start); 1458 if (ret == -EEXIST) { 1459 /* 1460 * Another execution context has inserted a 1461 * conficting entry first. 1462 */ 1463 continue; 1464 } 1465 return ret; 1466 } 1467 1468 if (node->rb.addr <= start) { 1469 /* 1470 * This entry covers at least part of the region. If it doesn't extend 1471 * to the end, then this will be called again for the next segment. 1472 */ 1473 *node_p = node; 1474 return 0; 1475 } 1476 1477 SDMA_DBG(req, "prepend: node->rb.addr %lx, node->rb.refcount %d", 1478 node->rb.addr, kref_read(&node->rb.refcount)); 1479 prepend_len = node->rb.addr - start; 1480 1481 /* 1482 * This node will not be returned, instead a new node 1483 * will be. So release the reference. 1484 */ 1485 kref_put(&node->rb.refcount, hfi1_mmu_rb_release); 1486 1487 /* Prepend a node to cover the beginning of the allocation */ 1488 ret = add_system_pinning(req, node_p, start, prepend_len); 1489 if (ret == -EEXIST) { 1490 /* Another execution context has inserted a conficting entry first. */ 1491 continue; 1492 } 1493 return ret; 1494 } 1495} 1496 1497static void sdma_mmu_rb_node_get(void *ctx) 1498{ 1499 struct mmu_rb_node *node = ctx; 1500 1501 kref_get(&node->refcount); 1502} 1503 1504static void sdma_mmu_rb_node_put(void *ctx) 1505{ 1506 struct sdma_mmu_node *node = ctx; 1507 1508 kref_put(&node->rb.refcount, hfi1_mmu_rb_release); 1509} 1510 1511static int add_mapping_to_sdma_packet(struct user_sdma_request *req, 1512 struct user_sdma_txreq *tx, 1513 struct sdma_mmu_node *cache_entry, 1514 size_t start, 1515 size_t from_this_cache_entry) 1516{ 1517 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1518 unsigned int page_offset; 1519 unsigned int from_this_page; 1520 size_t page_index; 1521 void *ctx; 1522 int ret; 1523 1524 /* 1525 * Because the cache may be more fragmented than the memory that is being accessed, 1526 * it's not strictly necessary to have a descriptor per cache entry. 1527 */ 1528 1529 while (from_this_cache_entry) { 1530 page_index = PFN_DOWN(start - cache_entry->rb.addr); 1531 1532 if (page_index >= cache_entry->npages) { 1533 SDMA_DBG(req, 1534 "Request for page_index %zu >= cache_entry->npages %u", 1535 page_index, cache_entry->npages); 1536 return -EINVAL; 1537 } 1538 1539 page_offset = start - ALIGN_DOWN(start, PAGE_SIZE); 1540 from_this_page = PAGE_SIZE - page_offset; 1541 1542 if (from_this_page < from_this_cache_entry) { 1543 ctx = NULL; 1544 } else { 1545 /* 1546 * In the case they are equal the next line has no practical effect, 1547 * but it's better to do a register to register copy than a conditional 1548 * branch. 1549 */ 1550 from_this_page = from_this_cache_entry; 1551 ctx = cache_entry; 1552 } 1553 1554 ret = sdma_txadd_page(pq->dd, &tx->txreq, 1555 cache_entry->pages[page_index], 1556 page_offset, from_this_page, 1557 ctx, 1558 sdma_mmu_rb_node_get, 1559 sdma_mmu_rb_node_put); 1560 if (ret) { 1561 /* 1562 * When there's a failure, the entire request is freed by 1563 * user_sdma_send_pkts(). 1564 */ 1565 SDMA_DBG(req, 1566 "sdma_txadd_page failed %d page_index %lu page_offset %u from_this_page %u", 1567 ret, page_index, page_offset, from_this_page); 1568 return ret; 1569 } 1570 start += from_this_page; 1571 from_this_cache_entry -= from_this_page; 1572 } 1573 return 0; 1574} 1575 1576static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req, 1577 struct user_sdma_txreq *tx, 1578 struct user_sdma_iovec *iovec, 1579 size_t from_this_iovec) 1580{ 1581 while (from_this_iovec > 0) { 1582 struct sdma_mmu_node *cache_entry; 1583 size_t from_this_cache_entry; 1584 size_t start; 1585 int ret; 1586 1587 start = (uintptr_t)iovec->iov.iov_base + iovec->offset; 1588 ret = get_system_cache_entry(req, &cache_entry, start, 1589 from_this_iovec); 1590 if (ret) { 1591 SDMA_DBG(req, "pin system segment failed %d", ret); 1592 return ret; 1593 } 1594 1595 from_this_cache_entry = cache_entry->rb.len - (start - cache_entry->rb.addr); 1596 if (from_this_cache_entry > from_this_iovec) 1597 from_this_cache_entry = from_this_iovec; 1598 1599 ret = add_mapping_to_sdma_packet(req, tx, cache_entry, start, 1600 from_this_cache_entry); 1601 1602 /* 1603 * Done adding cache_entry to zero or more sdma_desc. Can 1604 * kref_put() the "safety" kref taken under 1605 * get_system_cache_entry(). 1606 */ 1607 kref_put(&cache_entry->rb.refcount, hfi1_mmu_rb_release); 1608 1609 if (ret) { 1610 SDMA_DBG(req, "add system segment failed %d", ret); 1611 return ret; 1612 } 1613 1614 iovec->offset += from_this_cache_entry; 1615 from_this_iovec -= from_this_cache_entry; 1616 } 1617 1618 return 0; 1619} 1620 1621static int add_system_pages_to_sdma_packet(struct user_sdma_request *req, 1622 struct user_sdma_txreq *tx, 1623 struct user_sdma_iovec *iovec, 1624 u32 *pkt_data_remaining) 1625{ 1626 size_t remaining_to_add = *pkt_data_remaining; 1627 /* 1628 * Walk through iovec entries, ensure the associated pages 1629 * are pinned and mapped, add data to the packet until no more 1630 * data remains to be added. 1631 */ 1632 while (remaining_to_add > 0) { 1633 struct user_sdma_iovec *cur_iovec; 1634 size_t from_this_iovec; 1635 int ret; 1636 1637 cur_iovec = iovec; 1638 from_this_iovec = iovec->iov.iov_len - iovec->offset; 1639 1640 if (from_this_iovec > remaining_to_add) { 1641 from_this_iovec = remaining_to_add; 1642 } else { 1643 /* The current iovec entry will be consumed by this pass. */ 1644 req->iov_idx++; 1645 iovec++; 1646 } 1647 1648 ret = add_system_iovec_to_sdma_packet(req, tx, cur_iovec, 1649 from_this_iovec); 1650 if (ret) 1651 return ret; 1652 1653 remaining_to_add -= from_this_iovec; 1654 } 1655 *pkt_data_remaining = remaining_to_add; 1656 1657 return 0; 1658} 1659 1660static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, 1661 unsigned long len) 1662{ 1663 return (bool)(node->addr == addr); 1664} 1665 1666/* 1667 * Return 1 to remove the node from the rb tree and call the remove op. 1668 * 1669 * Called with the rb tree lock held. 1670 */ 1671static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, 1672 void *evict_arg, bool *stop) 1673{ 1674 struct sdma_mmu_node *node = 1675 container_of(mnode, struct sdma_mmu_node, rb); 1676 struct evict_data *evict_data = evict_arg; 1677 1678 /* this node will be evicted, add its pages to our count */ 1679 evict_data->cleared += node->npages; 1680 1681 /* have enough pages been cleared? */ 1682 if (evict_data->cleared >= evict_data->target) 1683 *stop = true; 1684 1685 return 1; /* remove this node */ 1686} 1687 1688static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode) 1689{ 1690 struct sdma_mmu_node *node = 1691 container_of(mnode, struct sdma_mmu_node, rb); 1692 1693 free_system_node(node); 1694} 1695