// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2015 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/intel-iommu.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/intel-svm.h>
#include <linux/rculist.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
#include <linux/ioasid.h>
#include <asm/page.h>
#include <asm/fpu/api.h>

#include "pasid.h"

static irqreturn_t prq_event_thread(int irq, void *d);
static void intel_svm_drain_prq(struct device *dev, u32 pasid);

#define PRQ_ORDER 0

int intel_svm_enable_prq(struct intel_iommu *iommu)
{
	struct page *pages;
	int irq, ret;

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
	if (!pages) {
		pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
			iommu->name);
		return -ENOMEM;
	}
	iommu->prq = page_address(pages);

	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0) {
		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
		       iommu->name);
		ret = -EINVAL;
	err:
		free_pages((unsigned long)iommu->prq, PRQ_ORDER);
		iommu->prq = NULL;
		return ret;
	}
	iommu->pr_irq = irq;

	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);

	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
				   iommu->prq_name, iommu);
	if (ret) {
		pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
		       iommu->name);
		dmar_free_hwirq(irq);
		iommu->pr_irq = 0;
		goto err;
	}
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);

	init_completion(&iommu->prq_complete);

	return 0;
}

int intel_svm_finish_prq(struct intel_iommu *iommu)
{
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);

	if (iommu->pr_irq) {
		free_irq(iommu->pr_irq, iommu);
		dmar_free_hwirq(iommu->pr_irq);
		iommu->pr_irq = 0;
	}

	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	iommu->prq = NULL;

	return 0;
}

static inline bool intel_svm_capable(struct intel_iommu *iommu)
{
	return iommu->flags & VTD_FLAG_SVM_CAPABLE;
}

void intel_svm_check(struct intel_iommu *iommu)
{
	if (!pasid_supported(iommu))
		return;

	if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
	    !cap_fl1gp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible 1GB page capability\n",
		       iommu->name);
		return;
	}

	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
	    !cap_5lp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible paging mode\n",
		       iommu->name);
		return;
	}

	iommu->flags |= VTD_FLAG_SVM_CAPABLE;
}

static void __flush_svm_range_dev(struct intel_svm *svm,
				  struct intel_svm_dev *sdev,
				  unsigned long address,
				  unsigned long pages, int ih)
{
	struct device_domain_info *info = get_domain_info(sdev->dev);

	if (WARN_ON(!pages))
		return;

	qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih);
	if (info->ats_enabled)
		qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid,
					 svm->pasid, sdev->qdep, address,
					 order_base_2(pages));
}
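
/*
 * Split an arbitrary flush range into naturally aligned power-of-two
 * chunks. The device-IOTLB invalidation issued by __flush_svm_range_dev()
 * encodes the size as an address-mask order (order_base_2() above), so it
 * can only describe aligned power-of-two regions.
 */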
static void intel_flush_svm_range_dev(struct intel_svm *svm,
				      struct intel_svm_dev *sdev,
				      unsigned long address,
				      unsigned long pages, int ih)
{
	unsigned long shift = ilog2(__roundup_pow_of_two(pages));
	unsigned long align = (1ULL << (VTD_PAGE_SHIFT + shift));
	unsigned long start = ALIGN_DOWN(address, align);
	unsigned long end = ALIGN(address + (pages << VTD_PAGE_SHIFT), align);

	while (start < end) {
		__flush_svm_range_dev(svm, sdev, start, align >> VTD_PAGE_SHIFT, ih);
		start += align;
	}
}

static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
				  unsigned long pages, int ih)
{
	struct intel_svm_dev *sdev;

	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
	rcu_read_unlock();
}

/* Pages have been freed at this point */
static void intel_invalidate_range(struct mmu_notifier *mn,
				   struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

	intel_flush_svm_range(svm, start,
			      (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
}

static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
	struct intel_svm_dev *sdev;

	/* This might end up being called from exit_mmap(), *before* the page
	 * tables are cleared. And __mmu_notifier_release() will delete us from
	 * the list of notifiers so that our invalidate_range() callback doesn't
	 * get called when the page tables are cleared. So we need to protect
	 * against hardware accessing those page tables.
	 *
	 * We do it by clearing the entry in the PASID table and then flushing
	 * the IOTLB and the PASID table caches. This might upset hardware;
	 * perhaps we'll want to point the PASID to a dummy PGD (like the zero
	 * page) so that we end up taking a fault that the hardware really
	 * *has* to handle gracefully without affecting other processes.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_pasid_tear_down_entry(sdev->iommu, sdev->dev,
					    svm->pasid, true);
	rcu_read_unlock();
}

static const struct mmu_notifier_ops intel_mmuops = {
	.release = intel_mm_release,
	.invalidate_range = intel_invalidate_range,
};

static DEFINE_MUTEX(pasid_mutex);
static LIST_HEAD(global_svm_list);

#define for_each_svm_dev(sdev, svm, d)			\
	list_for_each_entry((sdev), &(svm)->devs, list)	\
		if ((d) != (sdev)->dev) {} else

static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
			     struct intel_svm **rsvm,
			     struct intel_svm_dev **rsdev)
{
	struct intel_svm_dev *d, *sdev = NULL;
	struct intel_svm *svm;

	/* The caller should hold the pasid_mutex lock */
	if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
		return -EINVAL;

	if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
		return -EINVAL;

	svm = ioasid_find(NULL, pasid, NULL);
	if (IS_ERR(svm))
		return PTR_ERR(svm);

	if (!svm)
		goto out;

	/*
	 * If we found an svm for the PASID, there must be at least one device
	 * bound to it.
	 */
	if (WARN_ON(list_empty(&svm->devs)))
		return -EINVAL;

	rcu_read_lock();
	list_for_each_entry_rcu(d, &svm->devs, list) {
		if (d->dev == dev) {
			sdev = d;
			break;
		}
	}
	rcu_read_unlock();

out:
	*rsvm = svm;
	*rsdev = sdev;

	return 0;
}
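
/*
 * Bind a guest PASID to @dev. The guest owns the first-level page tables
 * (data->gpgd) while @domain supplies the second-level tables, so the PASID
 * entry is programmed in nested mode. The intel_svm/intel_svm_dev tracking
 * structures allocated here are looked up again by the page-request and
 * page-response paths.
 */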
int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
			  struct iommu_gpasid_bind_data *data)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev = NULL;
	struct dmar_domain *dmar_domain;
	struct device_domain_info *info;
	struct intel_svm *svm = NULL;
	unsigned long iflags;
	int ret = 0;

	if (WARN_ON(!iommu) || !data)
		return -EINVAL;

	if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
		return -EINVAL;

	/* IOMMU core ensures argsz is more than the start of the union */
	if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd))
		return -EINVAL;

	/* Make sure no undefined flags are used in vendor data */
	if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1))
		return -EINVAL;

	if (!dev_is_pci(dev))
		return -ENOTSUPP;

	/* VT-d supports devices with full 20-bit PASIDs only */
	if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
		return -EINVAL;

	/*
	 * We only check the host PASID range; we have no way to validate the
	 * guest PASID range.
	 */
	if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
		return -EINVAL;

	info = get_domain_info(dev);
	if (!info)
		return -EINVAL;

	dmar_domain = to_dmar_domain(domain);

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		/*
		 * Do not allow multiple bindings of the same device-PASID since
		 * there is only one set of second-level page tables per PASID.
		 * We may revisit this once sharing a PGD across domains is
		 * supported.
		 */
		dev_warn_ratelimited(dev, "Already bound with PASID %u\n",
				     svm->pasid);
		ret = -EBUSY;
		goto out;
	}

	if (!svm) {
		/* We come here when the PASID has never been bound to a device. */
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			goto out;
		}
		/* REVISIT: the upper layer/VFIO can track the host process that
		 * binds the PASID. ioasid_set = mm might be sufficient for VFIO
		 * to check PASID VMM ownership. We can drop the following line
		 * once VFIO and the IOASID set check are in place.
		 */
		svm->mm = get_task_mm(current);
		svm->pasid = data->hpasid;
		if (data->flags & IOMMU_SVA_GPASID_VAL) {
			svm->gpasid = data->gpasid;
			svm->flags |= SVM_FLAG_GUEST_PASID;
		}
		ioasid_set_data(data->hpasid, svm);
		INIT_LIST_HEAD_RCU(&svm->devs);
		mmput(svm->mm);
	}
	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	sdev->iommu = iommu;

	/* Only count users if device has aux domains */
	if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
		sdev->users = 1;

	/* Set up device context entry for PASID if not enabled already */
	ret = intel_iommu_enable_pasid(iommu, sdev->dev);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
		kfree(sdev);
		goto out;
	}

	/*
	 * The PASID table is per device for better security. Therefore, each
	 * bind of a new device, even with an existing PASID, needs to call
	 * the nested-mode setup function here.
	 */
	spin_lock_irqsave(&iommu->lock, iflags);
	ret = intel_pasid_setup_nested(iommu, dev,
				       (pgd_t *)(uintptr_t)data->gpgd,
				       data->hpasid, &data->vendor.vtd, dmar_domain,
				       data->addr_width);
	spin_unlock_irqrestore(&iommu->lock, iflags);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
				    data->hpasid, ret);
		/*
		 * The PASID entry should be in a cleared state if nested-mode
		 * setup failed, so we only need to clear the IOASID tracking
		 * data so that the free call will succeed.
		 */
		kfree(sdev);
		goto out;
	}

	svm->flags |= SVM_FLAG_GUEST_MODE;

	init_rcu_head(&sdev->rcu);
	list_add_rcu(&sdev->list, &svm->devs);
out:
	if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
		ioasid_set_data(data->hpasid, NULL);
		kfree(svm);
	}

	mutex_unlock(&pasid_mutex);
	return ret;
}
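
/*
 * Undo intel_svm_bind_gpasid(): tear down the nested PASID entry, drain any
 * page requests still in flight for the PASID, and release the tracking
 * structures. The IOASID itself is not freed here because it was allocated
 * by the caller (e.g. the VMM) before the bind.
 */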
int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev;
	struct intel_svm *svm;
	int ret;

	if (WARN_ON(!iommu))
		return -EINVAL;

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
			sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				/*
				 * We do not free the IOASID here because the
				 * IOMMU driver did not allocate it. Unlike
				 * native SVM, an IOASID for guest use is
				 * allocated prior to the bind call. In any
				 * case, if the free call comes before the
				 * unbind, the IOMMU driver will be notified
				 * and perform the cleanup.
				 */
				ioasid_set_data(pasid, NULL);
				kfree(svm);
			}
		}
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}

static void _load_pasid(void *unused)
{
	update_pasid();
}

static void load_pasid(struct mm_struct *mm, u32 pasid)
{
	mutex_lock(&mm->context.lock);

	/* Synchronize with READ_ONCE in update_pasid(). */
	smp_store_release(&mm->pasid, pasid);

	/* Update PASID MSR on all CPUs running the mm's tasks. */
	on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);

	mutex_unlock(&mm->context.lock);
}
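
/*
 * Native SVM bind: allocate (or reuse) a PASID for @mm, program a
 * first-level PASID table entry that points at the mm's page tables (or
 * init_mm for supervisor mode), and register an MMU notifier so that CPU
 * TLB invalidations are mirrored into the IOTLB and device TLBs.
 */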
/* Caller must hold pasid_mutex, mm reference */
static int
intel_svm_bind_mm(struct device *dev, unsigned int flags,
		  struct svm_dev_ops *ops,
		  struct mm_struct *mm, struct intel_svm_dev **sd)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct device_domain_info *info;
	struct intel_svm_dev *sdev;
	struct intel_svm *svm = NULL;
	unsigned long iflags;
	int pasid_max;
	int ret;

	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!intel_svm_capable(iommu))
		return -ENOTSUPP;

	if (dev_is_pci(dev)) {
		pasid_max = pci_max_pasids(to_pci_dev(dev));
		if (pasid_max < 0)
			return -EINVAL;
	} else
		pasid_max = 1 << 20;

	/* A supervisor-mode PASID bind should have mm == NULL */
	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
		if (!ecap_srs(iommu->ecap) || mm) {
			pr_err("Supervisor PASID with user provided mm.\n");
			return -EINVAL;
		}
	}

	if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
		struct intel_svm *t;

		list_for_each_entry(t, &global_svm_list, list) {
			if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
				continue;

			svm = t;
			if (svm->pasid >= pasid_max) {
				dev_warn(dev,
					 "Limited PASID width. Cannot use existing PASID %d\n",
					 svm->pasid);
				ret = -ENOSPC;
				goto out;
			}

			/* Find the matching device in svm list */
			for_each_svm_dev(sdev, svm, dev) {
				if (sdev->ops != ops) {
					ret = -EBUSY;
					goto out;
				}
				sdev->users++;
				goto success;
			}

			break;
		}
	}

	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->iommu = iommu;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret) {
		kfree(sdev);
		goto out;
	}

	info = get_domain_info(dev);
	sdev->did = FLPT_DEFAULT_DID;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}

	/* Finish the setup now we know we're keeping it */
	sdev->users = 1;
	sdev->ops = ops;
	init_rcu_head(&sdev->rcu);

	if (!svm) {
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			kfree(sdev);
			goto out;
		}

		if (pasid_max > intel_pasid_max_id)
			pasid_max = intel_pasid_max_id;

		/* Do not use PASID 0, reserved for RID to PASID */
		svm->pasid = ioasid_alloc(NULL, PASID_MIN,
					  pasid_max - 1, svm);
		if (svm->pasid == INVALID_IOASID) {
			kfree(svm);
			kfree(sdev);
			ret = -ENOSPC;
			goto out;
		}
		svm->notifier.ops = &intel_mmuops;
		svm->mm = mm;
		svm->flags = flags;
		INIT_LIST_HEAD_RCU(&svm->devs);
		INIT_LIST_HEAD(&svm->list);
		ret = -ENOMEM;
		if (mm) {
			ret = mmu_notifier_register(&svm->notifier, mm);
			if (ret) {
				ioasid_free(svm->pasid);
				kfree(svm);
				kfree(sdev);
				goto out;
			}
		}

		spin_lock_irqsave(&iommu->lock, iflags);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock_irqrestore(&iommu->lock, iflags);
		if (ret) {
			if (mm)
				mmu_notifier_unregister(&svm->notifier, mm);
			ioasid_free(svm->pasid);
			kfree(svm);
			kfree(sdev);
			goto out;
		}

		list_add_tail(&svm->list, &global_svm_list);
		if (mm) {
			/* The newly allocated PASID is loaded into the mm. */
			load_pasid(mm, svm->pasid);
		}
	} else {
		/*
		 * Binding a new device with an existing PASID, need to set up
		 * the PASID entry.
		 */
		spin_lock_irqsave(&iommu->lock, iflags);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock_irqrestore(&iommu->lock, iflags);
		if (ret) {
			kfree(sdev);
			goto out;
		}
	}
	list_add_rcu(&sdev->list, &svm->devs);
success:
	sdev->pasid = svm->pasid;
	sdev->sva.dev = dev;
	if (sd)
		*sd = sdev;
	ret = 0;
out:
	return ret;
}
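
/*
 * Drop a device's reference on a PASID. When the last device goes away, the
 * PASID entry is torn down, outstanding page requests are drained, and the
 * PASID is returned to the allocator.
 */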
/* Caller must hold pasid_mutex */
static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
{
	struct intel_svm_dev *sdev;
	struct intel_iommu *iommu;
	struct intel_svm *svm;
	int ret = -EINVAL;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		goto out;

	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			/* Flush the PASID cache and IOTLB for this device.
			 * Note that we do depend on the hardware *not* using
			 * the PASID any more. Just as we depend on other
			 * devices never using PASIDs that they have no right
			 * to use. We have a *shared* PASID table, because it's
			 * large and has to be physically contiguous. So it's
			 * hard to be as defensive as we might like. */
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				ioasid_free(svm->pasid);
				if (svm->mm) {
					mmu_notifier_unregister(&svm->notifier, svm->mm);
					/* Clear mm's pasid. */
					load_pasid(svm->mm, PASID_DISABLED);
				}
				list_del(&svm->list);
				/* We mandate that no page faults may be outstanding
				 * for the PASID when intel_svm_unbind_mm() is called.
				 * If that is not obeyed, subtle errors will happen.
				 * Let's make them less subtle... */
				memset(svm, 0x6b, sizeof(*svm));
				kfree(svm);
			}
		}
	}
out:
	return ret;
}
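
/*
 * In-memory layout of one page request descriptor as written by the IOMMU
 * into the page request queue: two quadwords of request information,
 * followed by two quadwords of optional device-private data that must be
 * echoed back in the page group response.
 */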
/* Page request queue descriptor */
struct page_req_dsc {
	union {
		struct {
			u64 type:8;
			u64 pasid_present:1;
			u64 priv_data_present:1;
			u64 rsvd:6;
			u64 rid:16;
			u64 pasid:20;
			u64 exe_req:1;
			u64 pm_req:1;
			u64 rsvd2:10;
		};
		u64 qw_0;
	};
	union {
		struct {
			u64 rd_req:1;
			u64 wr_req:1;
			u64 lpig:1;
			u64 prg_index:9;
			u64 addr:52;
		};
		u64 qw_1;
	};
	u64 priv_data[2];
};

#define PRQ_RING_MASK	((0x1000 << PRQ_ORDER) - 0x20)

static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
{
	unsigned long requested = 0;

	if (req->exe_req)
		requested |= VM_EXEC;

	if (req->rd_req)
		requested |= VM_READ;

	if (req->wr_req)
		requested |= VM_WRITE;

	return (requested & ~vma->vm_flags) != 0;
}

static bool is_canonical_address(u64 addr)
{
	int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	long saddr = (long) addr;

	return (((saddr << shift) >> shift) == saddr);
}

/**
 * intel_svm_drain_prq - Drain page requests and responses for a pasid
 * @dev: target device
 * @pasid: pasid for draining
 *
 * Drain all pending page requests and responses related to @pasid in both
 * software and hardware. This is supposed to be called after the device
 * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
 * and DevTLB have been invalidated.
 *
 * It waits until all pending page requests for @pasid in the page fault
 * queue are completed by the prq handling thread. Then follow the steps
 * described in VT-d spec CH7.10 to drain all page requests and page
 * responses pending in the hardware.
 */
static void intel_svm_drain_prq(struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct qi_desc desc[3];
	struct pci_dev *pdev;
	int head, tail;
	u16 sid, did;
	int qdep;

	info = get_domain_info(dev);
	if (WARN_ON(!info || !dev_is_pci(dev)))
		return;

	if (!info->pri_enabled)
		return;

	iommu = info->iommu;
	domain = info->domain;
	pdev = to_pci_dev(dev);
	sid = PCI_DEVID(info->bus, info->devfn);
	did = domain->iommu_did[iommu->seq_id];
	qdep = pci_ats_queue_depth(pdev);

	/*
	 * Check and wait until all pending page requests in the queue are
	 * handled by the prq handling thread.
	 */
prq_retry:
	reinit_completion(&iommu->prq_complete);
	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct page_req_dsc *req;

		req = &iommu->prq[head / sizeof(*req)];
		if (!req->pasid_present || req->pasid != pasid) {
			head = (head + sizeof(*req)) & PRQ_RING_MASK;
			continue;
		}

		wait_for_completion(&iommu->prq_complete);
		goto prq_retry;
	}

	/*
	 * Perform steps described in VT-d spec CH7.10 to drain page
	 * requests and responses in hardware.
	 */
	memset(desc, 0, sizeof(desc));
	desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
			QI_IWD_FENCE |
			QI_IWD_TYPE;
	desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
			QI_EIOTLB_DID(did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
	desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
			QI_DEV_EIOTLB_SID(sid) |
			QI_DEV_EIOTLB_QDEP(qdep) |
			QI_DEIOTLB_TYPE |
			QI_DEV_IOTLB_PFSID(info->pfsid);
qi_retry:
	reinit_completion(&iommu->prq_complete);
	qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		wait_for_completion(&iommu->prq_complete);
		goto qi_retry;
	}
}
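
/*
 * The two helpers below convert a hardware page request descriptor into a
 * generic struct iommu_fault_event and report it through
 * iommu_report_device_fault(). This is how guest-mode (nested) page
 * requests are delivered to the fault consumer instead of being resolved
 * locally with handle_mm_fault().
 */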
static int prq_to_iommu_prot(struct page_req_dsc *req)
{
	int prot = 0;

	if (req->rd_req)
		prot |= IOMMU_FAULT_PERM_READ;
	if (req->wr_req)
		prot |= IOMMU_FAULT_PERM_WRITE;
	if (req->exe_req)
		prot |= IOMMU_FAULT_PERM_EXEC;
	if (req->pm_req)
		prot |= IOMMU_FAULT_PERM_PRIV;

	return prot;
}

static int
intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
{
	struct iommu_fault_event event;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	/* Fill in event data for device specific processing */
	memset(&event, 0, sizeof(struct iommu_fault_event));
	event.fault.type = IOMMU_FAULT_PAGE_REQ;
	event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
	event.fault.prm.pasid = desc->pasid;
	event.fault.prm.grpid = desc->prg_index;
	event.fault.prm.perm = prq_to_iommu_prot(desc);

	if (desc->lpig)
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
	if (desc->pasid_present) {
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
	}
	if (desc->priv_data_present) {
		/*
		 * Set the last-page-in-group bit if private data is present:
		 * a page response is then required, just as it is for LPIG.
		 * iommu_report_device_fault() doesn't understand this vendor
		 * specific requirement, so we set last_page as a workaround.
		 */
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
		memcpy(event.fault.prm.private_data, desc->priv_data,
		       sizeof(desc->priv_data));
	}

	return iommu_report_device_fault(dev, &event);
}
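
/*
 * Threaded handler for the page request queue interrupt. For each descriptor
 * between head and tail it validates the request, looks up the intel_svm and
 * intel_svm_dev for the PASID and requester ID, resolves the fault with
 * handle_mm_fault() (or forwards it via intel_svm_prq_report() for
 * guest-mode PASIDs), and sends a page group response when one is required.
 */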
static irqreturn_t prq_event_thread(int irq, void *d)
{
	struct intel_svm_dev *sdev = NULL;
	struct intel_iommu *iommu = d;
	struct intel_svm *svm = NULL;
	int head, tail, handled = 0;

	/* Clear PPR bit before reading head/tail registers, to
	 * ensure that we get a new interrupt if needed. */
	writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);

	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct vm_area_struct *vma;
		struct page_req_dsc *req;
		struct qi_desc resp;
		int result;
		vm_fault_t ret;
		u64 address;

		handled = 1;

		req = &iommu->prq[head / sizeof(*req)];

		result = QI_RESP_FAILURE;
		address = (u64)req->addr << VTD_PAGE_SHIFT;
		if (!req->pasid_present) {
			pr_err("%s: Page request without PASID: %08llx %08llx\n",
			       iommu->name, ((unsigned long long *)req)[0],
			       ((unsigned long long *)req)[1]);
			goto no_pasid;
		}
		/* We shall not receive page requests for supervisor SVM */
		if (req->pm_req && (req->rd_req | req->wr_req)) {
			pr_err("Unexpected page request in Privilege Mode\n");
			/* No need to find the matching sdev as for bad_req */
			goto no_pasid;
		}
		/* DMA read with exec request is not supported. */
		if (req->exe_req && req->rd_req) {
			pr_err("Execution request not supported\n");
			goto no_pasid;
		}
		if (!svm || svm->pasid != req->pasid) {
			rcu_read_lock();
			svm = ioasid_find(NULL, req->pasid, NULL);
			/* It *can't* go away, because the driver is not permitted
			 * to unbind the mm while any page faults are outstanding.
			 * So we only need RCU to protect the internal idr code. */
			rcu_read_unlock();
			if (IS_ERR_OR_NULL(svm)) {
				pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
				       iommu->name, req->pasid, ((unsigned long long *)req)[0],
				       ((unsigned long long *)req)[1]);
				goto no_pasid;
			}
		}

		if (!sdev || sdev->sid != req->rid) {
			struct intel_svm_dev *t;

			sdev = NULL;
			rcu_read_lock();
			list_for_each_entry_rcu(t, &svm->devs, list) {
				if (t->sid == req->rid) {
					sdev = t;
					break;
				}
			}
			rcu_read_unlock();
		}

		result = QI_RESP_INVALID;
		/* Since we're using init_mm.pgd directly, we should never take
		 * any faults on kernel addresses. */
		if (!svm->mm)
			goto bad_req;

		/* If address is not canonical, return invalid response */
		if (!is_canonical_address(address))
			goto bad_req;

		/*
		 * If the prq is to be handled outside the iommu driver via a
		 * receiver of the fault notifiers, we skip the page response
		 * here.
		 */
		if (svm->flags & SVM_FLAG_GUEST_MODE) {
			if (sdev && !intel_svm_prq_report(sdev->dev, req))
				goto prq_advance;
			else
				goto bad_req;
		}

		/* If the mm is already defunct, don't handle faults. */
		if (!mmget_not_zero(svm->mm))
			goto bad_req;

		mmap_read_lock(svm->mm);
		vma = find_extend_vma(svm->mm, address);
		if (!vma || address < vma->vm_start)
			goto invalid;

		if (access_error(vma, req))
			goto invalid;

		ret = handle_mm_fault(vma, address,
				      req->wr_req ? FAULT_FLAG_WRITE : 0,
				      NULL);
		if (ret & VM_FAULT_ERROR)
			goto invalid;

		result = QI_RESP_SUCCESS;
invalid:
		mmap_read_unlock(svm->mm);
		mmput(svm->mm);
bad_req:
		WARN_ON(!sdev);
		if (sdev && sdev->ops && sdev->ops->fault_cb) {
			int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
				(req->exe_req << 1) | (req->pm_req);
			sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
					    req->priv_data, rwxp, result);
		}
		/* We get here in the error case where the PASID lookup failed,
		 * and these can be NULL. Do not use them below this point! */
		sdev = NULL;
		svm = NULL;
no_pasid:
		if (req->lpig || req->priv_data_present) {
			/*
			 * Per VT-d spec. v3.0 ch7.7, system software must
			 * respond with page group response if private data
			 * is present (PDP) or last page in group (LPIG) bit
			 * is set. This is an additional VT-d feature beyond
			 * PCI ATS spec.
			 */
			resp.qw0 = QI_PGRP_PASID(req->pasid) |
				QI_PGRP_DID(req->rid) |
				QI_PGRP_PASID_P(req->pasid_present) |
				QI_PGRP_PDP(req->priv_data_present) |
				QI_PGRP_RESP_CODE(result) |
				QI_PGRP_RESP_TYPE;
			resp.qw1 = QI_PGRP_IDX(req->prg_index) |
				QI_PGRP_LPIG(req->lpig);
			resp.qw2 = 0;
			resp.qw3 = 0;

			if (req->priv_data_present)
				memcpy(&resp.qw2, req->priv_data,
				       sizeof(req->priv_data));
			qi_submit_sync(iommu, &resp, 1, 0);
		}
prq_advance:
		head = (head + sizeof(*req)) & PRQ_RING_MASK;
	}

	dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);

	/*
	 * Clear the page request overflow bit and wake up all threads that
	 * are waiting for the completion of this handling.
	 */
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n",
				    iommu->name);
		head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
		tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
		if (head == tail) {
			writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
			pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared\n",
					    iommu->name);
		}
	}

	if (!completion_done(&iommu->prq_complete))
		complete(&iommu->prq_complete);

	return IRQ_RETVAL(handled);
}

#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
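
/*
 * Entry points wired into the IOMMU ops (sva_bind/sva_unbind/sva_get_pasid),
 * normally reached through the generic SVA API rather than called directly.
 * A rough usage sketch from a device driver's point of view (the driver-side
 * variable names are illustrative only):
 *
 *	struct iommu_sva *handle;
 *	u32 pasid;
 *
 *	handle = iommu_sva_bind_device(dev, current->mm, NULL);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	pasid = iommu_sva_get_pasid(handle);
 *	... program pasid into the device and issue DMA on user addresses ...
 *	iommu_sva_unbind_device(handle);
 */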
struct iommu_sva *
intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
{
	struct iommu_sva *sva = ERR_PTR(-EINVAL);
	struct intel_svm_dev *sdev = NULL;
	unsigned int flags = 0;
	int ret;

	/*
	 * TODO: Consolidate with generic iommu-sva bind after it is merged.
	 * It will require shared SVM data structures, i.e. combine io_mm
	 * and intel_svm etc.
	 */
	if (drvdata)
		flags = *(unsigned int *)drvdata;
	mutex_lock(&pasid_mutex);
	ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
	if (ret)
		sva = ERR_PTR(ret);
	else if (sdev)
		sva = &sdev->sva;
	else
		WARN(!sdev, "SVM bind succeeded with no sdev!\n");

	mutex_unlock(&pasid_mutex);

	return sva;
}

void intel_svm_unbind(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	intel_svm_unbind_mm(sdev->dev, sdev->pasid);
	mutex_unlock(&pasid_mutex);
}

u32 intel_svm_get_pasid(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;
	u32 pasid;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	pasid = sdev->pasid;
	mutex_unlock(&pasid_mutex);

	return pasid;
}
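
/*
 * Deliver a page response from the fault consumer (e.g. a VMM that received
 * the fault through iommu_report_device_fault()) back to the hardware as a
 * page group response descriptor.
 */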
int intel_svm_page_response(struct device *dev,
			    struct iommu_fault_event *evt,
			    struct iommu_page_response *msg)
{
	struct iommu_fault_page_request *prm;
	struct intel_svm_dev *sdev = NULL;
	struct intel_svm *svm = NULL;
	struct intel_iommu *iommu;
	bool private_present;
	bool pasid_present;
	bool last_page;
	u8 bus, devfn;
	int ret = 0;
	u16 sid;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!msg || !evt)
		return -EINVAL;

	mutex_lock(&pasid_mutex);

	prm = &evt->fault.prm;
	sid = PCI_DEVID(bus, devfn);
	pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
	private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
	last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;

	if (!pasid_present) {
		ret = -EINVAL;
		goto out;
	}

	if (prm->pasid == 0 || prm->pasid >= PASID_MAX) {
		ret = -EINVAL;
		goto out;
	}

	ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);
	if (ret || !sdev) {
		ret = -ENODEV;
		goto out;
	}

	/*
	 * For responses from userspace, we need to make sure that the
	 * PASID has been bound to its mm.
	 */
	if (svm->flags & SVM_FLAG_GUEST_MODE) {
		struct mm_struct *mm;

		mm = get_task_mm(current);
		if (!mm) {
			ret = -EINVAL;
			goto out;
		}

		if (mm != svm->mm) {
			ret = -ENODEV;
			mmput(mm);
			goto out;
		}

		mmput(mm);
	}

	/*
	 * Per VT-d spec. v3.0 ch7.7, system software must respond
	 * with page group response if private data is present (PDP)
	 * or last page in group (LPIG) bit is set. This is an
	 * additional VT-d requirement beyond PCI ATS spec.
	 */
	if (last_page || private_present) {
		struct qi_desc desc;

		desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
				QI_PGRP_PASID_P(pasid_present) |
				QI_PGRP_PDP(private_present) |
				QI_PGRP_RESP_CODE(msg->code) |
				QI_PGRP_RESP_TYPE;
		desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
		desc.qw2 = 0;
		desc.qw3 = 0;
		if (private_present)
			memcpy(&desc.qw2, prm->private_data,
			       sizeof(prm->private_data));

		qi_submit_sync(iommu, &desc, 1, 0);
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}