// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)	"DMAR: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-map-ops.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/io.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <linux/numa.h>
#include <linux/swiotlb.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>
#include <trace/events/intel_iommu.h>

#include "../irq_remapping.h"
#include "pasid.h"

#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
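
/*
 * Illustrative note (not part of the original source): bit N set in this
 * bitmap advertises support for page size 2^N, so ~0xFFFUL sets bits 12
 * and above, i.e. 4KiB, 8KiB, 16KiB, ... every power-of-two multiple of
 * 4KiB, matching the "all page sizes that are an order of 4KiB" comment
 * above.  For example:
 *
 *	unsigned long pgsizes = INTEL_IOMMU_PGSIZES;
 *
 *	BUILD_BUG_ON(!(pgsizes & SZ_4K));	/- bit 12 set -/
 *	BUILD_BUG_ON(!(pgsizes & SZ_2M));	/- bit 21 set -/
 *	BUILD_BUG_ON(!(pgsizes & SZ_1G));	/- bit 30 set -/
 */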

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(u64 pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline u64 level_mask(int level)
{
	return -1ULL << level_to_offset_bits(level);
}

static inline u64 level_size(int level)
{
	return 1ULL << level_to_offset_bits(level);
}

static inline u64 align_to_level(u64 pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}

/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}

/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
static int intel_iommu_tboot_noforce;
static int no_platform_optin;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}

static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}
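
/*
 * Quick reference for the bit fields touched by the context-entry helpers
 * above and below (illustrative summary derived from this file, not a
 * restatement of the spec):
 *
 *	lo bit   0	present
 *	lo bit  11	PASID enable
 *	lo bits  3:2	translation type
 *	lo bits 63:12	address root (second-level table / PASID dir)
 *	hi bit   3	"copied" marker used when inheriting kdump tables
 *	hi bits  2:0	address width (AGAW)
 *	hi bits 23:8	domain id
 */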

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return((c->hi >> 8) & 0xffff);
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}

/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if (domain->iommu_refcnt[idx])

struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev);
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova);

#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
int intel_iommu_sm = 1;
#else
int intel_iommu_sm;
#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int intel_no_bounce;
static int iommu_skip_te_disable;

#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
struct device_domain_info *get_domain_info(struct device *dev)
{
	struct device_domain_info *info;

	if (!dev)
		return NULL;

	info = dev_iommu_priv_get(dev);
	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
		return NULL;

	return info;
}

DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&	\
				to_pci_dev(d)->untrusted)

/*
 * Iterate over elements in device_domain_list and call the specified
 * callback @fn against each element.
 */
int for_each_device_domain(int (*fn)(struct device_domain_info *info,
				     void *data), void *data)
{
	int ret = 0;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &device_domain_list, global) {
		ret = fn(info, data);
		if (ret) {
			spin_unlock_irqrestore(&device_domain_lock, flags);
			return ret;
		}
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}

const struct iommu_ops intel_iommu_ops;

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_info("Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			pr_info("Disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Intel-IOMMU: scalable mode supported\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		} else if (!strncmp(str, "nobounce", 8)) {
			pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
			intel_no_bounce = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
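
/*
 * Example usage (illustrative, not part of the original file): the parser
 * above takes a comma-separated list on the kernel command line, e.g.
 *
 *	intel_iommu=on,sm_on,strict
 *	intel_iommu=off
 *	intel_iommu=on,igfx_off,tboot_noforce
 *
 * Unrecognised tokens are silently skipped, because only the known
 * prefixes are matched before strcspn() advances to the next comma.
 */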

static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	domains = iommu->domains[idx];
	if (!domains)
		return NULL;

	return domains[did & 0xff];
}

static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
			     struct dmar_domain *domain)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	if (!iommu->domains[idx]) {
		size_t size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
	}

	domains = iommu->domains[idx];
	if (WARN_ON(!domains))
		return;
	else
		domains[did & 0xff] = domain;
}

void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline bool domain_use_first_level(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
				       unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}

/*
 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
 * the returned SAGAW.
 */
static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
{
	unsigned long fl_sagaw, sl_sagaw;

	fl_sagaw = BIT(2) | (cap_5lp_support(iommu->cap) ? BIT(3) : 0);
	sl_sagaw = cap_sagaw(iommu->cap);

	/* Second level only. */
	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
		return sl_sagaw;

	/* First level only. */
	if (!ecap_slts(iommu->ecap))
		return fl_sagaw;

	return fl_sagaw & sl_sagaw;
}

static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = __iommu_calculate_sagaw(iommu);
	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}
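
/*
 * Worked example (illustrative): with DEFAULT_DOMAIN_ADDRESS_WIDTH == 57,
 * width_to_agaw(57) == DIV_ROUND_UP(57 - 30, 9) == 3, i.e. a 5-level
 * table is tried first.  If the SAGAW bitmap only has bit 2 set (48-bit,
 * 4-level), the loop in __iommu_calculate_agaw() walks down and settles
 * on agaw == 2, for which agaw_to_width(2) == 48 and agaw_to_level(2) == 4.
 */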
610 * "SAGAW" may be different across iommus, use a default agaw, and 611 * get a supported less agaw for iommus that don't support the default agaw. 612 */ 613int iommu_calculate_agaw(struct intel_iommu *iommu) 614{ 615 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 616} 617 618/* This functionin only returns single iommu in a domain */ 619struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) 620{ 621 int iommu_id; 622 623 /* si_domain and vm domain should not get here. */ 624 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA)) 625 return NULL; 626 627 for_each_domain_iommu(iommu_id, domain) 628 break; 629 630 if (iommu_id < 0 || iommu_id >= g_num_of_iommus) 631 return NULL; 632 633 return g_iommus[iommu_id]; 634} 635 636static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 637{ 638 return sm_supported(iommu) ? 639 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 640} 641 642static void domain_update_iommu_coherency(struct dmar_domain *domain) 643{ 644 struct dmar_drhd_unit *drhd; 645 struct intel_iommu *iommu; 646 bool found = false; 647 int i; 648 649 domain->iommu_coherency = 1; 650 651 for_each_domain_iommu(i, domain) { 652 found = true; 653 if (!iommu_paging_structure_coherency(g_iommus[i])) { 654 domain->iommu_coherency = 0; 655 break; 656 } 657 } 658 if (found) 659 return; 660 661 /* No hardware attached; use lowest common denominator */ 662 rcu_read_lock(); 663 for_each_active_iommu(iommu, drhd) { 664 if (!iommu_paging_structure_coherency(iommu)) { 665 domain->iommu_coherency = 0; 666 break; 667 } 668 } 669 rcu_read_unlock(); 670} 671 672static int domain_update_iommu_snooping(struct intel_iommu *skip) 673{ 674 struct dmar_drhd_unit *drhd; 675 struct intel_iommu *iommu; 676 int ret = 1; 677 678 rcu_read_lock(); 679 for_each_active_iommu(iommu, drhd) { 680 if (iommu != skip) { 681 /* 682 * If the hardware is operating in the scalable mode, 683 * the snooping control is always supported since we 684 * always set PASID-table-entry.PGSNP bit if the domain 685 * is managed outside (UNMANAGED). 686 */ 687 if (!sm_supported(iommu) && 688 !ecap_sc_support(iommu->ecap)) { 689 ret = 0; 690 break; 691 } 692 } 693 } 694 rcu_read_unlock(); 695 696 return ret; 697} 698 699static int domain_update_iommu_superpage(struct dmar_domain *domain, 700 struct intel_iommu *skip) 701{ 702 struct dmar_drhd_unit *drhd; 703 struct intel_iommu *iommu; 704 int mask = 0x3; 705 706 if (!intel_iommu_superpage) { 707 return 0; 708 } 709 710 /* set iommu_superpage to the smallest common denominator */ 711 rcu_read_lock(); 712 for_each_active_iommu(iommu, drhd) { 713 if (iommu != skip) { 714 if (domain && domain_use_first_level(domain)) { 715 if (!cap_fl1gp_support(iommu->cap)) 716 mask = 0x1; 717 } else { 718 mask &= cap_super_page_val(iommu->cap); 719 } 720 721 if (!mask) 722 break; 723 } 724 } 725 rcu_read_unlock(); 726 727 return fls(mask); 728} 729 730static int domain_update_device_node(struct dmar_domain *domain) 731{ 732 struct device_domain_info *info; 733 int nid = NUMA_NO_NODE; 734 735 assert_spin_locked(&device_domain_lock); 736 737 if (list_empty(&domain->devices)) 738 return NUMA_NO_NODE; 739 740 list_for_each_entry(info, &domain->devices, link) { 741 if (!info->dev) 742 continue; 743 744 /* 745 * There could possibly be multiple device numa nodes as devices 746 * within the same domain may sit behind different IOMMUs. There 747 * isn't perfect answer in such situation, so we select first 748 * come first served policy. 
		nid = dev_to_node(info->dev);
		if (nid != NUMA_NO_NODE)
			break;
	}

	return nid;
}

/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);

	/*
	 * If RHSA is missing, we should default to the device numa domain
	 * as fall back.
	 */
	if (domain->nid == NUMA_NO_NODE)
		domain->nid = domain_update_device_node(domain);

	/*
	 * First-level translation restricts the input-address to a
	 * canonical address (i.e., address bits 63:N have the same
	 * value as address bit [N-1], where N is 48-bits with 4-level
	 * paging and 57-bits with 5-level paging). Hence, skip bit
	 * [N-1].
	 */
	if (domain_use_first_level(domain))
		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
	else
		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
}

struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;
	}
	if (*entry & 1)
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;
		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node);
		if (!context)
			return NULL;

		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}

static bool attach_deferred(struct device *dev)
{
	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
}

/**
 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 *				 sub-hierarchy of a candidate PCI-PCI bridge
 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 * @bridge: the candidate PCI-PCI bridge
 *
 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 */
static bool
is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
{
	struct pci_dev *pdev, *pbridge;

	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
		return false;

	pdev = to_pci_dev(dev);
	pbridge = to_pci_dev(bridge);

	if (pbridge->subordinate &&
	    pbridge->subordinate->number <= pdev->bus->number &&
	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
		return true;

	return false;
}
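
/*
 * Illustrative example: if @bridge forwards to secondary bus 3 with
 * subordinate (last) bus 7, then pbridge->subordinate->number == 3 and
 * pbridge->subordinate->busn_res.end == 7, so any @dev enumerated on
 * buses 3-7 (say 05:00.0) is reported as downstream of that bridge,
 * while a device on bus 2 is not.
 */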

static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return false;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
		return true;
	}

	return false;
}

static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
{
	if (!iommu || iommu->drhd->ignored)
		return true;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
		    quirk_ioat_snb_local_iommu(pdev))
			return true;
	}

	return false;
}

struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	iommu = NULL;
out:
	if (iommu_is_dummy(iommu, dev))
		iommu = NULL;

	rcu_read_unlock();

	return iommu;
}

static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}

static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct context_entry *context;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (context)
		ret = context_present(context);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}

static void free_context_table(struct intel_iommu *iommu)
{
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry) {
		goto out;
	}
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!sm_supported(iommu))
			continue;

		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);

	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}

static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain_use_first_level(domain)) {
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
				if (domain->domain.type == IOMMU_DOMAIN_DMA)
					pteval |= DMA_FL_PTE_ACCESS;
			}
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}
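
/*
 * Worked example (illustrative): with LEVEL_STRIDE == 9, each table level
 * consumes 9 bits of the dma pfn.  For dma pfn 0x12345 in a 4-level
 * (agaw 2) table:
 *
 *	pfn_level_offset(0x12345, 4) == (0x12345 >> 27) & 0x1ff == 0x000
 *	pfn_level_offset(0x12345, 3) == (0x12345 >> 18) & 0x1ff == 0x000
 *	pfn_level_offset(0x12345, 2) == (0x12345 >>  9) & 0x1ff == 0x091
 *	pfn_level_offset(0x12345, 1) == (0x12345 >>  0) & 0x1ff == 0x145
 *
 * which is the sequence of table indexes pfn_to_dma_pte() walks above.
 */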

/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (dma_pte_superpage(pte)) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}

/* clear last level pte, a tlb flush should be followed */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}

static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}

/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}

/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
					    int level, struct dma_pte *pte,
					    struct page *freelist)
{
	struct page *pg;

	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	pg->freelist = freelist;
	freelist = pg;

	if (level == 1)
		return freelist;

	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			freelist = dma_pte_list_pagetables(domain, level - 1,
							   pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));

	return freelist;
}

static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
					struct dma_pte *pte, unsigned long pfn,
					unsigned long start_pfn,
					unsigned long last_pfn,
					struct page *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;

		if (!dma_pte_present(pte))
			goto next;

		level_pfn = pfn & level_mask(level);

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			freelist = dma_pte_clear_level(domain, level - 1,
						       phys_to_virt(dma_pte_addr(pte)),
						       level_pfn, start_pfn, last_pfn,
						       freelist);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);

	return freelist;
}

/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static struct page *domain_unmap(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long last_pfn)
{
	struct page *freelist;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
				       domain->pgd, 0, start_pfn, last_pfn, NULL);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		pgd_page->freelist = freelist;
		freelist = pgd_page;

		domain->pgd = NULL;
	}

	return freelist;
}

static void dma_free_pagelist(struct page *freelist)
{
	struct page *pg;

	while ((pg = freelist)) {
		freelist = pg->freelist;
		free_pgtable_page(page_address(pg));
	}
}

static void iova_entry_free(unsigned long data)
{
	struct page *freelist = (struct page *)data;

	dma_free_pagelist(freelist);
}

/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
	if (!root) {
		pr_err("Allocating root entry for %s failed\n",
		       iommu->name);
		return -ENOMEM;
	}

	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	if (sm_supported(iommu))
		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}

void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* return value determine if we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		      dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* return value determine if we need a write buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		      dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}

static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
			u8 bus, u8 devfn)
{
	struct device_domain_info *info;

	assert_spin_locked(&device_domain_lock);

	if (!iommu->qi)
		return NULL;

	list_for_each_entry(info, &domain->devices, link)
		if (info->iommu == iommu && info->bus == bus &&
		    info->devfn == devfn) {
			if (info->ats_supported && info->dev)
				return info;
			break;
		}

	return NULL;
}

static void domain_update_iotlb(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	bool has_iotlb_device = false;

	assert_spin_locked(&device_domain_lock);

	list_for_each_entry(info, &domain->devices, link) {
		struct pci_dev *pdev;

		if (!info->dev || !dev_is_pci(info->dev))
			continue;

		pdev = to_pci_dev(info->dev);
		if (pdev->ats_enabled) {
			has_iotlb_device = true;
			break;
		}
	}

	domain->has_iotlb_device = has_iotlb_device;
}

static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!info || !dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);
	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
	 * reserved, which should be set to 0.
	 */
	if (!ecap_dit(info->iommu->ecap))
		info->pfsid = 0;
	else {
		struct pci_dev *pf_pdev;

		/* pdev will be returned if device is not a vf */
		pf_pdev = pci_physfn(pdev);
		info->pfsid = pci_dev_id(pf_pdev);
	}

#ifdef CONFIG_INTEL_IOMMU_SVM
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to
	   use it. */
	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
		info->pasid_enabled = 1;

	if (info->pri_supported &&
	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
		info->pri_enabled = 1;
#endif
	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
		info->ats_enabled = 1;
		domain_update_iotlb(info->domain);
		info->ats_qdep = pci_ats_queue_depth(pdev);
	}
}

static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);

	if (info->ats_enabled) {
		pci_disable_ats(pdev);
		info->ats_enabled = 0;
		domain_update_iotlb(info->domain);
	}
#ifdef CONFIG_INTEL_IOMMU_SVM
	if (info->pri_enabled) {
		pci_disable_pri(pdev);
		info->pri_enabled = 0;
	}
	if (info->pasid_enabled) {
		pci_disable_pasid(pdev);
		info->pasid_enabled = 0;
	}
#endif
}

static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	u16 sid, qdep;
	unsigned long flags;
	struct device_domain_info *info;

	if (!domain->has_iotlb_device)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		if (!info->ats_enabled)
			continue;

		sid = info->bus << 8 | info->devfn;
		qdep = info->ats_qdep;
		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
				   qdep, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}

static void domain_flush_piotlb(struct intel_iommu *iommu,
				struct dmar_domain *domain,
				u64 addr, unsigned long npages, bool ih)
{
	u16 did = domain->iommu_did[iommu->seq_id];

	if (domain->default_pasid)
		qi_flush_piotlb(iommu, did, domain->default_pasid,
				addr, npages, ih);

	if (!list_empty(&domain->devices))
		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
}
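
/*
 * Illustrative note on the source-id built in iommu_flush_dev_iotlb()
 * above: sid is the PCI requester id, bus number in bits 15:8 and devfn
 * in bits 7:0.  For a device at 3a:02.0, bus = 0x3a and
 * devfn = PCI_DEVFN(2, 0) = 0x10, so sid = 0x3a << 8 | 0x10 = 0x3a10.
 */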

static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
{
	unsigned int aligned_pages = __roundup_pow_of_two(pages);
	unsigned int mask = ilog2(aligned_pages);
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
	u16 did = domain->iommu_did[iommu->seq_id];

	BUG_ON(pages == 0);

	if (ih)
		ih = 1 << 6;

	if (domain_use_first_level(domain)) {
		domain_flush_piotlb(iommu, domain, addr, pages, ih);
	} else {
		unsigned long bitmask = aligned_pages - 1;

		/*
		 * PSI masks the low order bits of the base address. If the
		 * address isn't aligned to the mask, then compute a mask value
		 * needed to ensure the target range is flushed.
		 */
		if (unlikely(bitmask & pfn)) {
			unsigned long end_pfn = pfn + pages - 1, shared_bits;

			/*
			 * Since end_pfn <= pfn + bitmask, the only way bits
			 * higher than bitmask can differ in pfn and end_pfn is
			 * by carrying. This means after masking out bitmask,
			 * high bits starting with the first set bit in
			 * shared_bits are all equal in both pfn and end_pfn.
			 */
			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
		}

		/*
		 * Fallback to domain selective flush if no PSI support or
		 * the size is too big.
		 */
		if (!cap_pgsel_inv(iommu->cap) ||
		    mask > cap_max_amask_val(iommu->cap))
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		else
			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
						 DMA_TLB_PSI_FLUSH);
	}

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(domain, addr, mask);
}
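
/*
 * Worked example for the unaligned case above (illustrative): pfn = 0x11,
 * pages = 2.  Then aligned_pages = 2, mask = 1, bitmask = 0x1 and
 * pfn & bitmask != 0, so:
 *
 *	end_pfn       = 0x12
 *	pfn ^ end_pfn = 0x03
 *	shared_bits   = ~0x03 & ~0x01 = ...11111100
 *	mask          = __ffs(shared_bits) = 2
 *
 * A PSI with mask 2 invalidates the naturally aligned 4-page block
 * 0x10-0x13, which covers the requested pfns 0x11-0x12.
 */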

/* Notification for newly created mappings */
static inline void __mapping_notify_one(struct intel_iommu *iommu,
					struct dmar_domain *domain,
					unsigned long pfn, unsigned int pages)
{
	/*
	 * It's a non-present to present mapping. Only flush if caching mode
	 * and second level.
	 */
	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
	else
		iommu_flush_write_buffer(iommu);
}

static void iommu_flush_iova(struct iova_domain *iovad)
{
	struct dmar_domain *domain;
	int idx;

	domain = container_of(iovad, struct dmar_domain, iovad);

	for_each_domain_iommu(idx, domain) {
		struct intel_iommu *iommu = g_iommus[idx];
		u16 did = domain->iommu_did[iommu->seq_id];

		if (domain_use_first_level(domain))
			domain_flush_piotlb(iommu, domain, 0, -1, 0);
		else
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);

		if (!cap_caching_mode(iommu->cap))
			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
					      0, MAX_AGAW_PFN_WIDTH);
	}
}

static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		      readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

static int iommu_init_domains(struct intel_iommu *iommu)
{
	u32 ndomains, nlongs;
	size_t size;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("%s: Number of Domains supported <%d>\n",
		 iommu->name, ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	spin_lock_init(&iommu->lock);

	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		pr_err("%s: Allocating domain id array failed\n",
		       iommu->name);
		return -ENOMEM;
	}

	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
	iommu->domains = kzalloc(size, GFP_KERNEL);

	if (iommu->domains) {
		size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
	}

	if (!iommu->domains || !iommu->domains[0]) {
		pr_err("%s: Allocating domain array failed\n",
		       iommu->name);
		kfree(iommu->domain_ids);
		kfree(iommu->domains);
		iommu->domain_ids = NULL;
		iommu->domains = NULL;
		return -ENOMEM;
	}

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain-id 0, hence we need to pre-allocate it. We also
	 * use domain-id 0 as a marker for non-allocated domain-id, so
	 * make sure it is not used for a real domain.
	 */
	set_bit(0, iommu->domain_ids);

	/*
	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
	if (sm_supported(iommu))
		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);

	return 0;
}
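
/*
 * Illustrative note on the two-level iommu->domains layout set up above
 * and indexed by get_iommu_domain()/set_iommu_domain(): domain-id 0x1234
 * lives at iommu->domains[0x12][0x34], i.e. the top level holds
 * ALIGN(ndomains, 256) >> 8 chunk pointers and each chunk holds 256
 * dmar_domain pointers, allocated lazily except for chunk 0.
 */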

static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	if (!iommu->domains || !iommu->domain_ids)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
		if (info->iommu != iommu)
			continue;

		if (!info->dev || !info->domain)
			continue;

		__dmar_remove_one_dev_info(info);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}

static void free_dmar_iommu(struct intel_iommu *iommu)
{
	if ((iommu->domains) && (iommu->domain_ids)) {
		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
		int i;

		for (i = 0; i < elems; i++)
			kfree(iommu->domains[i]);
		kfree(iommu->domains);
		kfree(iommu->domain_ids);
		iommu->domains = NULL;
		iommu->domain_ids = NULL;
	}

	g_iommus[iommu->seq_id] = NULL;

	/* free context mapping */
	free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu)) {
		if (ecap_prs(iommu->ecap))
			intel_svm_finish_prq(iommu);
	}
	if (vccap_pasid(iommu->vccap))
		ioasid_unregister_allocator(&iommu->pasid_allocator);

#endif
}

/*
 * Check and return whether first level is used by default for
 * DMA translation.
 */
static bool first_level_by_default(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	static int first_level_support = -1;

	if (likely(first_level_support != -1))
		return first_level_support;

	first_level_support = 1;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
			first_level_support = 0;
			break;
		}
	}
	rcu_read_unlock();

	return first_level_support;
}

static struct dmar_domain *alloc_domain(int flags)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	memset(domain, 0, sizeof(*domain));
	domain->nid = NUMA_NO_NODE;
	domain->flags = flags;
	if (first_level_by_default())
		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
	domain->has_iotlb_device = false;
	INIT_LIST_HEAD(&domain->devices);

	return domain;
}

/* Must be called with iommu->lock */
static int domain_attach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	unsigned long ndomains;
	int num;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] += 1;
	domain->iommu_count += 1;
	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
		ndomains = cap_ndoms(iommu->cap);
		num = find_first_zero_bit(iommu->domain_ids, ndomains);

		if (num >= ndomains) {
			pr_err("%s: No free domain ids\n", iommu->name);
			domain->iommu_refcnt[iommu->seq_id] -= 1;
			domain->iommu_count -= 1;
			return -ENOSPC;
		}

		set_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, domain);

		domain->iommu_did[iommu->seq_id] = num;
		domain->nid = iommu->node;

		domain_update_iommu_cap(domain);
	}

	return 0;
}

static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num, count;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] -= 1;
	count = --domain->iommu_count;
	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
		num = domain->iommu_did[iommu->seq_id];
		clear_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, NULL);

		domain_update_iommu_cap(domain);
		domain->iommu_did[iommu->seq_id] = 0;
	}

	return count;
}

static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_rbtree_key;

static int dmar_init_reserved_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *iova;
	int i;

	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);

	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
			  &reserved_rbtree_key);

	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
			    IOVA_PFN(IOAPIC_RANGE_END));
	if (!iova) {
		pr_err("Reserve IOAPIC range failed\n");
		return -ENODEV;
	}

	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		struct resource *r;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
				continue;
			iova = reserve_iova(&reserved_iova_list,
reserve_iova(&reserved_iova_list, 2046 IOVA_PFN(r->start), 2047 IOVA_PFN(r->end)); 2048 if (!iova) { 2049 pci_err(pdev, "Reserve iova for %pR failed\n", r); 2050 return -ENODEV; 2051 } 2052 } 2053 } 2054 return 0; 2055} 2056 2057static inline int guestwidth_to_adjustwidth(int gaw) 2058{ 2059 int agaw; 2060 int r = (gaw - 12) % 9; 2061 2062 if (r == 0) 2063 agaw = gaw; 2064 else 2065 agaw = gaw + 9 - r; 2066 if (agaw > 64) 2067 agaw = 64; 2068 return agaw; 2069} 2070 2071static void domain_exit(struct dmar_domain *domain) 2072{ 2073 2074 /* Remove associated devices and clear attached or cached domains */ 2075 domain_remove_dev_info(domain); 2076 2077 /* destroy iovas */ 2078 if (domain->domain.type == IOMMU_DOMAIN_DMA) 2079 put_iova_domain(&domain->iovad); 2080 2081 if (domain->pgd) { 2082 struct page *freelist; 2083 2084 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); 2085 dma_free_pagelist(freelist); 2086 } 2087 2088 free_domain_mem(domain); 2089} 2090 2091/* 2092 * Get the PASID directory size for scalable mode context entry. 2093 * Value of X in the PDTS field of a scalable mode context entry 2094 * indicates PASID directory with 2^(X + 7) entries. 2095 */ 2096static inline unsigned long context_get_sm_pds(struct pasid_table *table) 2097{ 2098 int pds, max_pde; 2099 2100 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 2101 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS); 2102 if (pds < 7) 2103 return 0; 2104 2105 return pds - 7; 2106} 2107 2108/* 2109 * Set the RID_PASID field of a scalable mode context entry. The 2110 * IOMMU hardware will use the PASID value set in this field for 2111 * DMA translations of DMA requests without PASID. 2112 */ 2113static inline void 2114context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 2115{ 2116 context->hi |= pasid & ((1 << 20) - 1); 2117} 2118 2119/* 2120 * Set the DTE(Device-TLB Enable) field of a scalable mode context 2121 * entry. 2122 */ 2123static inline void context_set_sm_dte(struct context_entry *context) 2124{ 2125 context->lo |= (1 << 2); 2126} 2127 2128/* 2129 * Set the PRE(Page Request Enable) field of a scalable mode context 2130 * entry. 2131 */ 2132static inline void context_set_sm_pre(struct context_entry *context) 2133{ 2134 context->lo |= (1 << 4); 2135} 2136 2137/* Convert value to context PASID directory size field coding. 
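 *
 * Worked example (assuming PASID_PDE_SHIFT is 6): a PASID table
 * covering 2^20 PASIDs gives max_pde = 2^20 >> 6 = 2^14, so
 * context_get_sm_pds() returns 14 - 7 = 7 and context_pdts(7) encodes
 * a PASID directory of 2^(7 + 7) = 2^14 entries.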
*/ 2138#define context_pdts(pds) (((pds) & 0x7) << 9) 2139 2140static int domain_context_mapping_one(struct dmar_domain *domain, 2141 struct intel_iommu *iommu, 2142 struct pasid_table *table, 2143 u8 bus, u8 devfn) 2144{ 2145 u16 did = domain->iommu_did[iommu->seq_id]; 2146 int translation = CONTEXT_TT_MULTI_LEVEL; 2147 struct device_domain_info *info = NULL; 2148 struct context_entry *context; 2149 unsigned long flags; 2150 int ret; 2151 2152 WARN_ON(did == 0); 2153 2154 if (hw_pass_through && domain_type_is_si(domain)) 2155 translation = CONTEXT_TT_PASS_THROUGH; 2156 2157 pr_debug("Set context mapping for %02x:%02x.%d\n", 2158 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 2159 2160 BUG_ON(!domain->pgd); 2161 2162 spin_lock_irqsave(&device_domain_lock, flags); 2163 spin_lock(&iommu->lock); 2164 2165 ret = -ENOMEM; 2166 context = iommu_context_addr(iommu, bus, devfn, 1); 2167 if (!context) 2168 goto out_unlock; 2169 2170 ret = 0; 2171 if (context_present(context)) 2172 goto out_unlock; 2173 2174 /* 2175 * For kdump cases, old valid entries may be cached due to the 2176 * in-flight DMA and copied pgtable, but there is no unmapping 2177 * behaviour for them, thus we need an explicit cache flush for 2178 * the newly-mapped device. For kdump, at this point, the device 2179 * is supposed to finish reset at its driver probe stage, so no 2180 * in-flight DMA will exist, and we don't need to worry anymore 2181 * hereafter. 2182 */ 2183 if (context_copied(context)) { 2184 u16 did_old = context_domain_id(context); 2185 2186 if (did_old < cap_ndoms(iommu->cap)) { 2187 iommu->flush.flush_context(iommu, did_old, 2188 (((u16)bus) << 8) | devfn, 2189 DMA_CCMD_MASK_NOBIT, 2190 DMA_CCMD_DEVICE_INVL); 2191 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 2192 DMA_TLB_DSI_FLUSH); 2193 } 2194 } 2195 2196 context_clear_entry(context); 2197 2198 if (sm_supported(iommu)) { 2199 unsigned long pds; 2200 2201 WARN_ON(!table); 2202 2203 /* Setup the PASID DIR pointer: */ 2204 pds = context_get_sm_pds(table); 2205 context->lo = (u64)virt_to_phys(table->table) | 2206 context_pdts(pds); 2207 2208 /* Setup the RID_PASID field: */ 2209 context_set_sm_rid2pasid(context, PASID_RID2PASID); 2210 2211 /* 2212 * Setup the Device-TLB enable bit and Page request 2213 * Enable bit: 2214 */ 2215 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2216 if (info && info->ats_supported) 2217 context_set_sm_dte(context); 2218 if (info && info->pri_supported) 2219 context_set_sm_pre(context); 2220 } else { 2221 struct dma_pte *pgd = domain->pgd; 2222 int agaw; 2223 2224 context_set_domain_id(context, did); 2225 2226 if (translation != CONTEXT_TT_PASS_THROUGH) { 2227 /* 2228 * Skip top levels of page tables for iommu which has 2229 * less agaw than default. Unnecessary for PT mode. 2230 */ 2231 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2232 ret = -ENOMEM; 2233 pgd = phys_to_virt(dma_pte_addr(pgd)); 2234 if (!dma_pte_present(pgd)) 2235 goto out_unlock; 2236 } 2237 2238 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2239 if (info && info->ats_supported) 2240 translation = CONTEXT_TT_DEV_IOTLB; 2241 else 2242 translation = CONTEXT_TT_MULTI_LEVEL; 2243 2244 context_set_address_root(context, virt_to_phys(pgd)); 2245 context_set_address_width(context, agaw); 2246 } else { 2247 /* 2248 * In pass through mode, AW must be programmed to 2249 * indicate the largest AGAW value supported by 2250 * hardware. And ASR is ignored by hardware. 
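 *
 * (For reference, agaw_to_width() maps this encoding back to bits as
 * width = 30 + 9 * agaw, so e.g. an msagaw of 2 would correspond to
 * 48-bit, 4-level tables.)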
2251 */ 2252 context_set_address_width(context, iommu->msagaw); 2253 } 2254 2255 context_set_translation_type(context, translation); 2256 } 2257 2258 context_set_fault_enable(context); 2259 context_set_present(context); 2260 if (!ecap_coherent(iommu->ecap)) 2261 clflush_cache_range(context, sizeof(*context)); 2262 2263 /* 2264 * It's a non-present to present mapping. If hardware doesn't cache 2265 * non-present entry we only need to flush the write-buffer. If the 2266 * _does_ cache non-present entries, then it does so in the special 2267 * domain #0, which we have to flush: 2268 */ 2269 if (cap_caching_mode(iommu->cap)) { 2270 iommu->flush.flush_context(iommu, 0, 2271 (((u16)bus) << 8) | devfn, 2272 DMA_CCMD_MASK_NOBIT, 2273 DMA_CCMD_DEVICE_INVL); 2274 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2275 } else { 2276 iommu_flush_write_buffer(iommu); 2277 } 2278 iommu_enable_dev_iotlb(info); 2279 2280 ret = 0; 2281 2282out_unlock: 2283 spin_unlock(&iommu->lock); 2284 spin_unlock_irqrestore(&device_domain_lock, flags); 2285 2286 return ret; 2287} 2288 2289struct domain_context_mapping_data { 2290 struct dmar_domain *domain; 2291 struct intel_iommu *iommu; 2292 struct pasid_table *table; 2293}; 2294 2295static int domain_context_mapping_cb(struct pci_dev *pdev, 2296 u16 alias, void *opaque) 2297{ 2298 struct domain_context_mapping_data *data = opaque; 2299 2300 return domain_context_mapping_one(data->domain, data->iommu, 2301 data->table, PCI_BUS_NUM(alias), 2302 alias & 0xff); 2303} 2304 2305static int 2306domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2307{ 2308 struct domain_context_mapping_data data; 2309 struct pasid_table *table; 2310 struct intel_iommu *iommu; 2311 u8 bus, devfn; 2312 2313 iommu = device_to_iommu(dev, &bus, &devfn); 2314 if (!iommu) 2315 return -ENODEV; 2316 2317 table = intel_pasid_get_table(dev); 2318 2319 if (!dev_is_pci(dev)) 2320 return domain_context_mapping_one(domain, iommu, table, 2321 bus, devfn); 2322 2323 data.domain = domain; 2324 data.iommu = iommu; 2325 data.table = table; 2326 2327 return pci_for_each_dma_alias(to_pci_dev(dev), 2328 &domain_context_mapping_cb, &data); 2329} 2330 2331static int domain_context_mapped_cb(struct pci_dev *pdev, 2332 u16 alias, void *opaque) 2333{ 2334 struct intel_iommu *iommu = opaque; 2335 2336 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff); 2337} 2338 2339static int domain_context_mapped(struct device *dev) 2340{ 2341 struct intel_iommu *iommu; 2342 u8 bus, devfn; 2343 2344 iommu = device_to_iommu(dev, &bus, &devfn); 2345 if (!iommu) 2346 return -ENODEV; 2347 2348 if (!dev_is_pci(dev)) 2349 return device_context_mapped(iommu, bus, devfn); 2350 2351 return !pci_for_each_dma_alias(to_pci_dev(dev), 2352 domain_context_mapped_cb, iommu); 2353} 2354 2355/* Returns a number of VTD pages, but aligned to MM page size */ 2356static inline unsigned long aligned_nrpages(unsigned long host_addr, 2357 size_t size) 2358{ 2359 host_addr &= ~PAGE_MASK; 2360 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2361} 2362 2363/* Return largest possible superpage level for a given mapping */ 2364static inline int hardware_largepage_caps(struct dmar_domain *domain, 2365 unsigned long iov_pfn, 2366 unsigned long phy_pfn, 2367 unsigned long pages) 2368{ 2369 int support, level = 1; 2370 unsigned long pfnmerge; 2371 2372 support = domain->iommu_superpage; 2373 2374 /* To use a large page, the virtual *and* physical addresses 2375 must be aligned to 2MiB/1GiB/etc. 
Lower bits set in either 2376 of them will mean we have to use smaller pages. So just 2377 merge them and check both at once. */ 2378 pfnmerge = iov_pfn | phy_pfn; 2379 2380 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2381 pages >>= VTD_STRIDE_SHIFT; 2382 if (!pages) 2383 break; 2384 pfnmerge >>= VTD_STRIDE_SHIFT; 2385 level++; 2386 support--; 2387 } 2388 return level; 2389} 2390 2391static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2392 struct scatterlist *sg, unsigned long phys_pfn, 2393 unsigned long nr_pages, int prot) 2394{ 2395 struct dma_pte *first_pte = NULL, *pte = NULL; 2396 phys_addr_t pteval; 2397 unsigned long sg_res = 0; 2398 unsigned int largepage_lvl = 0; 2399 unsigned long lvl_pages = 0; 2400 u64 attr; 2401 2402 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2403 2404 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2405 return -EINVAL; 2406 2407 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2408 attr |= DMA_FL_PTE_PRESENT; 2409 if (domain_use_first_level(domain)) { 2410 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US; 2411 2412 if (domain->domain.type == IOMMU_DOMAIN_DMA) { 2413 attr |= DMA_FL_PTE_ACCESS; 2414 if (prot & DMA_PTE_WRITE) 2415 attr |= DMA_FL_PTE_DIRTY; 2416 } 2417 } 2418 2419 if (!sg) { 2420 sg_res = nr_pages; 2421 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2422 } 2423 2424 while (nr_pages > 0) { 2425 uint64_t tmp; 2426 2427 if (!sg_res) { 2428 unsigned int pgoff = sg->offset & ~PAGE_MASK; 2429 2430 sg_res = aligned_nrpages(sg->offset, sg->length); 2431 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff; 2432 sg->dma_length = sg->length; 2433 pteval = (sg_phys(sg) - pgoff) | attr; 2434 phys_pfn = pteval >> VTD_PAGE_SHIFT; 2435 } 2436 2437 if (!pte) { 2438 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res); 2439 2440 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2441 if (!pte) 2442 return -ENOMEM; 2443 /* It is large page*/ 2444 if (largepage_lvl > 1) { 2445 unsigned long nr_superpages, end_pfn; 2446 2447 pteval |= DMA_PTE_LARGE_PAGE; 2448 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2449 2450 nr_superpages = sg_res / lvl_pages; 2451 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1; 2452 2453 /* 2454 * Ensure that old small page tables are 2455 * removed to make room for superpage(s). 2456 * We're adding new large pages, so make sure 2457 * we don't remove their parent tables. 2458 */ 2459 dma_pte_free_pagetable(domain, iov_pfn, end_pfn, 2460 largepage_lvl + 1); 2461 } else { 2462 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2463 } 2464 2465 } 2466 /* We don't need lock here, nobody else 2467 * touches the iova range 2468 */ 2469 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2470 if (tmp) { 2471 static int dumps = 5; 2472 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2473 iov_pfn, tmp, (unsigned long long)pteval); 2474 if (dumps) { 2475 dumps--; 2476 debug_dma_dump_mappings(NULL); 2477 } 2478 WARN_ON(1); 2479 } 2480 2481 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2482 2483 BUG_ON(nr_pages < lvl_pages); 2484 BUG_ON(sg_res < lvl_pages); 2485 2486 nr_pages -= lvl_pages; 2487 iov_pfn += lvl_pages; 2488 phys_pfn += lvl_pages; 2489 pteval += lvl_pages * VTD_PAGE_SIZE; 2490 sg_res -= lvl_pages; 2491 2492 /* If the next PTE would be the first in a new page, then we 2493 need to flush the cache on the entries we've just written. 
2494 And then we'll need to recalculate 'pte', so clear it and 2495 let it get set again in the if (!pte) block above. 2496 2497 If we're done (!nr_pages) we need to flush the cache too. 2498 2499 Also if we've been setting superpages, we may need to 2500 recalculate 'pte' and switch back to smaller pages for the 2501 end of the mapping, if the trailing size is not enough to 2502 use another superpage (i.e. sg_res < lvl_pages). */ 2503 pte++; 2504 if (!nr_pages || first_pte_in_page(pte) || 2505 (largepage_lvl > 1 && sg_res < lvl_pages)) { 2506 domain_flush_cache(domain, first_pte, 2507 (void *)pte - (void *)first_pte); 2508 pte = NULL; 2509 } 2510 2511 if (!sg_res && nr_pages) 2512 sg = sg_next(sg); 2513 } 2514 return 0; 2515} 2516 2517static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2518 struct scatterlist *sg, unsigned long phys_pfn, 2519 unsigned long nr_pages, int prot) 2520{ 2521 int iommu_id, ret; 2522 struct intel_iommu *iommu; 2523 2524 /* Do the real mapping first */ 2525 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot); 2526 if (ret) 2527 return ret; 2528 2529 for_each_domain_iommu(iommu_id, domain) { 2530 iommu = g_iommus[iommu_id]; 2531 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages); 2532 } 2533 2534 return 0; 2535} 2536 2537static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2538 struct scatterlist *sg, unsigned long nr_pages, 2539 int prot) 2540{ 2541 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot); 2542} 2543 2544static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2545 unsigned long phys_pfn, unsigned long nr_pages, 2546 int prot) 2547{ 2548 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot); 2549} 2550 2551static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn) 2552{ 2553 unsigned long flags; 2554 struct context_entry *context; 2555 u16 did_old; 2556 2557 if (!iommu) 2558 return; 2559 2560 spin_lock_irqsave(&iommu->lock, flags); 2561 context = iommu_context_addr(iommu, bus, devfn, 0); 2562 if (!context) { 2563 spin_unlock_irqrestore(&iommu->lock, flags); 2564 return; 2565 } 2566 did_old = context_domain_id(context); 2567 context_clear_entry(context); 2568 __iommu_flush_cache(iommu, context, sizeof(*context)); 2569 spin_unlock_irqrestore(&iommu->lock, flags); 2570 iommu->flush.flush_context(iommu, 2571 did_old, 2572 (((u16)bus) << 8) | devfn, 2573 DMA_CCMD_MASK_NOBIT, 2574 DMA_CCMD_DEVICE_INVL); 2575 2576 if (sm_supported(iommu)) 2577 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2578 2579 iommu->flush.flush_iotlb(iommu, 2580 did_old, 2581 0, 2582 0, 2583 DMA_TLB_DSI_FLUSH); 2584} 2585 2586static inline void unlink_domain_info(struct device_domain_info *info) 2587{ 2588 assert_spin_locked(&device_domain_lock); 2589 list_del(&info->link); 2590 list_del(&info->global); 2591 if (info->dev) 2592 dev_iommu_priv_set(info->dev, NULL); 2593} 2594 2595static void domain_remove_dev_info(struct dmar_domain *domain) 2596{ 2597 struct device_domain_info *info, *tmp; 2598 unsigned long flags; 2599 2600 spin_lock_irqsave(&device_domain_lock, flags); 2601 list_for_each_entry_safe(info, tmp, &domain->devices, link) 2602 __dmar_remove_one_dev_info(info); 2603 spin_unlock_irqrestore(&device_domain_lock, flags); 2604} 2605 2606struct dmar_domain *find_domain(struct device *dev) 2607{ 2608 struct device_domain_info *info; 2609 2610 if (unlikely(!dev || !dev->iommu)) 2611 return NULL; 2612 2613 if 
(unlikely(attach_deferred(dev))) 2614 return NULL; 2615 2616 /* No lock here, assumes no domain exit in normal case */ 2617 info = get_domain_info(dev); 2618 if (likely(info)) 2619 return info->domain; 2620 2621 return NULL; 2622} 2623 2624static void do_deferred_attach(struct device *dev) 2625{ 2626 struct iommu_domain *domain; 2627 2628 dev_iommu_priv_set(dev, NULL); 2629 domain = iommu_get_domain_for_dev(dev); 2630 if (domain) 2631 intel_iommu_attach_device(domain, dev); 2632} 2633 2634static inline struct device_domain_info * 2635dmar_search_domain_by_dev_info(int segment, int bus, int devfn) 2636{ 2637 struct device_domain_info *info; 2638 2639 list_for_each_entry(info, &device_domain_list, global) 2640 if (info->segment == segment && info->bus == bus && 2641 info->devfn == devfn) 2642 return info; 2643 2644 return NULL; 2645} 2646 2647static int domain_setup_first_level(struct intel_iommu *iommu, 2648 struct dmar_domain *domain, 2649 struct device *dev, 2650 u32 pasid) 2651{ 2652 struct dma_pte *pgd = domain->pgd; 2653 int agaw, level; 2654 int flags = 0; 2655 2656 /* 2657 * Skip top levels of page tables for iommu which has 2658 * less agaw than default. Unnecessary for PT mode. 2659 */ 2660 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2661 pgd = phys_to_virt(dma_pte_addr(pgd)); 2662 if (!dma_pte_present(pgd)) 2663 return -ENOMEM; 2664 } 2665 2666 level = agaw_to_level(agaw); 2667 if (level != 4 && level != 5) 2668 return -EINVAL; 2669 2670 if (pasid != PASID_RID2PASID) 2671 flags |= PASID_FLAG_SUPERVISOR_MODE; 2672 if (level == 5) 2673 flags |= PASID_FLAG_FL5LP; 2674 2675 if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED) 2676 flags |= PASID_FLAG_PAGE_SNOOP; 2677 2678 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2679 domain->iommu_did[iommu->seq_id], 2680 flags); 2681} 2682 2683static bool dev_is_real_dma_subdevice(struct device *dev) 2684{ 2685 return dev && dev_is_pci(dev) && 2686 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2687} 2688 2689static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, 2690 int bus, int devfn, 2691 struct device *dev, 2692 struct dmar_domain *domain) 2693{ 2694 struct dmar_domain *found = NULL; 2695 struct device_domain_info *info; 2696 unsigned long flags; 2697 int ret; 2698 2699 info = alloc_devinfo_mem(); 2700 if (!info) 2701 return NULL; 2702 2703 if (!dev_is_real_dma_subdevice(dev)) { 2704 info->bus = bus; 2705 info->devfn = devfn; 2706 info->segment = iommu->segment; 2707 } else { 2708 struct pci_dev *pdev = to_pci_dev(dev); 2709 2710 info->bus = pdev->bus->number; 2711 info->devfn = pdev->devfn; 2712 info->segment = pci_domain_nr(pdev->bus); 2713 } 2714 2715 info->ats_supported = info->pasid_supported = info->pri_supported = 0; 2716 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; 2717 info->ats_qdep = 0; 2718 info->dev = dev; 2719 info->domain = domain; 2720 info->iommu = iommu; 2721 info->pasid_table = NULL; 2722 info->auxd_enabled = 0; 2723 INIT_LIST_HEAD(&info->auxiliary_domains); 2724 2725 if (dev && dev_is_pci(dev)) { 2726 struct pci_dev *pdev = to_pci_dev(info->dev); 2727 2728 if (ecap_dev_iotlb_support(iommu->ecap) && 2729 pci_ats_supported(pdev) && 2730 dmar_find_matched_atsr_unit(pdev)) 2731 info->ats_supported = 1; 2732 2733 if (sm_supported(iommu)) { 2734 if (pasid_supported(iommu)) { 2735 int features = pci_pasid_features(pdev); 2736 if (features >= 0) 2737 info->pasid_supported = features | 1; 2738 } 2739 2740 if (info->ats_supported && 
ecap_prs(iommu->ecap) && 2741 pci_pri_supported(pdev)) 2742 info->pri_supported = 1; 2743 } 2744 } 2745 2746 spin_lock_irqsave(&device_domain_lock, flags); 2747 if (dev) 2748 found = find_domain(dev); 2749 2750 if (!found) { 2751 struct device_domain_info *info2; 2752 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, 2753 info->devfn); 2754 if (info2) { 2755 found = info2->domain; 2756 info2->dev = dev; 2757 } 2758 } 2759 2760 if (found) { 2761 spin_unlock_irqrestore(&device_domain_lock, flags); 2762 free_devinfo_mem(info); 2763 /* Caller must free the original domain */ 2764 return found; 2765 } 2766 2767 spin_lock(&iommu->lock); 2768 ret = domain_attach_iommu(domain, iommu); 2769 spin_unlock(&iommu->lock); 2770 2771 if (ret) { 2772 spin_unlock_irqrestore(&device_domain_lock, flags); 2773 free_devinfo_mem(info); 2774 return NULL; 2775 } 2776 2777 list_add(&info->link, &domain->devices); 2778 list_add(&info->global, &device_domain_list); 2779 if (dev) 2780 dev_iommu_priv_set(dev, info); 2781 spin_unlock_irqrestore(&device_domain_lock, flags); 2782 2783 /* PASID table is mandatory for a PCI device in scalable mode. */ 2784 if (dev && dev_is_pci(dev) && sm_supported(iommu)) { 2785 ret = intel_pasid_alloc_table(dev); 2786 if (ret) { 2787 dev_err(dev, "PASID table allocation failed\n"); 2788 dmar_remove_one_dev_info(dev); 2789 return NULL; 2790 } 2791 2792 /* Setup the PASID entry for requests without PASID: */ 2793 spin_lock_irqsave(&iommu->lock, flags); 2794 if (hw_pass_through && domain_type_is_si(domain)) 2795 ret = intel_pasid_setup_pass_through(iommu, domain, 2796 dev, PASID_RID2PASID); 2797 else if (domain_use_first_level(domain)) 2798 ret = domain_setup_first_level(iommu, domain, dev, 2799 PASID_RID2PASID); 2800 else 2801 ret = intel_pasid_setup_second_level(iommu, domain, 2802 dev, PASID_RID2PASID); 2803 spin_unlock_irqrestore(&iommu->lock, flags); 2804 if (ret) { 2805 dev_err(dev, "Setup RID2PASID failed\n"); 2806 dmar_remove_one_dev_info(dev); 2807 return NULL; 2808 } 2809 } 2810 2811 if (dev && domain_context_mapping(domain, dev)) { 2812 dev_err(dev, "Domain context map failed\n"); 2813 dmar_remove_one_dev_info(dev); 2814 return NULL; 2815 } 2816 2817 return domain; 2818} 2819 2820static int iommu_domain_identity_map(struct dmar_domain *domain, 2821 unsigned long first_vpfn, 2822 unsigned long last_vpfn) 2823{ 2824 /* 2825 * RMRR range might have overlap with physical memory range, 2826 * clear it first 2827 */ 2828 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2829 2830 return __domain_mapping(domain, first_vpfn, NULL, 2831 first_vpfn, last_vpfn - first_vpfn + 1, 2832 DMA_PTE_READ|DMA_PTE_WRITE); 2833} 2834 2835static int md_domain_init(struct dmar_domain *domain, int guest_width); 2836 2837static int __init si_domain_init(int hw) 2838{ 2839 struct dmar_rmrr_unit *rmrr; 2840 struct device *dev; 2841 int i, nid, ret; 2842 2843 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY); 2844 if (!si_domain) 2845 return -EFAULT; 2846 2847 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2848 domain_exit(si_domain); 2849 si_domain = NULL; 2850 return -EFAULT; 2851 } 2852 2853 if (hw) 2854 return 0; 2855 2856 for_each_online_node(nid) { 2857 unsigned long start_pfn, end_pfn; 2858 int i; 2859 2860 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2861 ret = iommu_domain_identity_map(si_domain, 2862 mm_to_dma_pfn(start_pfn), 2863 mm_to_dma_pfn(end_pfn)); 2864 if (ret) 2865 return ret; 2866 } 2867 } 2868 2869 /* 2870 * Identity map the RMRRs so 
that devices with RMRRs could also use 2871 * the si_domain. 2872 */ 2873 for_each_rmrr_units(rmrr) { 2874 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2875 i, dev) { 2876 unsigned long long start = rmrr->base_address; 2877 unsigned long long end = rmrr->end_address; 2878 2879 if (WARN_ON(end < start || 2880 end >> agaw_to_width(si_domain->agaw))) 2881 continue; 2882 2883 ret = iommu_domain_identity_map(si_domain, 2884 mm_to_dma_pfn(start >> PAGE_SHIFT), 2885 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2886 if (ret) 2887 return ret; 2888 } 2889 } 2890 2891 return 0; 2892} 2893 2894static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) 2895{ 2896 struct dmar_domain *ndomain; 2897 struct intel_iommu *iommu; 2898 u8 bus, devfn; 2899 2900 iommu = device_to_iommu(dev, &bus, &devfn); 2901 if (!iommu) 2902 return -ENODEV; 2903 2904 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); 2905 if (ndomain != domain) 2906 return -EBUSY; 2907 2908 return 0; 2909} 2910 2911static bool device_has_rmrr(struct device *dev) 2912{ 2913 struct dmar_rmrr_unit *rmrr; 2914 struct device *tmp; 2915 int i; 2916 2917 rcu_read_lock(); 2918 for_each_rmrr_units(rmrr) { 2919 /* 2920 * Return TRUE if this RMRR contains the device that 2921 * is passed in. 2922 */ 2923 for_each_active_dev_scope(rmrr->devices, 2924 rmrr->devices_cnt, i, tmp) 2925 if (tmp == dev || 2926 is_downstream_to_pci_bridge(dev, tmp)) { 2927 rcu_read_unlock(); 2928 return true; 2929 } 2930 } 2931 rcu_read_unlock(); 2932 return false; 2933} 2934 2935/** 2936 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2937 * is relaxable (ie. is allowed to be not enforced under some conditions) 2938 * @dev: device handle 2939 * 2940 * We assume that PCI USB devices with RMRRs have them largely 2941 * for historical reasons and that the RMRR space is not actively used post 2942 * boot. This exclusion may change if vendors begin to abuse it. 2943 * 2944 * The same exception is made for graphics devices, with the requirement that 2945 * any use of the RMRR regions will be torn down before assigning the device 2946 * to a guest. 2947 * 2948 * Return: true if the RMRR is relaxable, false otherwise 2949 */ 2950static bool device_rmrr_is_relaxable(struct device *dev) 2951{ 2952 struct pci_dev *pdev; 2953 2954 if (!dev_is_pci(dev)) 2955 return false; 2956 2957 pdev = to_pci_dev(dev); 2958 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2959 return true; 2960 else 2961 return false; 2962} 2963 2964/* 2965 * There are a couple cases where we need to restrict the functionality of 2966 * devices associated with RMRRs. The first is when evaluating a device for 2967 * identity mapping because problems exist when devices are moved in and out 2968 * of domains and their respective RMRR information is lost. This means that 2969 * a device with associated RMRRs will never be in a "passthrough" domain. 2970 * The second is use of the device through the IOMMU API. This interface 2971 * expects to have full control of the IOVA space for the device. We cannot 2972 * satisfy both the requirement that RMRR access is maintained and have an 2973 * unencumbered IOVA space. We also have no ability to quiesce the device's 2974 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2975 * We therefore prevent devices associated with an RMRR from participating in 2976 * the IOMMU API, which eliminates them from device assignment. 
2977 * 2978 * In both cases, devices which have relaxable RMRRs are not concerned by this 2979 * restriction. See device_rmrr_is_relaxable comment. 2980 */ 2981static bool device_is_rmrr_locked(struct device *dev) 2982{ 2983 if (!device_has_rmrr(dev)) 2984 return false; 2985 2986 if (device_rmrr_is_relaxable(dev)) 2987 return false; 2988 2989 return true; 2990} 2991 2992/* 2993 * Return the required default domain type for a specific device. 2994 * 2995 * @dev: the device in query 2996 * @startup: true if this is during early boot 2997 * 2998 * Returns: 2999 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 3000 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 3001 * - 0: both identity and dynamic domains work for this device 3002 */ 3003static int device_def_domain_type(struct device *dev) 3004{ 3005 if (dev_is_pci(dev)) { 3006 struct pci_dev *pdev = to_pci_dev(dev); 3007 3008 /* 3009 * Prevent any device marked as untrusted from getting 3010 * placed into the statically identity mapping domain. 3011 */ 3012 if (pdev->untrusted) 3013 return IOMMU_DOMAIN_DMA; 3014 3015 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 3016 return IOMMU_DOMAIN_IDENTITY; 3017 3018 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 3019 return IOMMU_DOMAIN_IDENTITY; 3020 } 3021 3022 return 0; 3023} 3024 3025static void intel_iommu_init_qi(struct intel_iommu *iommu) 3026{ 3027 /* 3028 * Start from the sane iommu hardware state. 3029 * If the queued invalidation is already initialized by us 3030 * (for example, while enabling interrupt-remapping) then 3031 * we got the things already rolling from a sane state. 3032 */ 3033 if (!iommu->qi) { 3034 /* 3035 * Clear any previous faults. 3036 */ 3037 dmar_fault(-1, iommu); 3038 /* 3039 * Disable queued invalidation if supported and already enabled 3040 * before OS handover. 3041 */ 3042 dmar_disable_qi(iommu); 3043 } 3044 3045 if (dmar_enable_qi(iommu)) { 3046 /* 3047 * Queued Invalidate not enabled, use Register Based Invalidate 3048 */ 3049 iommu->flush.flush_context = __iommu_flush_context; 3050 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 3051 pr_info("%s: Using Register based invalidation\n", 3052 iommu->name); 3053 } else { 3054 iommu->flush.flush_context = qi_flush_context; 3055 iommu->flush.flush_iotlb = qi_flush_iotlb; 3056 pr_info("%s: Using Queued invalidation\n", iommu->name); 3057 } 3058} 3059 3060static int copy_context_table(struct intel_iommu *iommu, 3061 struct root_entry *old_re, 3062 struct context_entry **tbl, 3063 int bus, bool ext) 3064{ 3065 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 3066 struct context_entry *new_ce = NULL, ce; 3067 struct context_entry *old_ce = NULL; 3068 struct root_entry re; 3069 phys_addr_t old_ce_phys; 3070 3071 tbl_idx = ext ? bus * 2 : bus; 3072 memcpy(&re, old_re, sizeof(re)); 3073 3074 for (devfn = 0; devfn < 256; devfn++) { 3075 /* First calculate the correct index */ 3076 idx = (ext ? 
devfn * 2 : devfn) % 256; 3077 3078 if (idx == 0) { 3079 /* First save what we may have and clean up */ 3080 if (new_ce) { 3081 tbl[tbl_idx] = new_ce; 3082 __iommu_flush_cache(iommu, new_ce, 3083 VTD_PAGE_SIZE); 3084 pos = 1; 3085 } 3086 3087 if (old_ce) 3088 memunmap(old_ce); 3089 3090 ret = 0; 3091 if (devfn < 0x80) 3092 old_ce_phys = root_entry_lctp(&re); 3093 else 3094 old_ce_phys = root_entry_uctp(&re); 3095 3096 if (!old_ce_phys) { 3097 if (ext && devfn == 0) { 3098 /* No LCTP, try UCTP */ 3099 devfn = 0x7f; 3100 continue; 3101 } else { 3102 goto out; 3103 } 3104 } 3105 3106 ret = -ENOMEM; 3107 old_ce = memremap(old_ce_phys, PAGE_SIZE, 3108 MEMREMAP_WB); 3109 if (!old_ce) 3110 goto out; 3111 3112 new_ce = alloc_pgtable_page(iommu->node); 3113 if (!new_ce) 3114 goto out_unmap; 3115 3116 ret = 0; 3117 } 3118 3119 /* Now copy the context entry */ 3120 memcpy(&ce, old_ce + idx, sizeof(ce)); 3121 3122 if (!__context_present(&ce)) 3123 continue; 3124 3125 did = context_domain_id(&ce); 3126 if (did >= 0 && did < cap_ndoms(iommu->cap)) 3127 set_bit(did, iommu->domain_ids); 3128 3129 /* 3130 * We need a marker for copied context entries. This 3131 * marker needs to work for the old format as well as 3132 * for extended context entries. 3133 * 3134 * Bit 67 of the context entry is used. In the old 3135 * format this bit is available to software, in the 3136 * extended format it is the PGE bit, but PGE is ignored 3137 * by HW if PASIDs are disabled (and thus still 3138 * available). 3139 * 3140 * So disable PASIDs first and then mark the entry 3141 * copied. This means that we don't copy PASID 3142 * translations from the old kernel, but this is fine as 3143 * faults there are not fatal. 3144 */ 3145 context_clear_pasid_enable(&ce); 3146 context_set_copied(&ce); 3147 3148 new_ce[idx] = ce; 3149 } 3150 3151 tbl[tbl_idx + pos] = new_ce; 3152 3153 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 3154 3155out_unmap: 3156 memunmap(old_ce); 3157 3158out: 3159 return ret; 3160} 3161 3162static int copy_translation_tables(struct intel_iommu *iommu) 3163{ 3164 struct context_entry **ctxt_tbls; 3165 struct root_entry *old_rt; 3166 phys_addr_t old_rt_phys; 3167 int ctxt_table_entries; 3168 unsigned long flags; 3169 u64 rtaddr_reg; 3170 int bus, ret; 3171 bool new_ext, ext; 3172 3173 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 3174 ext = !!(rtaddr_reg & DMA_RTADDR_RTT); 3175 new_ext = !!ecap_ecs(iommu->ecap); 3176 3177 /* 3178 * The RTT bit can only be changed when translation is disabled, 3179 * but disabling translation means to open a window for data 3180 * corruption. So bail out and don't copy anything if we would 3181 * have to change the bit. 3182 */ 3183 if (new_ext != ext) 3184 return -EINVAL; 3185 3186 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 3187 if (!old_rt_phys) 3188 return -EINVAL; 3189 3190 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 3191 if (!old_rt) 3192 return -ENOMEM; 3193 3194 /* This is too big for the stack - allocate it from slab */ 3195 ctxt_table_entries = ext ? 
512 : 256; 3196 ret = -ENOMEM; 3197 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 3198 if (!ctxt_tbls) 3199 goto out_unmap; 3200 3201 for (bus = 0; bus < 256; bus++) { 3202 ret = copy_context_table(iommu, &old_rt[bus], 3203 ctxt_tbls, bus, ext); 3204 if (ret) { 3205 pr_err("%s: Failed to copy context table for bus %d\n", 3206 iommu->name, bus); 3207 continue; 3208 } 3209 } 3210 3211 spin_lock_irqsave(&iommu->lock, flags); 3212 3213 /* Context tables are copied, now write them to the root_entry table */ 3214 for (bus = 0; bus < 256; bus++) { 3215 int idx = ext ? bus * 2 : bus; 3216 u64 val; 3217 3218 if (ctxt_tbls[idx]) { 3219 val = virt_to_phys(ctxt_tbls[idx]) | 1; 3220 iommu->root_entry[bus].lo = val; 3221 } 3222 3223 if (!ext || !ctxt_tbls[idx + 1]) 3224 continue; 3225 3226 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 3227 iommu->root_entry[bus].hi = val; 3228 } 3229 3230 spin_unlock_irqrestore(&iommu->lock, flags); 3231 3232 kfree(ctxt_tbls); 3233 3234 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 3235 3236 ret = 0; 3237 3238out_unmap: 3239 memunmap(old_rt); 3240 3241 return ret; 3242} 3243 3244#ifdef CONFIG_INTEL_IOMMU_SVM 3245static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 3246{ 3247 struct intel_iommu *iommu = data; 3248 ioasid_t ioasid; 3249 3250 if (!iommu) 3251 return INVALID_IOASID; 3252 /* 3253 * VT-d virtual command interface always uses the full 20 bit 3254 * PASID range. Host can partition guest PASID range based on 3255 * policies but it is out of guest's control. 3256 */ 3257 if (min < PASID_MIN || max > intel_pasid_max_id) 3258 return INVALID_IOASID; 3259 3260 if (vcmd_alloc_pasid(iommu, &ioasid)) 3261 return INVALID_IOASID; 3262 3263 return ioasid; 3264} 3265 3266static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 3267{ 3268 struct intel_iommu *iommu = data; 3269 3270 if (!iommu) 3271 return; 3272 /* 3273 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 3274 * We can only free the PASID when all the devices are unbound. 3275 */ 3276 if (ioasid_find(NULL, ioasid, NULL)) { 3277 pr_alert("Cannot free active IOASID %d\n", ioasid); 3278 return; 3279 } 3280 vcmd_free_pasid(iommu, ioasid); 3281} 3282 3283static void register_pasid_allocator(struct intel_iommu *iommu) 3284{ 3285 /* 3286 * If we are running in the host, no need for custom allocator 3287 * in that PASIDs are allocated from the host system-wide. 3288 */ 3289 if (!cap_caching_mode(iommu->cap)) 3290 return; 3291 3292 if (!sm_supported(iommu)) { 3293 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 3294 return; 3295 } 3296 3297 /* 3298 * Register a custom PASID allocator if we are running in a guest, 3299 * guest PASID must be obtained via virtual command interface. 3300 * There can be multiple vIOMMUs in each guest but only one allocator 3301 * is active. All vIOMMU allocators will eventually be calling the same 3302 * host allocator. 3303 */ 3304 if (!vccap_pasid(iommu->vccap)) 3305 return; 3306 3307 pr_info("Register custom PASID allocator\n"); 3308 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 3309 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 3310 iommu->pasid_allocator.pdata = (void *)iommu; 3311 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 3312 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 3313 /* 3314 * Disable scalable mode on this IOMMU if there 3315 * is no custom allocator. Mixing SM capable vIOMMU 3316 * and non-SM vIOMMU are not supported. 
3317 */ 3318 intel_iommu_sm = 0; 3319 } 3320} 3321#endif 3322 3323static int __init init_dmars(void) 3324{ 3325 struct dmar_drhd_unit *drhd; 3326 struct intel_iommu *iommu; 3327 int ret; 3328 3329 /* 3330 * for each drhd 3331 * allocate root 3332 * initialize and program root entry to not present 3333 * endfor 3334 */ 3335 for_each_drhd_unit(drhd) { 3336 /* 3337 * lock not needed as this is only incremented in the single 3338 * threaded kernel __init code path all other access are read 3339 * only 3340 */ 3341 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) { 3342 g_num_of_iommus++; 3343 continue; 3344 } 3345 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED); 3346 } 3347 3348 /* Preallocate enough resources for IOMMU hot-addition */ 3349 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) 3350 g_num_of_iommus = DMAR_UNITS_SUPPORTED; 3351 3352 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), 3353 GFP_KERNEL); 3354 if (!g_iommus) { 3355 pr_err("Allocating global iommu array failed\n"); 3356 ret = -ENOMEM; 3357 goto error; 3358 } 3359 3360 for_each_iommu(iommu, drhd) { 3361 if (drhd->ignored) { 3362 iommu_disable_translation(iommu); 3363 continue; 3364 } 3365 3366 /* 3367 * Find the max pasid size of all IOMMU's in the system. 3368 * We need to ensure the system pasid table is no bigger 3369 * than the smallest supported. 3370 */ 3371 if (pasid_supported(iommu)) { 3372 u32 temp = 2 << ecap_pss(iommu->ecap); 3373 3374 intel_pasid_max_id = min_t(u32, temp, 3375 intel_pasid_max_id); 3376 } 3377 3378 g_iommus[iommu->seq_id] = iommu; 3379 3380 intel_iommu_init_qi(iommu); 3381 3382 ret = iommu_init_domains(iommu); 3383 if (ret) 3384 goto free_iommu; 3385 3386 init_translation_status(iommu); 3387 3388 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 3389 iommu_disable_translation(iommu); 3390 clear_translation_pre_enabled(iommu); 3391 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 3392 iommu->name); 3393 } 3394 3395 /* 3396 * TBD: 3397 * we could share the same root & context tables 3398 * among all IOMMU's. Need to Split it later. 3399 */ 3400 ret = iommu_alloc_root_entry(iommu); 3401 if (ret) 3402 goto free_iommu; 3403 3404 if (translation_pre_enabled(iommu)) { 3405 pr_info("Translation already enabled - trying to copy translation structures\n"); 3406 3407 ret = copy_translation_tables(iommu); 3408 if (ret) { 3409 /* 3410 * We found the IOMMU with translation 3411 * enabled - but failed to copy over the 3412 * old root-entry table. Try to proceed 3413 * by disabling translation now and 3414 * allocating a clean root-entry table. 3415 * This might cause DMAR faults, but 3416 * probably the dump will still succeed. 3417 */ 3418 pr_err("Failed to copy translation tables from previous kernel for %s\n", 3419 iommu->name); 3420 iommu_disable_translation(iommu); 3421 clear_translation_pre_enabled(iommu); 3422 } else { 3423 pr_info("Copied translation tables from previous kernel for %s\n", 3424 iommu->name); 3425 } 3426 } 3427 3428 if (!ecap_pass_through(iommu->ecap)) 3429 hw_pass_through = 0; 3430 3431 if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) { 3432 pr_warn("Disable batched IOTLB flush due to virtualization"); 3433 intel_iommu_strict = 1; 3434 } 3435 intel_svm_check(iommu); 3436 } 3437 3438 /* 3439 * Now that qi is enabled on all iommus, set the root entry and flush 3440 * caches. This is required on some Intel X58 chipsets, otherwise the 3441 * flush_context function will loop forever and the boot hangs. 
3442 */ 3443 for_each_active_iommu(iommu, drhd) { 3444 iommu_flush_write_buffer(iommu); 3445#ifdef CONFIG_INTEL_IOMMU_SVM 3446 register_pasid_allocator(iommu); 3447#endif 3448 iommu_set_root_entry(iommu); 3449 } 3450 3451#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 3452 dmar_map_gfx = 0; 3453#endif 3454 3455 if (!dmar_map_gfx) 3456 iommu_identity_mapping |= IDENTMAP_GFX; 3457 3458 check_tylersburg_isoch(); 3459 3460 ret = si_domain_init(hw_pass_through); 3461 if (ret) 3462 goto free_iommu; 3463 3464 /* 3465 * for each drhd 3466 * enable fault log 3467 * global invalidate context cache 3468 * global invalidate iotlb 3469 * enable translation 3470 */ 3471 for_each_iommu(iommu, drhd) { 3472 if (drhd->ignored) { 3473 /* 3474 * we always have to disable PMRs or DMA may fail on 3475 * this device 3476 */ 3477 if (force_on) 3478 iommu_disable_protect_mem_regions(iommu); 3479 continue; 3480 } 3481 3482 iommu_flush_write_buffer(iommu); 3483 3484#ifdef CONFIG_INTEL_IOMMU_SVM 3485 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3486 /* 3487 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3488 * could cause possible lock race condition. 3489 */ 3490 up_write(&dmar_global_lock); 3491 ret = intel_svm_enable_prq(iommu); 3492 down_write(&dmar_global_lock); 3493 if (ret) 3494 goto free_iommu; 3495 } 3496#endif 3497 ret = dmar_set_interrupt(iommu); 3498 if (ret) 3499 goto free_iommu; 3500 } 3501 3502 return 0; 3503 3504free_iommu: 3505 for_each_active_iommu(iommu, drhd) { 3506 disable_dmar_iommu(iommu); 3507 free_dmar_iommu(iommu); 3508 } 3509 if (si_domain) { 3510 domain_exit(si_domain); 3511 si_domain = NULL; 3512 } 3513 3514 kfree(g_iommus); 3515 3516error: 3517 return ret; 3518} 3519 3520/* This takes a number of _MM_ pages, not VTD pages */ 3521static unsigned long intel_alloc_iova(struct device *dev, 3522 struct dmar_domain *domain, 3523 unsigned long nrpages, uint64_t dma_mask) 3524{ 3525 unsigned long iova_pfn; 3526 3527 /* 3528 * Restrict dma_mask to the width that the iommu can handle. 3529 * First-level translation restricts the input-address to a 3530 * canonical address (i.e., address bits 63:N have the same 3531 * value as address bit [N-1], where N is 48-bits with 4-level 3532 * paging and 57-bits with 5-level paging). Hence, skip bit 3533 * [N-1]. 
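 *
 * Worked example: for a 48-bit (4-level) domain, gaw is 48, so
 * DOMAIN_MAX_ADDR(gaw - 1) caps the IOVA space just below 1ULL << 47;
 * bit 47 (= N - 1) is therefore never set and every IOVA stays
 * canonical.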
3534 */ 3535 if (domain_use_first_level(domain)) 3536 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1), 3537 dma_mask); 3538 else 3539 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), 3540 dma_mask); 3541 3542 /* Ensure we reserve the whole size-aligned region */ 3543 nrpages = __roundup_pow_of_two(nrpages); 3544 3545 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) { 3546 /* 3547 * First try to allocate an io virtual address in 3548 * DMA_BIT_MASK(32) and if that fails then try allocating 3549 * from higher range 3550 */ 3551 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, 3552 IOVA_PFN(DMA_BIT_MASK(32)), false); 3553 if (iova_pfn) 3554 return iova_pfn; 3555 } 3556 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, 3557 IOVA_PFN(dma_mask), true); 3558 if (unlikely(!iova_pfn)) { 3559 dev_err_once(dev, "Allocating %ld-page iova failed\n", 3560 nrpages); 3561 return 0; 3562 } 3563 3564 return iova_pfn; 3565} 3566 3567static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, 3568 size_t size, int dir, u64 dma_mask) 3569{ 3570 struct dmar_domain *domain; 3571 phys_addr_t start_paddr; 3572 unsigned long iova_pfn; 3573 int prot = 0; 3574 int ret; 3575 struct intel_iommu *iommu; 3576 unsigned long paddr_pfn = paddr >> PAGE_SHIFT; 3577 3578 BUG_ON(dir == DMA_NONE); 3579 3580 if (unlikely(attach_deferred(dev))) 3581 do_deferred_attach(dev); 3582 3583 domain = find_domain(dev); 3584 if (!domain) 3585 return DMA_MAPPING_ERROR; 3586 3587 iommu = domain_get_iommu(domain); 3588 size = aligned_nrpages(paddr, size); 3589 3590 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask); 3591 if (!iova_pfn) 3592 goto error; 3593 3594 /* 3595 * Check if DMAR supports zero-length reads on write only 3596 * mappings.. 3597 */ 3598 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \ 3599 !cap_zlr(iommu->cap)) 3600 prot |= DMA_PTE_READ; 3601 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) 3602 prot |= DMA_PTE_WRITE; 3603 /* 3604 * paddr - (paddr + size) might be partial page, we should map the whole 3605 * page. 
Note: if two part of one page are separately mapped, we 3606 * might have two guest_addr mapping to the same host paddr, but this 3607 * is not a big problem 3608 */ 3609 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn), 3610 mm_to_dma_pfn(paddr_pfn), size, prot); 3611 if (ret) 3612 goto error; 3613 3614 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT; 3615 start_paddr += paddr & ~PAGE_MASK; 3616 3617 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT); 3618 3619 return start_paddr; 3620 3621error: 3622 if (iova_pfn) 3623 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size)); 3624 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n", 3625 size, (unsigned long long)paddr, dir); 3626 return DMA_MAPPING_ERROR; 3627} 3628 3629static dma_addr_t intel_map_page(struct device *dev, struct page *page, 3630 unsigned long offset, size_t size, 3631 enum dma_data_direction dir, 3632 unsigned long attrs) 3633{ 3634 return __intel_map_single(dev, page_to_phys(page) + offset, 3635 size, dir, *dev->dma_mask); 3636} 3637 3638static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr, 3639 size_t size, enum dma_data_direction dir, 3640 unsigned long attrs) 3641{ 3642 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask); 3643} 3644 3645static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size) 3646{ 3647 struct dmar_domain *domain; 3648 unsigned long start_pfn, last_pfn; 3649 unsigned long nrpages; 3650 unsigned long iova_pfn; 3651 struct intel_iommu *iommu; 3652 struct page *freelist; 3653 struct pci_dev *pdev = NULL; 3654 3655 domain = find_domain(dev); 3656 BUG_ON(!domain); 3657 3658 iommu = domain_get_iommu(domain); 3659 3660 iova_pfn = IOVA_PFN(dev_addr); 3661 3662 nrpages = aligned_nrpages(dev_addr, size); 3663 start_pfn = mm_to_dma_pfn(iova_pfn); 3664 last_pfn = start_pfn + nrpages - 1; 3665 3666 if (dev_is_pci(dev)) 3667 pdev = to_pci_dev(dev); 3668 3669 freelist = domain_unmap(domain, start_pfn, last_pfn); 3670 if (intel_iommu_strict || (pdev && pdev->untrusted) || 3671 !has_iova_flush_queue(&domain->iovad)) { 3672 iommu_flush_iotlb_psi(iommu, domain, start_pfn, 3673 nrpages, !freelist, 0); 3674 /* free iova */ 3675 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages)); 3676 dma_free_pagelist(freelist); 3677 } else { 3678 queue_iova(&domain->iovad, iova_pfn, nrpages, 3679 (unsigned long)freelist); 3680 /* 3681 * queue up the release of the unmap to save the 1/6th of the 3682 * cpu used up by the iotlb flush operation... 
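 *
 * This deferred path is only taken in non-strict mode, for trusted
 * devices whose IOVA domain has a flush queue; the queued IOVA and the
 * page-table freelist are expected to be released later, in a batched
 * flush, when the flush queue is drained.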
3683 */ 3684 } 3685 3686 trace_unmap_single(dev, dev_addr, size); 3687} 3688 3689static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, 3690 size_t size, enum dma_data_direction dir, 3691 unsigned long attrs) 3692{ 3693 intel_unmap(dev, dev_addr, size); 3694} 3695 3696static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr, 3697 size_t size, enum dma_data_direction dir, unsigned long attrs) 3698{ 3699 intel_unmap(dev, dev_addr, size); 3700} 3701 3702static void *intel_alloc_coherent(struct device *dev, size_t size, 3703 dma_addr_t *dma_handle, gfp_t flags, 3704 unsigned long attrs) 3705{ 3706 struct page *page = NULL; 3707 int order; 3708 3709 if (unlikely(attach_deferred(dev))) 3710 do_deferred_attach(dev); 3711 3712 size = PAGE_ALIGN(size); 3713 order = get_order(size); 3714 3715 if (gfpflags_allow_blocking(flags)) { 3716 unsigned int count = size >> PAGE_SHIFT; 3717 3718 page = dma_alloc_from_contiguous(dev, count, order, 3719 flags & __GFP_NOWARN); 3720 } 3721 3722 if (!page) 3723 page = alloc_pages(flags, order); 3724 if (!page) 3725 return NULL; 3726 memset(page_address(page), 0, size); 3727 3728 *dma_handle = __intel_map_single(dev, page_to_phys(page), size, 3729 DMA_BIDIRECTIONAL, 3730 dev->coherent_dma_mask); 3731 if (*dma_handle != DMA_MAPPING_ERROR) 3732 return page_address(page); 3733 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) 3734 __free_pages(page, order); 3735 3736 return NULL; 3737} 3738 3739static void intel_free_coherent(struct device *dev, size_t size, void *vaddr, 3740 dma_addr_t dma_handle, unsigned long attrs) 3741{ 3742 int order; 3743 struct page *page = virt_to_page(vaddr); 3744 3745 size = PAGE_ALIGN(size); 3746 order = get_order(size); 3747 3748 intel_unmap(dev, dma_handle, size); 3749 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) 3750 __free_pages(page, order); 3751} 3752 3753static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist, 3754 int nelems, enum dma_data_direction dir, 3755 unsigned long attrs) 3756{ 3757 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK; 3758 unsigned long nrpages = 0; 3759 struct scatterlist *sg; 3760 int i; 3761 3762 for_each_sg(sglist, sg, nelems, i) { 3763 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg)); 3764 } 3765 3766 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT); 3767 3768 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT); 3769} 3770 3771static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, 3772 enum dma_data_direction dir, unsigned long attrs) 3773{ 3774 int i; 3775 struct dmar_domain *domain; 3776 size_t size = 0; 3777 int prot = 0; 3778 unsigned long iova_pfn; 3779 int ret; 3780 struct scatterlist *sg; 3781 unsigned long start_vpfn; 3782 struct intel_iommu *iommu; 3783 3784 BUG_ON(dir == DMA_NONE); 3785 3786 if (unlikely(attach_deferred(dev))) 3787 do_deferred_attach(dev); 3788 3789 domain = find_domain(dev); 3790 if (!domain) 3791 return 0; 3792 3793 iommu = domain_get_iommu(domain); 3794 3795 for_each_sg(sglist, sg, nelems, i) 3796 size += aligned_nrpages(sg->offset, sg->length); 3797 3798 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), 3799 *dev->dma_mask); 3800 if (!iova_pfn) { 3801 sglist->dma_length = 0; 3802 return 0; 3803 } 3804 3805 /* 3806 * Check if DMAR supports zero-length reads on write only 3807 * mappings.. 
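 * If it does not (cap_zlr is clear), DMA_PTE_READ is added below even
 * for a DMA_FROM_DEVICE transfer, so that a zero-length read against
 * the otherwise write-only mapping is not blocked.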
3808 */ 3809 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \ 3810 !cap_zlr(iommu->cap)) 3811 prot |= DMA_PTE_READ; 3812 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) 3813 prot |= DMA_PTE_WRITE; 3814 3815 start_vpfn = mm_to_dma_pfn(iova_pfn); 3816 3817 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot); 3818 if (unlikely(ret)) { 3819 dma_pte_free_pagetable(domain, start_vpfn, 3820 start_vpfn + size - 1, 3821 agaw_to_level(domain->agaw) + 1); 3822 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size)); 3823 return 0; 3824 } 3825 3826 for_each_sg(sglist, sg, nelems, i) 3827 trace_map_sg(dev, i + 1, nelems, sg); 3828 3829 return nelems; 3830} 3831 3832static u64 intel_get_required_mask(struct device *dev) 3833{ 3834 return DMA_BIT_MASK(32); 3835} 3836 3837static const struct dma_map_ops intel_dma_ops = { 3838 .alloc = intel_alloc_coherent, 3839 .free = intel_free_coherent, 3840 .map_sg = intel_map_sg, 3841 .unmap_sg = intel_unmap_sg, 3842 .map_page = intel_map_page, 3843 .unmap_page = intel_unmap_page, 3844 .map_resource = intel_map_resource, 3845 .unmap_resource = intel_unmap_resource, 3846 .dma_supported = dma_direct_supported, 3847 .mmap = dma_common_mmap, 3848 .get_sgtable = dma_common_get_sgtable, 3849 .alloc_pages = dma_common_alloc_pages, 3850 .free_pages = dma_common_free_pages, 3851 .get_required_mask = intel_get_required_mask, 3852}; 3853 3854static void 3855bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size, 3856 enum dma_data_direction dir, enum dma_sync_target target) 3857{ 3858 struct dmar_domain *domain; 3859 phys_addr_t tlb_addr; 3860 3861 domain = find_domain(dev); 3862 if (WARN_ON(!domain)) 3863 return; 3864 3865 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr); 3866 if (is_swiotlb_buffer(tlb_addr)) 3867 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target); 3868} 3869 3870static dma_addr_t 3871bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, 3872 enum dma_data_direction dir, unsigned long attrs, 3873 u64 dma_mask) 3874{ 3875 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE); 3876 struct dmar_domain *domain; 3877 struct intel_iommu *iommu; 3878 unsigned long iova_pfn; 3879 unsigned long nrpages; 3880 phys_addr_t tlb_addr; 3881 int prot = 0; 3882 int ret; 3883 3884 if (unlikely(attach_deferred(dev))) 3885 do_deferred_attach(dev); 3886 3887 domain = find_domain(dev); 3888 3889 if (WARN_ON(dir == DMA_NONE || !domain)) 3890 return DMA_MAPPING_ERROR; 3891 3892 iommu = domain_get_iommu(domain); 3893 if (WARN_ON(!iommu)) 3894 return DMA_MAPPING_ERROR; 3895 3896 nrpages = aligned_nrpages(0, size); 3897 iova_pfn = intel_alloc_iova(dev, domain, 3898 dma_to_mm_pfn(nrpages), dma_mask); 3899 if (!iova_pfn) 3900 return DMA_MAPPING_ERROR; 3901 3902 /* 3903 * Check if DMAR supports zero-length reads on write only 3904 * mappings.. 3905 */ 3906 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || 3907 !cap_zlr(iommu->cap)) 3908 prot |= DMA_PTE_READ; 3909 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) 3910 prot |= DMA_PTE_WRITE; 3911 3912 /* 3913 * If both the physical buffer start address and size are 3914 * page aligned, we don't need to use a bounce page. 3915 */ 3916 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) { 3917 tlb_addr = swiotlb_tbl_map_single(dev, paddr, size, 3918 aligned_size, dir, attrs); 3919 if (tlb_addr == DMA_MAPPING_ERROR) { 3920 goto swiotlb_error; 3921 } else { 3922 /* Cleanup the padding area. 
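 * The bounce slot is VTD-page aligned, so zero whatever part of it the
 * caller's data does not cover rather than expose stale buffer
 * contents to the device.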
*/ 3923 void *padding_start = phys_to_virt(tlb_addr); 3924 size_t padding_size = aligned_size; 3925 3926 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && 3927 (dir == DMA_TO_DEVICE || 3928 dir == DMA_BIDIRECTIONAL)) { 3929 padding_start += size; 3930 padding_size -= size; 3931 } 3932 3933 memset(padding_start, 0, padding_size); 3934 } 3935 } else { 3936 tlb_addr = paddr; 3937 } 3938 3939 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn), 3940 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot); 3941 if (ret) 3942 goto mapping_error; 3943 3944 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size); 3945 3946 return (phys_addr_t)iova_pfn << PAGE_SHIFT; 3947 3948mapping_error: 3949 if (is_swiotlb_buffer(tlb_addr)) 3950 swiotlb_tbl_unmap_single(dev, tlb_addr, size, 3951 aligned_size, dir, attrs); 3952swiotlb_error: 3953 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages)); 3954 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n", 3955 size, (unsigned long long)paddr, dir); 3956 3957 return DMA_MAPPING_ERROR; 3958} 3959 3960static void 3961bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size, 3962 enum dma_data_direction dir, unsigned long attrs) 3963{ 3964 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE); 3965 struct dmar_domain *domain; 3966 phys_addr_t tlb_addr; 3967 3968 domain = find_domain(dev); 3969 if (WARN_ON(!domain)) 3970 return; 3971 3972 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr); 3973 if (WARN_ON(!tlb_addr)) 3974 return; 3975 3976 intel_unmap(dev, dev_addr, size); 3977 if (is_swiotlb_buffer(tlb_addr)) 3978 swiotlb_tbl_unmap_single(dev, tlb_addr, size, 3979 aligned_size, dir, attrs); 3980 3981 trace_bounce_unmap_single(dev, dev_addr, size); 3982} 3983 3984static dma_addr_t 3985bounce_map_page(struct device *dev, struct page *page, unsigned long offset, 3986 size_t size, enum dma_data_direction dir, unsigned long attrs) 3987{ 3988 return bounce_map_single(dev, page_to_phys(page) + offset, 3989 size, dir, attrs, *dev->dma_mask); 3990} 3991 3992static dma_addr_t 3993bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, 3994 enum dma_data_direction dir, unsigned long attrs) 3995{ 3996 return bounce_map_single(dev, phys_addr, size, 3997 dir, attrs, *dev->dma_mask); 3998} 3999 4000static void 4001bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size, 4002 enum dma_data_direction dir, unsigned long attrs) 4003{ 4004 bounce_unmap_single(dev, dev_addr, size, dir, attrs); 4005} 4006 4007static void 4008bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size, 4009 enum dma_data_direction dir, unsigned long attrs) 4010{ 4011 bounce_unmap_single(dev, dev_addr, size, dir, attrs); 4012} 4013 4014static void 4015bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, 4016 enum dma_data_direction dir, unsigned long attrs) 4017{ 4018 struct scatterlist *sg; 4019 int i; 4020 4021 for_each_sg(sglist, sg, nelems, i) 4022 bounce_unmap_page(dev, sg->dma_address, 4023 sg_dma_len(sg), dir, attrs); 4024} 4025 4026static int 4027bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, 4028 enum dma_data_direction dir, unsigned long attrs) 4029{ 4030 int i; 4031 struct scatterlist *sg; 4032 4033 for_each_sg(sglist, sg, nelems, i) { 4034 sg->dma_address = bounce_map_page(dev, sg_page(sg), 4035 sg->offset, sg->length, 4036 dir, attrs); 4037 if (sg->dma_address == DMA_MAPPING_ERROR) 4038 goto out_unmap; 4039 sg_dma_len(sg) = sg->length; 4040 } 4041 4042 
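	/*
	 * All entries are mapped at this point; the loop below only emits
	 * trace events. If a mapping fails part-way through, out_unmap
	 * unwinds the entries already mapped (the first i), passing
	 * DMA_ATTR_SKIP_CPU_SYNC since the device never touched them.
	 */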
for_each_sg(sglist, sg, nelems, i) 4043 trace_bounce_map_sg(dev, i + 1, nelems, sg); 4044 4045 return nelems; 4046 4047out_unmap: 4048 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); 4049 return 0; 4050} 4051 4052static void 4053bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr, 4054 size_t size, enum dma_data_direction dir) 4055{ 4056 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU); 4057} 4058 4059static void 4060bounce_sync_single_for_device(struct device *dev, dma_addr_t addr, 4061 size_t size, enum dma_data_direction dir) 4062{ 4063 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE); 4064} 4065 4066static void 4067bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, 4068 int nelems, enum dma_data_direction dir) 4069{ 4070 struct scatterlist *sg; 4071 int i; 4072 4073 for_each_sg(sglist, sg, nelems, i) 4074 bounce_sync_single(dev, sg_dma_address(sg), 4075 sg_dma_len(sg), dir, SYNC_FOR_CPU); 4076} 4077 4078static void 4079bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, 4080 int nelems, enum dma_data_direction dir) 4081{ 4082 struct scatterlist *sg; 4083 int i; 4084 4085 for_each_sg(sglist, sg, nelems, i) 4086 bounce_sync_single(dev, sg_dma_address(sg), 4087 sg_dma_len(sg), dir, SYNC_FOR_DEVICE); 4088} 4089 4090static const struct dma_map_ops bounce_dma_ops = { 4091 .alloc = intel_alloc_coherent, 4092 .free = intel_free_coherent, 4093 .map_sg = bounce_map_sg, 4094 .unmap_sg = bounce_unmap_sg, 4095 .map_page = bounce_map_page, 4096 .unmap_page = bounce_unmap_page, 4097 .sync_single_for_cpu = bounce_sync_single_for_cpu, 4098 .sync_single_for_device = bounce_sync_single_for_device, 4099 .sync_sg_for_cpu = bounce_sync_sg_for_cpu, 4100 .sync_sg_for_device = bounce_sync_sg_for_device, 4101 .map_resource = bounce_map_resource, 4102 .unmap_resource = bounce_unmap_resource, 4103 .alloc_pages = dma_common_alloc_pages, 4104 .free_pages = dma_common_free_pages, 4105 .dma_supported = dma_direct_supported, 4106}; 4107 4108static inline int iommu_domain_cache_init(void) 4109{ 4110 int ret = 0; 4111 4112 iommu_domain_cache = kmem_cache_create("iommu_domain", 4113 sizeof(struct dmar_domain), 4114 0, 4115 SLAB_HWCACHE_ALIGN, 4116 4117 NULL); 4118 if (!iommu_domain_cache) { 4119 pr_err("Couldn't create iommu_domain cache\n"); 4120 ret = -ENOMEM; 4121 } 4122 4123 return ret; 4124} 4125 4126static inline int iommu_devinfo_cache_init(void) 4127{ 4128 int ret = 0; 4129 4130 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo", 4131 sizeof(struct device_domain_info), 4132 0, 4133 SLAB_HWCACHE_ALIGN, 4134 NULL); 4135 if (!iommu_devinfo_cache) { 4136 pr_err("Couldn't create devinfo cache\n"); 4137 ret = -ENOMEM; 4138 } 4139 4140 return ret; 4141} 4142 4143static int __init iommu_init_mempool(void) 4144{ 4145 int ret; 4146 ret = iova_cache_get(); 4147 if (ret) 4148 return ret; 4149 4150 ret = iommu_domain_cache_init(); 4151 if (ret) 4152 goto domain_error; 4153 4154 ret = iommu_devinfo_cache_init(); 4155 if (!ret) 4156 return ret; 4157 4158 kmem_cache_destroy(iommu_domain_cache); 4159domain_error: 4160 iova_cache_put(); 4161 4162 return -ENOMEM; 4163} 4164 4165static void __init iommu_exit_mempool(void) 4166{ 4167 kmem_cache_destroy(iommu_devinfo_cache); 4168 kmem_cache_destroy(iommu_domain_cache); 4169 iova_cache_put(); 4170} 4171 4172static void __init init_no_remapping_devices(void) 4173{ 4174 struct dmar_drhd_unit *drhd; 4175 struct device *dev; 4176 int i; 4177 4178 for_each_drhd_unit(drhd) { 4179 if 
(!drhd->include_all) { 4180 for_each_active_dev_scope(drhd->devices, 4181 drhd->devices_cnt, i, dev) 4182 break; 4183 /* ignore DMAR unit if no devices exist */ 4184 if (i == drhd->devices_cnt) 4185 drhd->ignored = 1; 4186 } 4187 } 4188 4189 for_each_active_drhd_unit(drhd) { 4190 if (drhd->include_all) 4191 continue; 4192 4193 for_each_active_dev_scope(drhd->devices, 4194 drhd->devices_cnt, i, dev) 4195 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 4196 break; 4197 if (i < drhd->devices_cnt) 4198 continue; 4199 4200 /* This IOMMU has *only* gfx devices. Either bypass it or 4201 set the gfx_mapped flag, as appropriate */ 4202 drhd->gfx_dedicated = 1; 4203 if (!dmar_map_gfx) 4204 drhd->ignored = 1; 4205 } 4206} 4207 4208#ifdef CONFIG_SUSPEND 4209static int init_iommu_hw(void) 4210{ 4211 struct dmar_drhd_unit *drhd; 4212 struct intel_iommu *iommu = NULL; 4213 4214 for_each_active_iommu(iommu, drhd) 4215 if (iommu->qi) 4216 dmar_reenable_qi(iommu); 4217 4218 for_each_iommu(iommu, drhd) { 4219 if (drhd->ignored) { 4220 /* 4221 * we always have to disable PMRs or DMA may fail on 4222 * this device 4223 */ 4224 if (force_on) 4225 iommu_disable_protect_mem_regions(iommu); 4226 continue; 4227 } 4228 4229 iommu_flush_write_buffer(iommu); 4230 iommu_set_root_entry(iommu); 4231 iommu_enable_translation(iommu); 4232 iommu_disable_protect_mem_regions(iommu); 4233 } 4234 4235 return 0; 4236} 4237 4238static void iommu_flush_all(void) 4239{ 4240 struct dmar_drhd_unit *drhd; 4241 struct intel_iommu *iommu; 4242 4243 for_each_active_iommu(iommu, drhd) { 4244 iommu->flush.flush_context(iommu, 0, 0, 0, 4245 DMA_CCMD_GLOBAL_INVL); 4246 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 4247 DMA_TLB_GLOBAL_FLUSH); 4248 } 4249} 4250 4251static int iommu_suspend(void) 4252{ 4253 struct dmar_drhd_unit *drhd; 4254 struct intel_iommu *iommu = NULL; 4255 unsigned long flag; 4256 4257 for_each_active_iommu(iommu, drhd) { 4258 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 4259 GFP_ATOMIC); 4260 if (!iommu->iommu_state) 4261 goto nomem; 4262 } 4263 4264 iommu_flush_all(); 4265 4266 for_each_active_iommu(iommu, drhd) { 4267 iommu_disable_translation(iommu); 4268 4269 raw_spin_lock_irqsave(&iommu->register_lock, flag); 4270 4271 iommu->iommu_state[SR_DMAR_FECTL_REG] = 4272 readl(iommu->reg + DMAR_FECTL_REG); 4273 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 4274 readl(iommu->reg + DMAR_FEDATA_REG); 4275 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 4276 readl(iommu->reg + DMAR_FEADDR_REG); 4277 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 4278 readl(iommu->reg + DMAR_FEUADDR_REG); 4279 4280 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 4281 } 4282 return 0; 4283 4284nomem: 4285 for_each_active_iommu(iommu, drhd) 4286 kfree(iommu->iommu_state); 4287 4288 return -ENOMEM; 4289} 4290 4291static void iommu_resume(void) 4292{ 4293 struct dmar_drhd_unit *drhd; 4294 struct intel_iommu *iommu = NULL; 4295 unsigned long flag; 4296 4297 if (init_iommu_hw()) { 4298 if (force_on) 4299 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 4300 else 4301 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 4302 return; 4303 } 4304 4305 for_each_active_iommu(iommu, drhd) { 4306 4307 raw_spin_lock_irqsave(&iommu->register_lock, flag); 4308 4309 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 4310 iommu->reg + DMAR_FECTL_REG); 4311 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 4312 iommu->reg + DMAR_FEDATA_REG); 4313 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 4314 iommu->reg + DMAR_FEADDR_REG); 4315 
writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 4316 iommu->reg + DMAR_FEUADDR_REG); 4317 4318 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 4319 } 4320 4321 for_each_active_iommu(iommu, drhd) 4322 kfree(iommu->iommu_state); 4323} 4324 4325static struct syscore_ops iommu_syscore_ops = { 4326 .resume = iommu_resume, 4327 .suspend = iommu_suspend, 4328}; 4329 4330static void __init init_iommu_pm_ops(void) 4331{ 4332 register_syscore_ops(&iommu_syscore_ops); 4333} 4334 4335#else 4336static inline void init_iommu_pm_ops(void) {} 4337#endif /* CONFIG_PM */ 4338 4339static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 4340{ 4341 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 4342 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 4343 rmrr->end_address <= rmrr->base_address || 4344 arch_rmrr_sanity_check(rmrr)) 4345 return -EINVAL; 4346 4347 return 0; 4348} 4349 4350int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 4351{ 4352 struct acpi_dmar_reserved_memory *rmrr; 4353 struct dmar_rmrr_unit *rmrru; 4354 4355 rmrr = (struct acpi_dmar_reserved_memory *)header; 4356 if (rmrr_sanity_check(rmrr)) { 4357 pr_warn(FW_BUG 4358 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 4359 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 4360 rmrr->base_address, rmrr->end_address, 4361 dmi_get_system_info(DMI_BIOS_VENDOR), 4362 dmi_get_system_info(DMI_BIOS_VERSION), 4363 dmi_get_system_info(DMI_PRODUCT_VERSION)); 4364 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 4365 } 4366 4367 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 4368 if (!rmrru) 4369 goto out; 4370 4371 rmrru->hdr = header; 4372 4373 rmrru->base_address = rmrr->base_address; 4374 rmrru->end_address = rmrr->end_address; 4375 4376 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 4377 ((void *)rmrr) + rmrr->header.length, 4378 &rmrru->devices_cnt); 4379 if (rmrru->devices_cnt && rmrru->devices == NULL) 4380 goto free_rmrru; 4381 4382 list_add(&rmrru->list, &dmar_rmrr_units); 4383 4384 return 0; 4385free_rmrru: 4386 kfree(rmrru); 4387out: 4388 return -ENOMEM; 4389} 4390 4391static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 4392{ 4393 struct dmar_atsr_unit *atsru; 4394 struct acpi_dmar_atsr *tmp; 4395 4396 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 4397 dmar_rcu_check()) { 4398 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 4399 if (atsr->segment != tmp->segment) 4400 continue; 4401 if (atsr->header.length != tmp->header.length) 4402 continue; 4403 if (memcmp(atsr, tmp, atsr->header.length) == 0) 4404 return atsru; 4405 } 4406 4407 return NULL; 4408} 4409 4410int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 4411{ 4412 struct acpi_dmar_atsr *atsr; 4413 struct dmar_atsr_unit *atsru; 4414 4415 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 4416 return 0; 4417 4418 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 4419 atsru = dmar_find_atsr(atsr); 4420 if (atsru) 4421 return 0; 4422 4423 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 4424 if (!atsru) 4425 return -ENOMEM; 4426 4427 /* 4428 * If memory is allocated from slab by ACPI _DSM method, we need to 4429 * copy the memory content because the memory buffer will be freed 4430 * on return. 
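 * The copy lands in the extra hdr->length bytes that were allocated together
 * with the dmar_atsr_unit above: atsru->hdr is pointed just past the
 * structure and the ACPI data is memcpy'd there.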
4431 */ 4432 atsru->hdr = (void *)(atsru + 1); 4433 memcpy(atsru->hdr, hdr, hdr->length); 4434 atsru->include_all = atsr->flags & 0x1; 4435 if (!atsru->include_all) { 4436 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 4437 (void *)atsr + atsr->header.length, 4438 &atsru->devices_cnt); 4439 if (atsru->devices_cnt && atsru->devices == NULL) { 4440 kfree(atsru); 4441 return -ENOMEM; 4442 } 4443 } 4444 4445 list_add_rcu(&atsru->list, &dmar_atsr_units); 4446 4447 return 0; 4448} 4449 4450static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 4451{ 4452 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 4453 kfree(atsru); 4454} 4455 4456int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 4457{ 4458 struct acpi_dmar_atsr *atsr; 4459 struct dmar_atsr_unit *atsru; 4460 4461 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 4462 atsru = dmar_find_atsr(atsr); 4463 if (atsru) { 4464 list_del_rcu(&atsru->list); 4465 synchronize_rcu(); 4466 intel_iommu_free_atsr(atsru); 4467 } 4468 4469 return 0; 4470} 4471 4472int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 4473{ 4474 int i; 4475 struct device *dev; 4476 struct acpi_dmar_atsr *atsr; 4477 struct dmar_atsr_unit *atsru; 4478 4479 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 4480 atsru = dmar_find_atsr(atsr); 4481 if (!atsru) 4482 return 0; 4483 4484 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 4485 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 4486 i, dev) 4487 return -EBUSY; 4488 } 4489 4490 return 0; 4491} 4492 4493static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 4494{ 4495 int sp, ret; 4496 struct intel_iommu *iommu = dmaru->iommu; 4497 4498 if (g_iommus[iommu->seq_id]) 4499 return 0; 4500 4501 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 4502 pr_warn("%s: Doesn't support hardware pass through.\n", 4503 iommu->name); 4504 return -ENXIO; 4505 } 4506 if (!ecap_sc_support(iommu->ecap) && 4507 domain_update_iommu_snooping(iommu)) { 4508 pr_warn("%s: Doesn't support snooping.\n", 4509 iommu->name); 4510 return -ENXIO; 4511 } 4512 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 4513 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { 4514 pr_warn("%s: Doesn't support large page.\n", 4515 iommu->name); 4516 return -ENXIO; 4517 } 4518 4519 /* 4520 * Disable translation if already enabled prior to OS handover. 
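 * Translation may have been left enabled by the firmware or by a previous
 * kernel (e.g. across kexec); quiesce the unit before its domains and root
 * table are set up again below.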
4521 */ 4522 if (iommu->gcmd & DMA_GCMD_TE) 4523 iommu_disable_translation(iommu); 4524 4525 g_iommus[iommu->seq_id] = iommu; 4526 ret = iommu_init_domains(iommu); 4527 if (ret == 0) 4528 ret = iommu_alloc_root_entry(iommu); 4529 if (ret) 4530 goto out; 4531 4532 intel_svm_check(iommu); 4533 4534 if (dmaru->ignored) { 4535 /* 4536 * we always have to disable PMRs or DMA may fail on this device 4537 */ 4538 if (force_on) 4539 iommu_disable_protect_mem_regions(iommu); 4540 return 0; 4541 } 4542 4543 intel_iommu_init_qi(iommu); 4544 iommu_flush_write_buffer(iommu); 4545 4546#ifdef CONFIG_INTEL_IOMMU_SVM 4547 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 4548 ret = intel_svm_enable_prq(iommu); 4549 if (ret) 4550 goto disable_iommu; 4551 } 4552#endif 4553 ret = dmar_set_interrupt(iommu); 4554 if (ret) 4555 goto disable_iommu; 4556 4557 iommu_set_root_entry(iommu); 4558 iommu_enable_translation(iommu); 4559 4560 iommu_disable_protect_mem_regions(iommu); 4561 return 0; 4562 4563disable_iommu: 4564 disable_dmar_iommu(iommu); 4565out: 4566 free_dmar_iommu(iommu); 4567 return ret; 4568} 4569 4570int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 4571{ 4572 int ret = 0; 4573 struct intel_iommu *iommu = dmaru->iommu; 4574 4575 if (!intel_iommu_enabled) 4576 return 0; 4577 if (iommu == NULL) 4578 return -EINVAL; 4579 4580 if (insert) { 4581 ret = intel_iommu_add(dmaru); 4582 } else { 4583 disable_dmar_iommu(iommu); 4584 free_dmar_iommu(iommu); 4585 } 4586 4587 return ret; 4588} 4589 4590static void intel_iommu_free_dmars(void) 4591{ 4592 struct dmar_rmrr_unit *rmrru, *rmrr_n; 4593 struct dmar_atsr_unit *atsru, *atsr_n; 4594 4595 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 4596 list_del(&rmrru->list); 4597 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 4598 kfree(rmrru); 4599 } 4600 4601 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 4602 list_del(&atsru->list); 4603 intel_iommu_free_atsr(atsru); 4604 } 4605} 4606 4607int dmar_find_matched_atsr_unit(struct pci_dev *dev) 4608{ 4609 int i, ret = 1; 4610 struct pci_bus *bus; 4611 struct pci_dev *bridge = NULL; 4612 struct device *tmp; 4613 struct acpi_dmar_atsr *atsr; 4614 struct dmar_atsr_unit *atsru; 4615 4616 dev = pci_physfn(dev); 4617 for (bus = dev->bus; bus; bus = bus->parent) { 4618 bridge = bus->self; 4619 /* If it's an integrated device, allow ATS */ 4620 if (!bridge) 4621 return 1; 4622 /* Connected via non-PCIe: no ATS */ 4623 if (!pci_is_pcie(bridge) || 4624 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 4625 return 0; 4626 /* If we found the root port, look it up in the ATSR */ 4627 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 4628 break; 4629 } 4630 4631 rcu_read_lock(); 4632 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 4633 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 4634 if (atsr->segment != pci_domain_nr(dev->bus)) 4635 continue; 4636 4637 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 4638 if (tmp == &bridge->dev) 4639 goto out; 4640 4641 if (atsru->include_all) 4642 goto out; 4643 } 4644 ret = 0; 4645out: 4646 rcu_read_unlock(); 4647 4648 return ret; 4649} 4650 4651int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 4652{ 4653 int ret; 4654 struct dmar_rmrr_unit *rmrru; 4655 struct dmar_atsr_unit *atsru; 4656 struct acpi_dmar_atsr *atsr; 4657 struct acpi_dmar_reserved_memory *rmrr; 4658 4659 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 4660 return 0; 4661 4662 
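	/*
	 * A device was added to or removed from a PCI bus; bring the cached
	 * RMRR and ATSR device scope lists back in sync with the new topology.
	 */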
list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 4663 rmrr = container_of(rmrru->hdr, 4664 struct acpi_dmar_reserved_memory, header); 4665 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4666 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 4667 ((void *)rmrr) + rmrr->header.length, 4668 rmrr->segment, rmrru->devices, 4669 rmrru->devices_cnt); 4670 if (ret < 0) 4671 return ret; 4672 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4673 dmar_remove_dev_scope(info, rmrr->segment, 4674 rmrru->devices, rmrru->devices_cnt); 4675 } 4676 } 4677 4678 list_for_each_entry(atsru, &dmar_atsr_units, list) { 4679 if (atsru->include_all) 4680 continue; 4681 4682 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 4683 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4684 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 4685 (void *)atsr + atsr->header.length, 4686 atsr->segment, atsru->devices, 4687 atsru->devices_cnt); 4688 if (ret > 0) 4689 break; 4690 else if (ret < 0) 4691 return ret; 4692 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4693 if (dmar_remove_dev_scope(info, atsr->segment, 4694 atsru->devices, atsru->devices_cnt)) 4695 break; 4696 } 4697 } 4698 4699 return 0; 4700} 4701 4702static int intel_iommu_memory_notifier(struct notifier_block *nb, 4703 unsigned long val, void *v) 4704{ 4705 struct memory_notify *mhp = v; 4706 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 4707 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 4708 mhp->nr_pages - 1); 4709 4710 switch (val) { 4711 case MEM_GOING_ONLINE: 4712 if (iommu_domain_identity_map(si_domain, 4713 start_vpfn, last_vpfn)) { 4714 pr_warn("Failed to build identity map for [%lx-%lx]\n", 4715 start_vpfn, last_vpfn); 4716 return NOTIFY_BAD; 4717 } 4718 break; 4719 4720 case MEM_OFFLINE: 4721 case MEM_CANCEL_ONLINE: 4722 { 4723 struct dmar_drhd_unit *drhd; 4724 struct intel_iommu *iommu; 4725 struct page *freelist; 4726 4727 freelist = domain_unmap(si_domain, 4728 start_vpfn, last_vpfn); 4729 4730 rcu_read_lock(); 4731 for_each_active_iommu(iommu, drhd) 4732 iommu_flush_iotlb_psi(iommu, si_domain, 4733 start_vpfn, mhp->nr_pages, 4734 !freelist, 0); 4735 rcu_read_unlock(); 4736 dma_free_pagelist(freelist); 4737 } 4738 break; 4739 } 4740 4741 return NOTIFY_OK; 4742} 4743 4744static struct notifier_block intel_iommu_memory_nb = { 4745 .notifier_call = intel_iommu_memory_notifier, 4746 .priority = 0 4747}; 4748 4749static void free_all_cpu_cached_iovas(unsigned int cpu) 4750{ 4751 int i; 4752 4753 for (i = 0; i < g_num_of_iommus; i++) { 4754 struct intel_iommu *iommu = g_iommus[i]; 4755 struct dmar_domain *domain; 4756 int did; 4757 4758 if (!iommu) 4759 continue; 4760 4761 for (did = 0; did < cap_ndoms(iommu->cap); did++) { 4762 domain = get_iommu_domain(iommu, (u16)did); 4763 4764 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA) 4765 continue; 4766 4767 free_cpu_cached_iovas(cpu, &domain->iovad); 4768 } 4769 } 4770} 4771 4772static int intel_iommu_cpu_dead(unsigned int cpu) 4773{ 4774 free_all_cpu_cached_iovas(cpu); 4775 return 0; 4776} 4777 4778static void intel_disable_iommus(void) 4779{ 4780 struct intel_iommu *iommu = NULL; 4781 struct dmar_drhd_unit *drhd; 4782 4783 for_each_iommu(iommu, drhd) 4784 iommu_disable_translation(iommu); 4785} 4786 4787void intel_iommu_shutdown(void) 4788{ 4789 struct dmar_drhd_unit *drhd; 4790 struct intel_iommu *iommu = NULL; 4791 4792 if (no_iommu || dmar_disabled) 4793 return; 4794 4795 down_write(&dmar_global_lock); 4796 4797 /* Disable PMRs explicitly 
here. */ 4798 for_each_iommu(iommu, drhd) 4799 iommu_disable_protect_mem_regions(iommu); 4800 4801 /* Make sure the IOMMUs are switched off */ 4802 intel_disable_iommus(); 4803 4804 up_write(&dmar_global_lock); 4805} 4806 4807static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 4808{ 4809 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 4810 4811 return container_of(iommu_dev, struct intel_iommu, iommu); 4812} 4813 4814static ssize_t intel_iommu_show_version(struct device *dev, 4815 struct device_attribute *attr, 4816 char *buf) 4817{ 4818 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4819 u32 ver = readl(iommu->reg + DMAR_VER_REG); 4820 return sprintf(buf, "%d:%d\n", 4821 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 4822} 4823static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL); 4824 4825static ssize_t intel_iommu_show_address(struct device *dev, 4826 struct device_attribute *attr, 4827 char *buf) 4828{ 4829 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4830 return sprintf(buf, "%llx\n", iommu->reg_phys); 4831} 4832static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL); 4833 4834static ssize_t intel_iommu_show_cap(struct device *dev, 4835 struct device_attribute *attr, 4836 char *buf) 4837{ 4838 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4839 return sprintf(buf, "%llx\n", iommu->cap); 4840} 4841static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL); 4842 4843static ssize_t intel_iommu_show_ecap(struct device *dev, 4844 struct device_attribute *attr, 4845 char *buf) 4846{ 4847 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4848 return sprintf(buf, "%llx\n", iommu->ecap); 4849} 4850static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL); 4851 4852static ssize_t intel_iommu_show_ndoms(struct device *dev, 4853 struct device_attribute *attr, 4854 char *buf) 4855{ 4856 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4857 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); 4858} 4859static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL); 4860 4861static ssize_t intel_iommu_show_ndoms_used(struct device *dev, 4862 struct device_attribute *attr, 4863 char *buf) 4864{ 4865 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4866 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 4867 cap_ndoms(iommu->cap))); 4868} 4869static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL); 4870 4871static struct attribute *intel_iommu_attrs[] = { 4872 &dev_attr_version.attr, 4873 &dev_attr_address.attr, 4874 &dev_attr_cap.attr, 4875 &dev_attr_ecap.attr, 4876 &dev_attr_domains_supported.attr, 4877 &dev_attr_domains_used.attr, 4878 NULL, 4879}; 4880 4881static struct attribute_group intel_iommu_group = { 4882 .name = "intel-iommu", 4883 .attrs = intel_iommu_attrs, 4884}; 4885 4886const struct attribute_group *intel_iommu_groups[] = { 4887 &intel_iommu_group, 4888 NULL, 4889}; 4890 4891static inline bool has_external_pci(void) 4892{ 4893 struct pci_dev *pdev = NULL; 4894 4895 for_each_pci_dev(pdev) 4896 if (pdev->external_facing) { 4897 pci_dev_put(pdev); 4898 return true; 4899 } 4900 4901 return false; 4902} 4903 4904static int __init platform_optin_force_iommu(void) 4905{ 4906 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 4907 return 0; 4908 4909 if (no_iommu || dmar_disabled) 4910 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 4911 4912 /* 4913 * If Intel-IOMMU is disabled by default, we will apply identity 4914 * 
map for all devices except those marked as being untrusted. 4915 */ 4916 if (dmar_disabled) 4917 iommu_set_default_passthrough(false); 4918 4919 dmar_disabled = 0; 4920 no_iommu = 0; 4921 4922 return 1; 4923} 4924 4925static int __init probe_acpi_namespace_devices(void) 4926{ 4927 struct dmar_drhd_unit *drhd; 4928 /* To avoid a -Wunused-but-set-variable warning. */ 4929 struct intel_iommu *iommu __maybe_unused; 4930 struct device *dev; 4931 int i, ret = 0; 4932 4933 for_each_active_iommu(iommu, drhd) { 4934 for_each_active_dev_scope(drhd->devices, 4935 drhd->devices_cnt, i, dev) { 4936 struct acpi_device_physical_node *pn; 4937 struct iommu_group *group; 4938 struct acpi_device *adev; 4939 4940 if (dev->bus != &acpi_bus_type) 4941 continue; 4942 4943 adev = to_acpi_device(dev); 4944 mutex_lock(&adev->physical_node_lock); 4945 list_for_each_entry(pn, 4946 &adev->physical_node_list, node) { 4947 group = iommu_group_get(pn->dev); 4948 if (group) { 4949 iommu_group_put(group); 4950 continue; 4951 } 4952 4953 pn->dev->bus->iommu_ops = &intel_iommu_ops; 4954 ret = iommu_probe_device(pn->dev); 4955 if (ret) 4956 break; 4957 } 4958 mutex_unlock(&adev->physical_node_lock); 4959 4960 if (ret) 4961 return ret; 4962 } 4963 } 4964 4965 return 0; 4966} 4967 4968int __init intel_iommu_init(void) 4969{ 4970 int ret = -ENODEV; 4971 struct dmar_drhd_unit *drhd; 4972 struct intel_iommu *iommu; 4973 4974 /* 4975 * Intel IOMMU is required for a TXT/tboot launch or platform 4976 * opt in, so enforce that. 4977 */ 4978 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 4979 platform_optin_force_iommu(); 4980 4981 if (iommu_init_mempool()) { 4982 if (force_on) 4983 panic("tboot: Failed to initialize iommu memory\n"); 4984 return -ENOMEM; 4985 } 4986 4987 down_write(&dmar_global_lock); 4988 if (dmar_table_init()) { 4989 if (force_on) 4990 panic("tboot: Failed to initialize DMAR table\n"); 4991 goto out_free_dmar; 4992 } 4993 4994 if (dmar_dev_scope_init() < 0) { 4995 if (force_on) 4996 panic("tboot: Failed to initialize DMAR device scope\n"); 4997 goto out_free_dmar; 4998 } 4999 5000 up_write(&dmar_global_lock); 5001 5002 /* 5003 * The bus notifier takes the dmar_global_lock, so lockdep will 5004 * complain later when we register it under the lock. 5005 */ 5006 dmar_register_bus_notifier(); 5007 5008 down_write(&dmar_global_lock); 5009 5010 if (!no_iommu) 5011 intel_iommu_debugfs_init(); 5012 5013 if (no_iommu || dmar_disabled) { 5014 /* 5015 * We exit the function here to ensure IOMMU's remapping and 5016 * mempool aren't setup, which means that the IOMMU's PMRs 5017 * won't be disabled via the call to init_dmars(). So disable 5018 * it explicitly here. The PMRs were setup by tboot prior to 5019 * calling SENTER, but the kernel is expected to reset/tear 5020 * down the PMRs. 
5021 */ 5022 if (intel_iommu_tboot_noforce) { 5023 for_each_iommu(iommu, drhd) 5024 iommu_disable_protect_mem_regions(iommu); 5025 } 5026 5027 /* 5028 * Make sure the IOMMUs are switched off, even when we 5029 * boot into a kexec kernel and the previous kernel left 5030 * them enabled 5031 */ 5032 intel_disable_iommus(); 5033 goto out_free_dmar; 5034 } 5035 5036 if (list_empty(&dmar_rmrr_units)) 5037 pr_info("No RMRR found\n"); 5038 5039 if (list_empty(&dmar_atsr_units)) 5040 pr_info("No ATSR found\n"); 5041 5042 if (dmar_init_reserved_ranges()) { 5043 if (force_on) 5044 panic("tboot: Failed to reserve iommu ranges\n"); 5045 goto out_free_reserved_range; 5046 } 5047 5048 if (dmar_map_gfx) 5049 intel_iommu_gfx_mapped = 1; 5050 5051 init_no_remapping_devices(); 5052 5053 ret = init_dmars(); 5054 if (ret) { 5055 if (force_on) 5056 panic("tboot: Failed to initialize DMARs\n"); 5057 pr_err("Initialization failed\n"); 5058 goto out_free_reserved_range; 5059 } 5060 up_write(&dmar_global_lock); 5061 5062 init_iommu_pm_ops(); 5063 5064 down_read(&dmar_global_lock); 5065 for_each_active_iommu(iommu, drhd) { 5066 iommu_device_sysfs_add(&iommu->iommu, NULL, 5067 intel_iommu_groups, 5068 "%s", iommu->name); 5069 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops); 5070 iommu_device_register(&iommu->iommu); 5071 } 5072 up_read(&dmar_global_lock); 5073 5074 bus_set_iommu(&pci_bus_type, &intel_iommu_ops); 5075 if (si_domain && !hw_pass_through) 5076 register_memory_notifier(&intel_iommu_memory_nb); 5077 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL, 5078 intel_iommu_cpu_dead); 5079 5080 down_read(&dmar_global_lock); 5081 if (probe_acpi_namespace_devices()) 5082 pr_warn("ACPI name space devices didn't probe correctly\n"); 5083 5084 /* Finally, we enable the DMA remapping hardware. */ 5085 for_each_iommu(iommu, drhd) { 5086 if (!drhd->ignored && !translation_pre_enabled(iommu)) 5087 iommu_enable_translation(iommu); 5088 5089 iommu_disable_protect_mem_regions(iommu); 5090 } 5091 up_read(&dmar_global_lock); 5092 5093 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 5094 5095 intel_iommu_enabled = 1; 5096 5097 return 0; 5098 5099out_free_reserved_range: 5100 put_iova_domain(&reserved_iova_list); 5101out_free_dmar: 5102 intel_iommu_free_dmars(); 5103 up_write(&dmar_global_lock); 5104 iommu_exit_mempool(); 5105 return ret; 5106} 5107 5108static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 5109{ 5110 struct intel_iommu *iommu = opaque; 5111 5112 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff); 5113 return 0; 5114} 5115 5116/* 5117 * NB - intel-iommu lacks any sort of reference counting for the users of 5118 * dependent devices. If multiple endpoints have intersecting dependent 5119 * devices, unbinding the driver from any one of them will possibly leave 5120 * the others unable to operate. 
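 * In practice that means clearing the context entries for one endpoint can
 * also clear entries shared with its DMA aliases, which other endpoints may
 * still be using.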
5121 */ 5122static void domain_context_clear(struct intel_iommu *iommu, struct device *dev) 5123{ 5124 if (!iommu || !dev || !dev_is_pci(dev)) 5125 return; 5126 5127 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu); 5128} 5129 5130static void __dmar_remove_one_dev_info(struct device_domain_info *info) 5131{ 5132 struct dmar_domain *domain; 5133 struct intel_iommu *iommu; 5134 unsigned long flags; 5135 5136 assert_spin_locked(&device_domain_lock); 5137 5138 if (WARN_ON(!info)) 5139 return; 5140 5141 iommu = info->iommu; 5142 domain = info->domain; 5143 5144 if (info->dev) { 5145 if (dev_is_pci(info->dev) && sm_supported(iommu)) 5146 intel_pasid_tear_down_entry(iommu, info->dev, 5147 PASID_RID2PASID, false); 5148 5149 iommu_disable_dev_iotlb(info); 5150 if (!dev_is_real_dma_subdevice(info->dev)) 5151 domain_context_clear(iommu, info->dev); 5152 intel_pasid_free_table(info->dev); 5153 } 5154 5155 unlink_domain_info(info); 5156 5157 spin_lock_irqsave(&iommu->lock, flags); 5158 domain_detach_iommu(domain, iommu); 5159 spin_unlock_irqrestore(&iommu->lock, flags); 5160 5161 free_devinfo_mem(info); 5162} 5163 5164static void dmar_remove_one_dev_info(struct device *dev) 5165{ 5166 struct device_domain_info *info; 5167 unsigned long flags; 5168 5169 spin_lock_irqsave(&device_domain_lock, flags); 5170 info = get_domain_info(dev); 5171 if (info) 5172 __dmar_remove_one_dev_info(info); 5173 spin_unlock_irqrestore(&device_domain_lock, flags); 5174} 5175 5176static int md_domain_init(struct dmar_domain *domain, int guest_width) 5177{ 5178 int adjust_width; 5179 5180 /* calculate AGAW */ 5181 domain->gaw = guest_width; 5182 adjust_width = guestwidth_to_adjustwidth(guest_width); 5183 domain->agaw = width_to_agaw(adjust_width); 5184 5185 domain->iommu_coherency = 0; 5186 domain->iommu_snooping = 0; 5187 domain->iommu_superpage = 0; 5188 domain->max_addr = 0; 5189 5190 /* always allocate the top pgd */ 5191 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid); 5192 if (!domain->pgd) 5193 return -ENOMEM; 5194 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 5195 return 0; 5196} 5197 5198static void intel_init_iova_domain(struct dmar_domain *dmar_domain) 5199{ 5200 init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); 5201 copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad); 5202 5203 if (!intel_iommu_strict && 5204 init_iova_flush_queue(&dmar_domain->iovad, 5205 iommu_flush_iova, iova_entry_free)) 5206 pr_info("iova flush queue initialization failed\n"); 5207} 5208 5209static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 5210{ 5211 struct dmar_domain *dmar_domain; 5212 struct iommu_domain *domain; 5213 5214 switch (type) { 5215 case IOMMU_DOMAIN_DMA: 5216 case IOMMU_DOMAIN_UNMANAGED: 5217 dmar_domain = alloc_domain(0); 5218 if (!dmar_domain) { 5219 pr_err("Can't allocate dmar_domain\n"); 5220 return NULL; 5221 } 5222 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 5223 pr_err("Domain initialization failed\n"); 5224 domain_exit(dmar_domain); 5225 return NULL; 5226 } 5227 5228 if (type == IOMMU_DOMAIN_DMA) 5229 intel_init_iova_domain(dmar_domain); 5230 5231 domain = &dmar_domain->domain; 5232 domain->geometry.aperture_start = 0; 5233 domain->geometry.aperture_end = 5234 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 5235 domain->geometry.force_aperture = true; 5236 5237 return domain; 5238 case IOMMU_DOMAIN_IDENTITY: 5239 return &si_domain->domain; 5240 default: 5241 return NULL; 5242 } 5243 5244 return NULL; 5245} 5246 
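/*
 * The statically allocated identity domain (si_domain) is shared by all
 * passthrough devices and is never released here; only the DMA and
 * unmanaged domains allocated above are torn down via domain_exit().
 */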
5247static void intel_iommu_domain_free(struct iommu_domain *domain) 5248{ 5249 if (domain != &si_domain->domain) 5250 domain_exit(to_dmar_domain(domain)); 5251} 5252 5253/* 5254 * Check whether a @domain could be attached to the @dev through the 5255 * aux-domain attach/detach APIs. 5256 */ 5257static inline bool 5258is_aux_domain(struct device *dev, struct iommu_domain *domain) 5259{ 5260 struct device_domain_info *info = get_domain_info(dev); 5261 5262 return info && info->auxd_enabled && 5263 domain->type == IOMMU_DOMAIN_UNMANAGED; 5264} 5265 5266static void auxiliary_link_device(struct dmar_domain *domain, 5267 struct device *dev) 5268{ 5269 struct device_domain_info *info = get_domain_info(dev); 5270 5271 assert_spin_locked(&device_domain_lock); 5272 if (WARN_ON(!info)) 5273 return; 5274 5275 domain->auxd_refcnt++; 5276 list_add(&domain->auxd, &info->auxiliary_domains); 5277} 5278 5279static void auxiliary_unlink_device(struct dmar_domain *domain, 5280 struct device *dev) 5281{ 5282 struct device_domain_info *info = get_domain_info(dev); 5283 5284 assert_spin_locked(&device_domain_lock); 5285 if (WARN_ON(!info)) 5286 return; 5287 5288 list_del(&domain->auxd); 5289 domain->auxd_refcnt--; 5290 5291 if (!domain->auxd_refcnt && domain->default_pasid > 0) 5292 ioasid_free(domain->default_pasid); 5293} 5294 5295static int aux_domain_add_dev(struct dmar_domain *domain, 5296 struct device *dev) 5297{ 5298 int ret; 5299 unsigned long flags; 5300 struct intel_iommu *iommu; 5301 5302 iommu = device_to_iommu(dev, NULL, NULL); 5303 if (!iommu) 5304 return -ENODEV; 5305 5306 if (domain->default_pasid <= 0) { 5307 u32 pasid; 5308 5309 /* No private data needed for the default pasid */ 5310 pasid = ioasid_alloc(NULL, PASID_MIN, 5311 pci_max_pasids(to_pci_dev(dev)) - 1, 5312 NULL); 5313 if (pasid == INVALID_IOASID) { 5314 pr_err("Can't allocate default pasid\n"); 5315 return -ENODEV; 5316 } 5317 domain->default_pasid = pasid; 5318 } 5319 5320 spin_lock_irqsave(&device_domain_lock, flags); 5321 /* 5322 * iommu->lock must be held to attach domain to iommu and setup the 5323 * pasid entry for second level translation. 
5324 */ 5325 spin_lock(&iommu->lock); 5326 ret = domain_attach_iommu(domain, iommu); 5327 if (ret) 5328 goto attach_failed; 5329 5330 /* Setup the PASID entry for mediated devices: */ 5331 if (domain_use_first_level(domain)) 5332 ret = domain_setup_first_level(iommu, domain, dev, 5333 domain->default_pasid); 5334 else 5335 ret = intel_pasid_setup_second_level(iommu, domain, dev, 5336 domain->default_pasid); 5337 if (ret) 5338 goto table_failed; 5339 spin_unlock(&iommu->lock); 5340 5341 auxiliary_link_device(domain, dev); 5342 5343 spin_unlock_irqrestore(&device_domain_lock, flags); 5344 5345 return 0; 5346 5347table_failed: 5348 domain_detach_iommu(domain, iommu); 5349attach_failed: 5350 spin_unlock(&iommu->lock); 5351 spin_unlock_irqrestore(&device_domain_lock, flags); 5352 if (!domain->auxd_refcnt && domain->default_pasid > 0) 5353 ioasid_free(domain->default_pasid); 5354 5355 return ret; 5356} 5357 5358static void aux_domain_remove_dev(struct dmar_domain *domain, 5359 struct device *dev) 5360{ 5361 struct device_domain_info *info; 5362 struct intel_iommu *iommu; 5363 unsigned long flags; 5364 5365 if (!is_aux_domain(dev, &domain->domain)) 5366 return; 5367 5368 spin_lock_irqsave(&device_domain_lock, flags); 5369 info = get_domain_info(dev); 5370 iommu = info->iommu; 5371 5372 auxiliary_unlink_device(domain, dev); 5373 5374 spin_lock(&iommu->lock); 5375 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false); 5376 domain_detach_iommu(domain, iommu); 5377 spin_unlock(&iommu->lock); 5378 5379 spin_unlock_irqrestore(&device_domain_lock, flags); 5380} 5381 5382static int prepare_domain_attach_device(struct iommu_domain *domain, 5383 struct device *dev) 5384{ 5385 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5386 struct intel_iommu *iommu; 5387 int addr_width; 5388 5389 iommu = device_to_iommu(dev, NULL, NULL); 5390 if (!iommu) 5391 return -ENODEV; 5392 5393 /* check if this iommu agaw is sufficient for max mapped address */ 5394 addr_width = agaw_to_width(iommu->agaw); 5395 if (addr_width > cap_mgaw(iommu->cap)) 5396 addr_width = cap_mgaw(iommu->cap); 5397 5398 if (dmar_domain->max_addr > (1LL << addr_width)) { 5399 dev_err(dev, "%s: iommu width (%d) is not " 5400 "sufficient for the mapped address (%llx)\n", 5401 __func__, addr_width, dmar_domain->max_addr); 5402 return -EFAULT; 5403 } 5404 dmar_domain->gaw = addr_width; 5405 5406 /* 5407 * Knock out extra levels of page tables if necessary 5408 */ 5409 while (iommu->agaw < dmar_domain->agaw) { 5410 struct dma_pte *pte; 5411 5412 pte = dmar_domain->pgd; 5413 if (dma_pte_present(pte)) { 5414 dmar_domain->pgd = (struct dma_pte *) 5415 phys_to_virt(dma_pte_addr(pte)); 5416 free_pgtable_page(pte); 5417 } 5418 dmar_domain->agaw--; 5419 } 5420 5421 return 0; 5422} 5423 5424static int intel_iommu_attach_device(struct iommu_domain *domain, 5425 struct device *dev) 5426{ 5427 int ret; 5428 5429 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 5430 device_is_rmrr_locked(dev)) { 5431 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. 
Contact your platform vendor.\n"); 5432 return -EPERM; 5433 } 5434 5435 if (is_aux_domain(dev, domain)) 5436 return -EPERM; 5437 5438 /* normally dev is not mapped */ 5439 if (unlikely(domain_context_mapped(dev))) { 5440 struct dmar_domain *old_domain; 5441 5442 old_domain = find_domain(dev); 5443 if (old_domain) 5444 dmar_remove_one_dev_info(dev); 5445 } 5446 5447 ret = prepare_domain_attach_device(domain, dev); 5448 if (ret) 5449 return ret; 5450 5451 return domain_add_dev_info(to_dmar_domain(domain), dev); 5452} 5453 5454static int intel_iommu_aux_attach_device(struct iommu_domain *domain, 5455 struct device *dev) 5456{ 5457 int ret; 5458 5459 if (!is_aux_domain(dev, domain)) 5460 return -EPERM; 5461 5462 ret = prepare_domain_attach_device(domain, dev); 5463 if (ret) 5464 return ret; 5465 5466 return aux_domain_add_dev(to_dmar_domain(domain), dev); 5467} 5468 5469static void intel_iommu_detach_device(struct iommu_domain *domain, 5470 struct device *dev) 5471{ 5472 dmar_remove_one_dev_info(dev); 5473} 5474 5475static void intel_iommu_aux_detach_device(struct iommu_domain *domain, 5476 struct device *dev) 5477{ 5478 aux_domain_remove_dev(to_dmar_domain(domain), dev); 5479} 5480 5481#ifdef CONFIG_INTEL_IOMMU_SVM 5482/* 5483 * 2D array for converting and sanitizing IOMMU generic TLB granularity to 5484 * VT-d granularity. Invalidation is typically included in the unmap operation 5485 * as a result of DMA or VFIO unmap. However, for assigned devices guest 5486 * owns the first level page tables. Invalidations of translation caches in the 5487 * guest are trapped and passed down to the host. 5488 * 5489 * vIOMMU in the guest will only expose first level page tables, therefore 5490 * we do not support IOTLB granularity for request without PASID (second level). 5491 * 5492 * For example, to find the VT-d granularity encoding for IOTLB 5493 * type and page selective granularity within PASID: 5494 * X: indexed by iommu cache type 5495 * Y: indexed by enum iommu_inv_granularity 5496 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] 5497 */ 5498 5499static const int 5500inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = { 5501 /* 5502 * PASID based IOTLB invalidation: PASID selective (per PASID), 5503 * page selective (address granularity) 5504 */ 5505 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}, 5506 /* PASID based dev TLBs */ 5507 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL}, 5508 /* PASID cache */ 5509 {-EINVAL, -EINVAL, -EINVAL} 5510}; 5511 5512static inline int to_vtd_granularity(int type, int granu) 5513{ 5514 return inv_type_granu_table[type][granu]; 5515} 5516 5517static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules) 5518{ 5519 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT; 5520 5521 /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc. 5522 * IOMMU cache invalidate API passes granu_size in bytes, and number of 5523 * granu size in contiguous memory. 
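 * For example, granu_size = 4KiB and nr_granules = 512 gives
 * nr_pages = 512, so the function returns order_base_2(512) = 9,
 * i.e. a 2MiB invalidation size.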
5524 */ 5525 return order_base_2(nr_pages); 5526} 5527 5528static int 5529intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, 5530 struct iommu_cache_invalidate_info *inv_info) 5531{ 5532 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5533 struct device_domain_info *info; 5534 struct intel_iommu *iommu; 5535 unsigned long flags; 5536 int cache_type; 5537 u8 bus, devfn; 5538 u16 did, sid; 5539 int ret = 0; 5540 u64 size = 0; 5541 5542 if (!inv_info || !dmar_domain) 5543 return -EINVAL; 5544 5545 if (!dev || !dev_is_pci(dev)) 5546 return -ENODEV; 5547 5548 iommu = device_to_iommu(dev, &bus, &devfn); 5549 if (!iommu) 5550 return -ENODEV; 5551 5552 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) 5553 return -EINVAL; 5554 5555 spin_lock_irqsave(&device_domain_lock, flags); 5556 spin_lock(&iommu->lock); 5557 info = get_domain_info(dev); 5558 if (!info) { 5559 ret = -EINVAL; 5560 goto out_unlock; 5561 } 5562 did = dmar_domain->iommu_did[iommu->seq_id]; 5563 sid = PCI_DEVID(bus, devfn); 5564 5565 /* Size is only valid in address selective invalidation */ 5566 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) 5567 size = to_vtd_size(inv_info->granu.addr_info.granule_size, 5568 inv_info->granu.addr_info.nb_granules); 5569 5570 for_each_set_bit(cache_type, 5571 (unsigned long *)&inv_info->cache, 5572 IOMMU_CACHE_INV_TYPE_NR) { 5573 int granu = 0; 5574 u64 pasid = 0; 5575 u64 addr = 0; 5576 5577 granu = to_vtd_granularity(cache_type, inv_info->granularity); 5578 if (granu == -EINVAL) { 5579 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n", 5580 cache_type, inv_info->granularity); 5581 break; 5582 } 5583 5584 /* 5585 * PASID is stored in different locations based on the 5586 * granularity. 5587 */ 5588 if (inv_info->granularity == IOMMU_INV_GRANU_PASID && 5589 (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) 5590 pasid = inv_info->granu.pasid_info.pasid; 5591 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 5592 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) 5593 pasid = inv_info->granu.addr_info.pasid; 5594 5595 switch (BIT(cache_type)) { 5596 case IOMMU_CACHE_INV_TYPE_IOTLB: 5597 /* HW will ignore LSB bits based on address mask */ 5598 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 5599 size && 5600 (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { 5601 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n", 5602 inv_info->granu.addr_info.addr, size); 5603 } 5604 5605 /* 5606 * If granu is PASID-selective, address is ignored. 5607 * We use npages = -1 to indicate that. 5608 */ 5609 qi_flush_piotlb(iommu, did, pasid, 5610 mm_to_dma_pfn(inv_info->granu.addr_info.addr), 5611 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, 5612 inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); 5613 5614 if (!info->ats_enabled) 5615 break; 5616 /* 5617 * Always flush device IOTLB if ATS is enabled. vIOMMU 5618 * in the guest may assume IOTLB flush is inclusive, 5619 * which is more efficient. 5620 */ 5621 fallthrough; 5622 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: 5623 /* 5624 * PASID based device TLB invalidation does not support 5625 * IOMMU_INV_GRANU_PASID granularity but only supports 5626 * IOMMU_INV_GRANU_ADDR. 5627 * The equivalent of that is we set the size to be the 5628 * entire range of 64 bit. User only provides PASID info 5629 * without address info. So we set addr to 0. 
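 * With addr == 0 and size == 64 - VTD_PAGE_SHIFT the descriptor built
 * below covers the device's whole 64-bit address space for that PASID.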
5630 */ 5631 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) { 5632 size = 64 - VTD_PAGE_SHIFT; 5633 addr = 0; 5634 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { 5635 addr = inv_info->granu.addr_info.addr; 5636 } 5637 5638 if (info->ats_enabled) 5639 qi_flush_dev_iotlb_pasid(iommu, sid, 5640 info->pfsid, pasid, 5641 info->ats_qdep, addr, 5642 size); 5643 else 5644 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); 5645 break; 5646 default: 5647 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n", 5648 cache_type); 5649 ret = -EINVAL; 5650 } 5651 } 5652out_unlock: 5653 spin_unlock(&iommu->lock); 5654 spin_unlock_irqrestore(&device_domain_lock, flags); 5655 5656 return ret; 5657} 5658#endif 5659 5660static int intel_iommu_map(struct iommu_domain *domain, 5661 unsigned long iova, phys_addr_t hpa, 5662 size_t size, int iommu_prot, gfp_t gfp) 5663{ 5664 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5665 u64 max_addr; 5666 int prot = 0; 5667 int ret; 5668 5669 if (iommu_prot & IOMMU_READ) 5670 prot |= DMA_PTE_READ; 5671 if (iommu_prot & IOMMU_WRITE) 5672 prot |= DMA_PTE_WRITE; 5673 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) 5674 prot |= DMA_PTE_SNP; 5675 5676 max_addr = iova + size; 5677 if (dmar_domain->max_addr < max_addr) { 5678 u64 end; 5679 5680 /* check if minimum agaw is sufficient for mapped address */ 5681 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 5682 if (end < max_addr) { 5683 pr_err("%s: iommu width (%d) is not " 5684 "sufficient for the mapped address (%llx)\n", 5685 __func__, dmar_domain->gaw, max_addr); 5686 return -EFAULT; 5687 } 5688 dmar_domain->max_addr = max_addr; 5689 } 5690 /* Round up size to next multiple of PAGE_SIZE, if it and 5691 the low bits of hpa would take us onto the next page */ 5692 size = aligned_nrpages(hpa, size); 5693 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 5694 hpa >> VTD_PAGE_SHIFT, size, prot); 5695 return ret; 5696} 5697 5698static size_t intel_iommu_unmap(struct iommu_domain *domain, 5699 unsigned long iova, size_t size, 5700 struct iommu_iotlb_gather *gather) 5701{ 5702 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5703 struct page *freelist = NULL; 5704 unsigned long start_pfn, last_pfn; 5705 unsigned int npages; 5706 int iommu_id, level = 0; 5707 5708 /* Cope with horrid API which requires us to unmap more than the 5709 size argument if it happens to be a large-page mapping. 
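 * For example, a 4KiB unmap request that falls inside a 2MiB superpage
 * tears down, flushes and reports the whole 2MiB range.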
*/ 5710 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); 5711 5712 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 5713 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 5714 5715 start_pfn = iova >> VTD_PAGE_SHIFT; 5716 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 5717 5718 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn); 5719 5720 npages = last_pfn - start_pfn + 1; 5721 5722 for_each_domain_iommu(iommu_id, dmar_domain) 5723 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, 5724 start_pfn, npages, !freelist, 0); 5725 5726 dma_free_pagelist(freelist); 5727 5728 if (dmar_domain->max_addr == iova + size) 5729 dmar_domain->max_addr = iova; 5730 5731 return size; 5732} 5733 5734static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 5735 dma_addr_t iova) 5736{ 5737 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5738 struct dma_pte *pte; 5739 int level = 0; 5740 u64 phys = 0; 5741 5742 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); 5743 if (pte && dma_pte_present(pte)) 5744 phys = dma_pte_addr(pte) + 5745 (iova & (BIT_MASK(level_to_offset_bits(level) + 5746 VTD_PAGE_SHIFT) - 1)); 5747 5748 return phys; 5749} 5750 5751static inline bool scalable_mode_support(void) 5752{ 5753 struct dmar_drhd_unit *drhd; 5754 struct intel_iommu *iommu; 5755 bool ret = true; 5756 5757 rcu_read_lock(); 5758 for_each_active_iommu(iommu, drhd) { 5759 if (!sm_supported(iommu)) { 5760 ret = false; 5761 break; 5762 } 5763 } 5764 rcu_read_unlock(); 5765 5766 return ret; 5767} 5768 5769static inline bool iommu_pasid_support(void) 5770{ 5771 struct dmar_drhd_unit *drhd; 5772 struct intel_iommu *iommu; 5773 bool ret = true; 5774 5775 rcu_read_lock(); 5776 for_each_active_iommu(iommu, drhd) { 5777 if (!pasid_supported(iommu)) { 5778 ret = false; 5779 break; 5780 } 5781 } 5782 rcu_read_unlock(); 5783 5784 return ret; 5785} 5786 5787static inline bool nested_mode_support(void) 5788{ 5789 struct dmar_drhd_unit *drhd; 5790 struct intel_iommu *iommu; 5791 bool ret = true; 5792 5793 rcu_read_lock(); 5794 for_each_active_iommu(iommu, drhd) { 5795 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) { 5796 ret = false; 5797 break; 5798 } 5799 } 5800 rcu_read_unlock(); 5801 5802 return ret; 5803} 5804 5805static bool intel_iommu_capable(enum iommu_cap cap) 5806{ 5807 if (cap == IOMMU_CAP_CACHE_COHERENCY) 5808 return domain_update_iommu_snooping(NULL) == 1; 5809 if (cap == IOMMU_CAP_INTR_REMAP) 5810 return irq_remapping_enabled == 1; 5811 5812 return false; 5813} 5814 5815static struct iommu_device *intel_iommu_probe_device(struct device *dev) 5816{ 5817 struct intel_iommu *iommu; 5818 5819 iommu = device_to_iommu(dev, NULL, NULL); 5820 if (!iommu) 5821 return ERR_PTR(-ENODEV); 5822 5823 if (translation_pre_enabled(iommu)) 5824 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO); 5825 5826 return &iommu->iommu; 5827} 5828 5829static void intel_iommu_release_device(struct device *dev) 5830{ 5831 struct intel_iommu *iommu; 5832 5833 iommu = device_to_iommu(dev, NULL, NULL); 5834 if (!iommu) 5835 return; 5836 5837 dmar_remove_one_dev_info(dev); 5838 5839 set_dma_ops(dev, NULL); 5840} 5841 5842static void intel_iommu_probe_finalize(struct device *dev) 5843{ 5844 struct iommu_domain *domain; 5845 5846 domain = iommu_get_domain_for_dev(dev); 5847 if (device_needs_bounce(dev)) 5848 set_dma_ops(dev, &bounce_dma_ops); 5849 else if (domain && domain->type == IOMMU_DOMAIN_DMA) 5850 set_dma_ops(dev, &intel_dma_ops); 5851 else 5852 set_dma_ops(dev, 
NULL); 5853} 5854 5855static void intel_iommu_get_resv_regions(struct device *device, 5856 struct list_head *head) 5857{ 5858 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 5859 struct iommu_resv_region *reg; 5860 struct dmar_rmrr_unit *rmrr; 5861 struct device *i_dev; 5862 int i; 5863 5864 down_read(&dmar_global_lock); 5865 for_each_rmrr_units(rmrr) { 5866 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 5867 i, i_dev) { 5868 struct iommu_resv_region *resv; 5869 enum iommu_resv_type type; 5870 size_t length; 5871 5872 if (i_dev != device && 5873 !is_downstream_to_pci_bridge(device, i_dev)) 5874 continue; 5875 5876 length = rmrr->end_address - rmrr->base_address + 1; 5877 5878 type = device_rmrr_is_relaxable(device) ? 5879 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 5880 5881 resv = iommu_alloc_resv_region(rmrr->base_address, 5882 length, prot, type); 5883 if (!resv) 5884 break; 5885 5886 list_add_tail(&resv->list, head); 5887 } 5888 } 5889 up_read(&dmar_global_lock); 5890 5891#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 5892 if (dev_is_pci(device)) { 5893 struct pci_dev *pdev = to_pci_dev(device); 5894 5895 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 5896 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 5897 IOMMU_RESV_DIRECT_RELAXABLE); 5898 if (reg) 5899 list_add_tail(&reg->list, head); 5900 } 5901 } 5902#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 5903 5904 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 5905 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 5906 0, IOMMU_RESV_MSI); 5907 if (!reg) 5908 return; 5909 list_add_tail(&reg->list, head); 5910} 5911 5912int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) 5913{ 5914 struct device_domain_info *info; 5915 struct context_entry *context; 5916 struct dmar_domain *domain; 5917 unsigned long flags; 5918 u64 ctx_lo; 5919 int ret; 5920 5921 domain = find_domain(dev); 5922 if (!domain) 5923 return -EINVAL; 5924 5925 spin_lock_irqsave(&device_domain_lock, flags); 5926 spin_lock(&iommu->lock); 5927 5928 ret = -EINVAL; 5929 info = get_domain_info(dev); 5930 if (!info || !info->pasid_supported) 5931 goto out; 5932 5933 context = iommu_context_addr(iommu, info->bus, info->devfn, 0); 5934 if (WARN_ON(!context)) 5935 goto out; 5936 5937 ctx_lo = context[0].lo; 5938 5939 if (!(ctx_lo & CONTEXT_PASIDE)) { 5940 ctx_lo |= CONTEXT_PASIDE; 5941 context[0].lo = ctx_lo; 5942 wmb(); 5943 iommu->flush.flush_context(iommu, 5944 domain->iommu_did[iommu->seq_id], 5945 PCI_DEVID(info->bus, info->devfn), 5946 DMA_CCMD_MASK_NOBIT, 5947 DMA_CCMD_DEVICE_INVL); 5948 } 5949 5950 /* Enable PASID support in the device, if it wasn't already */ 5951 if (!info->pasid_enabled) 5952 iommu_enable_dev_iotlb(info); 5953 5954 ret = 0; 5955 5956 out: 5957 spin_unlock(&iommu->lock); 5958 spin_unlock_irqrestore(&device_domain_lock, flags); 5959 5960 return ret; 5961} 5962 5963static void intel_iommu_apply_resv_region(struct device *dev, 5964 struct iommu_domain *domain, 5965 struct iommu_resv_region *region) 5966{ 5967 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5968 unsigned long start, end; 5969 5970 start = IOVA_PFN(region->start); 5971 end = IOVA_PFN(region->start + region->length - 1); 5972 5973 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end)); 5974} 5975 5976static struct iommu_group *intel_iommu_device_group(struct device *dev) 5977{ 5978 if (dev_is_pci(dev)) 5979 return pci_device_group(dev); 5980 return generic_device_group(dev); 5981} 5982 5983static int intel_iommu_enable_auxd(struct device *dev) 5984{ 5985 struct
device_domain_info *info; 5986 struct intel_iommu *iommu; 5987 unsigned long flags; 5988 int ret; 5989 5990 iommu = device_to_iommu(dev, NULL, NULL); 5991 if (!iommu || dmar_disabled) 5992 return -EINVAL; 5993 5994 if (!sm_supported(iommu) || !pasid_supported(iommu)) 5995 return -EINVAL; 5996 5997 ret = intel_iommu_enable_pasid(iommu, dev); 5998 if (ret) 5999 return -ENODEV; 6000 6001 spin_lock_irqsave(&device_domain_lock, flags); 6002 info = get_domain_info(dev); 6003 info->auxd_enabled = 1; 6004 spin_unlock_irqrestore(&device_domain_lock, flags); 6005 6006 return 0; 6007} 6008 6009static int intel_iommu_disable_auxd(struct device *dev) 6010{ 6011 struct device_domain_info *info; 6012 unsigned long flags; 6013 6014 spin_lock_irqsave(&device_domain_lock, flags); 6015 info = get_domain_info(dev); 6016 if (!WARN_ON(!info)) 6017 info->auxd_enabled = 0; 6018 spin_unlock_irqrestore(&device_domain_lock, flags); 6019 6020 return 0; 6021} 6022 6023/* 6024 * A PCI express designated vendor specific extended capability is defined 6025 * in the section 3.7 of Intel scalable I/O virtualization technical spec 6026 * for system software and tools to detect endpoint devices supporting the 6027 * Intel scalable IO virtualization without host driver dependency. 6028 * 6029 * Returns the address of the matching extended capability structure within 6030 * the device's PCI configuration space or 0 if the device does not support 6031 * it. 6032 */ 6033static int siov_find_pci_dvsec(struct pci_dev *pdev) 6034{ 6035 int pos; 6036 u16 vendor, id; 6037 6038 pos = pci_find_next_ext_capability(pdev, 0, 0x23); 6039 while (pos) { 6040 pci_read_config_word(pdev, pos + 4, &vendor); 6041 pci_read_config_word(pdev, pos + 8, &id); 6042 if (vendor == PCI_VENDOR_ID_INTEL && id == 5) 6043 return pos; 6044 6045 pos = pci_find_next_ext_capability(pdev, pos, 0x23); 6046 } 6047 6048 return 0; 6049} 6050 6051static bool 6052intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) 6053{ 6054 if (feat == IOMMU_DEV_FEAT_AUX) { 6055 int ret; 6056 6057 if (!dev_is_pci(dev) || dmar_disabled || 6058 !scalable_mode_support() || !iommu_pasid_support()) 6059 return false; 6060 6061 ret = pci_pasid_features(to_pci_dev(dev)); 6062 if (ret < 0) 6063 return false; 6064 6065 return !!siov_find_pci_dvsec(to_pci_dev(dev)); 6066 } 6067 6068 if (feat == IOMMU_DEV_FEAT_SVA) { 6069 struct device_domain_info *info = get_domain_info(dev); 6070 6071 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) && 6072 info->pasid_supported && info->pri_supported && 6073 info->ats_supported; 6074 } 6075 6076 return false; 6077} 6078 6079static int 6080intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 6081{ 6082 if (feat == IOMMU_DEV_FEAT_AUX) 6083 return intel_iommu_enable_auxd(dev); 6084 6085 if (feat == IOMMU_DEV_FEAT_SVA) { 6086 struct device_domain_info *info = get_domain_info(dev); 6087 6088 if (!info) 6089 return -EINVAL; 6090 6091 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) 6092 return 0; 6093 } 6094 6095 return -ENODEV; 6096} 6097 6098static int 6099intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 6100{ 6101 if (feat == IOMMU_DEV_FEAT_AUX) 6102 return intel_iommu_disable_auxd(dev); 6103 6104 return -ENODEV; 6105} 6106 6107static bool 6108intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) 6109{ 6110 struct device_domain_info *info = get_domain_info(dev); 6111 6112 if (feat == IOMMU_DEV_FEAT_AUX) 6113 return scalable_mode_support() && info && 
static bool
intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
{
	struct device_domain_info *info = get_domain_info(dev);

	if (feat == IOMMU_DEV_FEAT_AUX)
		return scalable_mode_support() && info && info->auxd_enabled;

	return false;
}

static int
intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	return dmar_domain->default_pasid > 0 ?
			dmar_domain->default_pasid : -EINVAL;
}

static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
					   struct device *dev)
{
	return attach_deferred(dev);
}

static int
intel_iommu_domain_set_attr(struct iommu_domain *domain,
			    enum iommu_attr attr, void *data)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long flags;
	int ret = 0;

	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
		return -EINVAL;

	switch (attr) {
	case DOMAIN_ATTR_NESTING:
		spin_lock_irqsave(&device_domain_lock, flags);
		if (nested_mode_support() &&
		    list_empty(&dmar_domain->devices)) {
			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
		} else {
			ret = -ENODEV;
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Check that the device does not live on an external facing PCI port that is
 * marked as untrusted. Such devices should not be able to apply quirks and
 * thus not be able to bypass the IOMMU restrictions.
 */
static bool risky_device(struct pci_dev *pdev)
{
	if (pdev->untrusted) {
		pci_info(pdev,
			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
			 pdev->vendor, pdev->device);
		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
		return true;
	}
	return false;
}

const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.domain_set_attr	= intel_iommu_domain_set_attr,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.aux_attach_dev		= intel_iommu_aux_attach_device,
	.aux_detach_dev		= intel_iommu_aux_detach_device,
	.aux_get_pasid		= intel_iommu_aux_get_pasid,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= generic_iommu_put_resv_regions,
	.apply_resv_region	= intel_iommu_apply_resv_region,
	.device_group		= intel_iommu_device_group,
	.dev_has_feat		= intel_iommu_dev_has_feat,
	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.cache_invalidate	= intel_iommu_sva_invalidate,
	.sva_bind_gpasid	= intel_svm_bind_gpasid,
	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
	.sva_bind		= intel_svm_bind,
	.sva_unbind		= intel_svm_unbind,
	.sva_get_pasid		= intel_svm_get_pasid,
	.page_response		= intel_svm_page_response,
#endif
};
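/*
 * Graphics quirk: clearing dmar_map_gfx tells the driver not to put the
 * integrated graphics device behind DMA remapping (it is left identity
 * mapped instead), working around chipsets whose gfx DMAR support is
 * unreliable.
 */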
static void quirk_iommu_igfx(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
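/*
 * rwbf_quirk is consulted alongside the RWBF bit in the capability register,
 * so setting it forces a hardware write-buffer flush even on chipsets that
 * fail to advertise the capability.
 */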
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (risky_device(dev))
		return;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
	unsigned short ver;

	if (!IS_GFX_DEVICE(dev))
		return;

	ver = (dev->device >> 8) & 0xff;
	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
		return;

	if (risky_device(dev))
		return;

	pci_info(dev, "Skip IOMMU disabling for graphics\n");
	iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that.  We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
	       vtisochctrl);
}