1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 *          Ashok Raj <ashok.raj@intel.com>,
7 *          Shaohua Li <shaohua.li@intel.com>,
8 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 *          Fenghua Yu <fenghua.yu@intel.com>
10 *          Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt)     "DMAR: " fmt
14#define dev_fmt(fmt)    pr_fmt(fmt)
15
16#include <linux/crash_dump.h>
17#include <linux/dma-direct.h>
18#include <linux/dmi.h>
19#include <linux/memory.h>
20#include <linux/pci.h>
21#include <linux/pci-ats.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24#include <linux/tboot.h>
25#include <uapi/linux/iommufd.h>
26
27#include "iommu.h"
28#include "../dma-iommu.h"
29#include "../irq_remapping.h"
30#include "../iommu-sva.h"
31#include "pasid.h"
32#include "cap_audit.h"
33#include "perfmon.h"
34
35#define ROOT_SIZE		VTD_PAGE_SIZE
36#define CONTEXT_SIZE		VTD_PAGE_SIZE
37
38#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42
43#define IOAPIC_RANGE_START	(0xfee00000)
44#define IOAPIC_RANGE_END	(0xfeefffff)
45#define IOVA_START_ADDR		(0x1000)
46
47#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48
49#define MAX_AGAW_WIDTH 64
50#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
51
52#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
53#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
54
55/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
56   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
57#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
58				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
59#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
60
61/* IO virtual address start page frame number */
62#define IOVA_START_PFN		(1)
63
64#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
65
66/* page table handling */
67#define LEVEL_STRIDE		(9)
68#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
69
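/*
 * AGAW/level/width helpers. An AGAW value of N corresponds to (N + 2)
 * page-table levels and an address width of 30 + N * LEVEL_STRIDE bits,
 * e.g. AGAW 2 => 4-level paging => 48-bit addresses, AGAW 3 => 5-level
 * paging => 57-bit addresses.
 */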
70static inline int agaw_to_level(int agaw)
71{
72	return agaw + 2;
73}
74
75static inline int agaw_to_width(int agaw)
76{
77	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
78}
79
80static inline int width_to_agaw(int width)
81{
82	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
83}
84
85static inline unsigned int level_to_offset_bits(int level)
86{
87	return (level - 1) * LEVEL_STRIDE;
88}
89
90static inline int pfn_level_offset(u64 pfn, int level)
91{
92	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
93}
94
95static inline u64 level_mask(int level)
96{
97	return -1ULL << level_to_offset_bits(level);
98}
99
100static inline u64 level_size(int level)
101{
102	return 1ULL << level_to_offset_bits(level);
103}
104
105static inline u64 align_to_level(u64 pfn, int level)
106{
107	return (pfn + level_size(level) - 1) & level_mask(level);
108}
109
110static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111{
112	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
113}
114
115/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
116   are never going to work. */
117static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
118{
119	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120}
121static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
122{
123	return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
124}
125static inline unsigned long page_to_dma_pfn(struct page *pg)
126{
127	return mm_to_dma_pfn_start(page_to_pfn(pg));
128}
129static inline unsigned long virt_to_dma_pfn(void *p)
130{
131	return page_to_dma_pfn(virt_to_page(p));
132}
133
134static void __init check_tylersburg_isoch(void);
135static int rwbf_quirk;
136
/*
 * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
 * (used when the kernel is launched with TXT).
 */
141static int force_on = 0;
142static int intel_iommu_tboot_noforce;
143static int no_platform_optin;
144
145#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146
147/*
148 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149 * if marked present.
150 */
151static phys_addr_t root_entry_lctp(struct root_entry *re)
152{
153	if (!(re->lo & 1))
154		return 0;
155
156	return re->lo & VTD_PAGE_MASK;
157}
158
159/*
160 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161 * if marked present.
162 */
163static phys_addr_t root_entry_uctp(struct root_entry *re)
164{
165	if (!(re->hi & 1))
166		return 0;
167
168	return re->hi & VTD_PAGE_MASK;
169}
170
171static inline void context_set_present(struct context_entry *context)
172{
173	context->lo |= 1;
174}
175
176static inline void context_set_fault_enable(struct context_entry *context)
177{
178	context->lo &= (((u64)-1) << 2) | 1;
179}
180
181static inline void context_set_translation_type(struct context_entry *context,
182						unsigned long value)
183{
184	context->lo &= (((u64)-1) << 4) | 3;
185	context->lo |= (value & 3) << 2;
186}
187
188static inline void context_set_address_root(struct context_entry *context,
189					    unsigned long value)
190{
191	context->lo &= ~VTD_PAGE_MASK;
192	context->lo |= value & VTD_PAGE_MASK;
193}
194
195static inline void context_set_address_width(struct context_entry *context,
196					     unsigned long value)
197{
198	context->hi |= value & 7;
199}
200
201static inline void context_set_domain_id(struct context_entry *context,
202					 unsigned long value)
203{
204	context->hi |= (value & ((1 << 16) - 1)) << 8;
205}
206
207static inline void context_set_pasid(struct context_entry *context)
208{
209	context->lo |= CONTEXT_PASIDE;
210}
211
212static inline int context_domain_id(struct context_entry *c)
213{
214	return((c->hi >> 8) & 0xffff);
215}
216
217static inline void context_clear_entry(struct context_entry *context)
218{
219	context->lo = 0;
220	context->hi = 0;
221}
222
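/*
 * Helpers to track context entries copied from a previous kernel (kdump).
 * A per-(bus, devfn) bitmap records them so that stale copied entries can
 * be detected and replaced when the device is first attached.
 */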
223static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
224{
225	if (!iommu->copied_tables)
226		return false;
227
228	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
229}
230
231static inline void
232set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
233{
234	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
235}
236
237static inline void
238clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
239{
240	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
241}
242
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
249static struct dmar_domain *si_domain;
250static int hw_pass_through = 1;
251
252struct dmar_rmrr_unit {
253	struct list_head list;		/* list of rmrr units	*/
254	struct acpi_dmar_header *hdr;	/* ACPI header		*/
255	u64	base_address;		/* reserved base address*/
256	u64	end_address;		/* reserved end address */
257	struct dmar_dev_scope *devices;	/* target devices */
258	int	devices_cnt;		/* target device count */
259};
260
261struct dmar_atsr_unit {
262	struct list_head list;		/* list of ATSR units */
263	struct acpi_dmar_header *hdr;	/* ACPI header */
264	struct dmar_dev_scope *devices;	/* target devices */
265	int devices_cnt;		/* target device count */
266	u8 include_all:1;		/* include all ports */
267};
268
269struct dmar_satc_unit {
270	struct list_head list;		/* list of SATC units */
271	struct acpi_dmar_header *hdr;	/* ACPI header */
272	struct dmar_dev_scope *devices;	/* target devices */
273	struct intel_iommu *iommu;	/* the corresponding iommu */
274	int devices_cnt;		/* target device count */
275	u8 atc_required:1;		/* ATS is required */
276};
277
278static LIST_HEAD(dmar_atsr_units);
279static LIST_HEAD(dmar_rmrr_units);
280static LIST_HEAD(dmar_satc_units);
281
282#define for_each_rmrr_units(rmrr) \
283	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
284
285static void device_block_translation(struct device *dev);
286static void intel_iommu_domain_free(struct iommu_domain *domain);
287
288int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
289int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
290
291int intel_iommu_enabled = 0;
292EXPORT_SYMBOL_GPL(intel_iommu_enabled);
293
294static int dmar_map_gfx = 1;
295static int intel_iommu_superpage = 1;
296static int iommu_identity_mapping;
297static int iommu_skip_te_disable;
298
299#define IDENTMAP_GFX		2
300#define IDENTMAP_AZALIA		4
301
302const struct iommu_ops intel_iommu_ops;
303
304static bool translation_pre_enabled(struct intel_iommu *iommu)
305{
306	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
307}
308
309static void clear_translation_pre_enabled(struct intel_iommu *iommu)
310{
311	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
312}
313
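/*
 * Record whether DMA remapping was already enabled (e.g. by firmware or a
 * previous kernel) before this kernel takes over the hardware.
 */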
314static void init_translation_status(struct intel_iommu *iommu)
315{
316	u32 gsts;
317
318	gsts = readl(iommu->reg + DMAR_GSTS_REG);
319	if (gsts & DMA_GSTS_TES)
320		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
321}
322
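/*
 * Parse the "intel_iommu=" kernel command line options. Multiple options
 * may be combined with commas, e.g. "intel_iommu=on,sm_on".
 */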
323static int __init intel_iommu_setup(char *str)
324{
325	if (!str)
326		return -EINVAL;
327
328	while (*str) {
329		if (!strncmp(str, "on", 2)) {
330			dmar_disabled = 0;
331			pr_info("IOMMU enabled\n");
332		} else if (!strncmp(str, "off", 3)) {
333			dmar_disabled = 1;
334			no_platform_optin = 1;
335			pr_info("IOMMU disabled\n");
336		} else if (!strncmp(str, "igfx_off", 8)) {
337			dmar_map_gfx = 0;
338			pr_info("Disable GFX device mapping\n");
339		} else if (!strncmp(str, "forcedac", 8)) {
340			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
341			iommu_dma_forcedac = true;
342		} else if (!strncmp(str, "strict", 6)) {
343			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
344			iommu_set_dma_strict();
345		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disabling super page support\n");
347			intel_iommu_superpage = 0;
348		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Enable scalable mode if hardware supports it\n");
350			intel_iommu_sm = 1;
351		} else if (!strncmp(str, "sm_off", 6)) {
352			pr_info("Scalable mode is disallowed\n");
353			intel_iommu_sm = 0;
354		} else if (!strncmp(str, "tboot_noforce", 13)) {
			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose a security risk for tboot\n");
356			intel_iommu_tboot_noforce = 1;
357		} else {
358			pr_notice("Unknown option - '%s'\n", str);
359		}
360
361		str += strcspn(str, ",");
362		while (*str == ',')
363			str++;
364	}
365
366	return 1;
367}
368__setup("intel_iommu=", intel_iommu_setup);
369
370void *alloc_pgtable_page(int node, gfp_t gfp)
371{
372	struct page *page;
373	void *vaddr = NULL;
374
375	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
376	if (page)
377		vaddr = page_address(page);
378	return vaddr;
379}
380
381void free_pgtable_page(void *vaddr)
382{
383	free_page((unsigned long)vaddr);
384}
385
386static inline int domain_type_is_si(struct dmar_domain *domain)
387{
388	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
389}
390
391static inline int domain_pfn_supported(struct dmar_domain *domain,
392				       unsigned long pfn)
393{
394	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
395
396	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
397}
398
399/*
400 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
401 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
402 * the returned SAGAW.
403 */
404static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
405{
406	unsigned long fl_sagaw, sl_sagaw;
407
408	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
409	sl_sagaw = cap_sagaw(iommu->cap);
410
411	/* Second level only. */
412	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
413		return sl_sagaw;
414
415	/* First level only. */
416	if (!ecap_slts(iommu->ecap))
417		return fl_sagaw;
418
419	return fl_sagaw & sl_sagaw;
420}
421
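/*
 * Pick the largest supported AGAW that does not exceed the requested
 * guest address width, or -1 if none is supported.
 */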
422static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
423{
424	unsigned long sagaw;
425	int agaw;
426
427	sagaw = __iommu_calculate_sagaw(iommu);
428	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
429		if (test_bit(agaw, &sagaw))
430			break;
431	}
432
433	return agaw;
434}
435
436/*
437 * Calculate max SAGAW for each iommu.
438 */
439int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
440{
441	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
442}
443
/*
 * Calculate the agaw for each iommu.
 * "SAGAW" may be different across iommus: use a default agaw, and fall
 * back to a smaller supported agaw for iommus that don't support the
 * default agaw.
 */
449int iommu_calculate_agaw(struct intel_iommu *iommu)
450{
451	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
452}
453
454static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
455{
456	return sm_supported(iommu) ?
457			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
458}
459
460static void domain_update_iommu_coherency(struct dmar_domain *domain)
461{
462	struct iommu_domain_info *info;
463	struct dmar_drhd_unit *drhd;
464	struct intel_iommu *iommu;
465	bool found = false;
466	unsigned long i;
467
468	domain->iommu_coherency = true;
469	xa_for_each(&domain->iommu_array, i, info) {
470		found = true;
471		if (!iommu_paging_structure_coherency(info->iommu)) {
472			domain->iommu_coherency = false;
473			break;
474		}
475	}
476	if (found)
477		return;
478
479	/* No hardware attached; use lowest common denominator */
480	rcu_read_lock();
481	for_each_active_iommu(iommu, drhd) {
482		if (!iommu_paging_structure_coherency(iommu)) {
483			domain->iommu_coherency = false;
484			break;
485		}
486	}
487	rcu_read_unlock();
488}
489
490static int domain_update_iommu_superpage(struct dmar_domain *domain,
491					 struct intel_iommu *skip)
492{
493	struct dmar_drhd_unit *drhd;
494	struct intel_iommu *iommu;
495	int mask = 0x3;
496
497	if (!intel_iommu_superpage)
498		return 0;
499
500	/* set iommu_superpage to the smallest common denominator */
501	rcu_read_lock();
502	for_each_active_iommu(iommu, drhd) {
503		if (iommu != skip) {
504			if (domain && domain->use_first_level) {
505				if (!cap_fl1gp_support(iommu->cap))
506					mask = 0x1;
507			} else {
508				mask &= cap_super_page_val(iommu->cap);
509			}
510
511			if (!mask)
512				break;
513		}
514	}
515	rcu_read_unlock();
516
517	return fls(mask);
518}
519
520static int domain_update_device_node(struct dmar_domain *domain)
521{
522	struct device_domain_info *info;
523	int nid = NUMA_NO_NODE;
524	unsigned long flags;
525
526	spin_lock_irqsave(&domain->lock, flags);
527	list_for_each_entry(info, &domain->devices, link) {
		/*
		 * There could be multiple device NUMA nodes, as devices
		 * within the same domain may sit behind different IOMMUs.
		 * There is no perfect answer in such a situation, so pick
		 * the first node found (first come, first served).
		 */
534		nid = dev_to_node(info->dev);
535		if (nid != NUMA_NO_NODE)
536			break;
537	}
538	spin_unlock_irqrestore(&domain->lock, flags);
539
540	return nid;
541}
542
543static void domain_update_iotlb(struct dmar_domain *domain);
544
545/* Return the super pagesize bitmap if supported. */
546static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
547{
548	unsigned long bitmap = 0;
549
550	/*
551	 * 1-level super page supports page size of 2MiB, 2-level super page
552	 * supports page size of both 2MiB and 1GiB.
553	 */
554	if (domain->iommu_superpage == 1)
555		bitmap |= SZ_2M;
556	else if (domain->iommu_superpage == 2)
557		bitmap |= SZ_2M | SZ_1G;
558
559	return bitmap;
560}
561
562/* Some capabilities may be different across iommus */
563static void domain_update_iommu_cap(struct dmar_domain *domain)
564{
565	domain_update_iommu_coherency(domain);
566	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
567
	/*
	 * If RHSA is missing, default to the device NUMA node as a
	 * fallback.
	 */
572	if (domain->nid == NUMA_NO_NODE)
573		domain->nid = domain_update_device_node(domain);
574
575	/*
576	 * First-level translation restricts the input-address to a
577	 * canonical address (i.e., address bits 63:N have the same
578	 * value as address bit [N-1], where N is 48-bits with 4-level
579	 * paging and 57-bits with 5-level paging). Hence, skip bit
580	 * [N-1].
581	 */
582	if (domain->use_first_level)
583		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
584	else
585		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
586
587	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
588	domain_update_iotlb(domain);
589}
590
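/*
 * Return the context entry for (bus, devfn), optionally allocating the
 * context table on demand. In scalable mode each root entry covers two
 * context tables (lower half for devfn 0-127, upper half for 128-255)
 * and every context entry is 256 bits wide, hence the devfn adjustment.
 */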
591struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
592					 u8 devfn, int alloc)
593{
594	struct root_entry *root = &iommu->root_entry[bus];
595	struct context_entry *context;
596	u64 *entry;
597
	/*
	 * Unless the caller requested allocation of a new entry, returning
	 * a copied context entry makes no sense.
	 */
602	if (!alloc && context_copied(iommu, bus, devfn))
603		return NULL;
604
605	entry = &root->lo;
606	if (sm_supported(iommu)) {
607		if (devfn >= 0x80) {
608			devfn -= 0x80;
609			entry = &root->hi;
610		}
611		devfn *= 2;
612	}
613	if (*entry & 1)
614		context = phys_to_virt(*entry & VTD_PAGE_MASK);
615	else {
616		unsigned long phy_addr;
617		if (!alloc)
618			return NULL;
619
620		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
621		if (!context)
622			return NULL;
623
624		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
625		phy_addr = virt_to_phys((void *)context);
626		*entry = phy_addr | 1;
627		__iommu_flush_cache(iommu, entry, sizeof(*entry));
628	}
629	return &context[devfn];
630}
631
632/**
633 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
634 *				 sub-hierarchy of a candidate PCI-PCI bridge
635 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
636 * @bridge: the candidate PCI-PCI bridge
637 *
638 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
639 */
640static bool
641is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
642{
643	struct pci_dev *pdev, *pbridge;
644
645	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
646		return false;
647
648	pdev = to_pci_dev(dev);
649	pbridge = to_pci_dev(bridge);
650
651	if (pbridge->subordinate &&
652	    pbridge->subordinate->number <= pdev->bus->number &&
653	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
654		return true;
655
656	return false;
657}
658
659static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
660{
661	struct dmar_drhd_unit *drhd;
662	u32 vtbar;
663	int rc;
664
665	/* We know that this device on this chipset has its own IOMMU.
666	 * If we find it under a different IOMMU, then the BIOS is lying
667	 * to us. Hope that the IOMMU for this device is actually
668	 * disabled, and it needs no translation...
669	 */
670	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
671	if (rc) {
672		/* "can't" happen */
673		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
674		return false;
675	}
676	vtbar &= 0xffff0000;
677
	/* we know that this iommu should be at offset 0xa000 from the vtbar */
679	drhd = dmar_find_matched_drhd_unit(pdev);
680	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
681		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
682		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
683		return true;
684	}
685
686	return false;
687}
688
689static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
690{
691	if (!iommu || iommu->drhd->ignored)
692		return true;
693
694	if (dev_is_pci(dev)) {
695		struct pci_dev *pdev = to_pci_dev(dev);
696
697		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
698		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
699		    quirk_ioat_snb_local_iommu(pdev))
700			return true;
701	}
702
703	return false;
704}
705
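/*
 * Find the IOMMU that covers @dev by walking the DMAR device scopes and
 * optionally return the bus/devfn to use for it. VFs are looked up via
 * their PF, and ACPI devices via their ACPI companion.
 */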
706struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
707{
708	struct dmar_drhd_unit *drhd = NULL;
709	struct pci_dev *pdev = NULL;
710	struct intel_iommu *iommu;
711	struct device *tmp;
712	u16 segment = 0;
713	int i;
714
715	if (!dev)
716		return NULL;
717
718	if (dev_is_pci(dev)) {
719		struct pci_dev *pf_pdev;
720
721		pdev = pci_real_dma_dev(to_pci_dev(dev));
722
723		/* VFs aren't listed in scope tables; we need to look up
724		 * the PF instead to find the IOMMU. */
725		pf_pdev = pci_physfn(pdev);
726		dev = &pf_pdev->dev;
727		segment = pci_domain_nr(pdev->bus);
728	} else if (has_acpi_companion(dev))
729		dev = &ACPI_COMPANION(dev)->dev;
730
731	rcu_read_lock();
732	for_each_iommu(iommu, drhd) {
733		if (pdev && segment != drhd->segment)
734			continue;
735
736		for_each_active_dev_scope(drhd->devices,
737					  drhd->devices_cnt, i, tmp) {
738			if (tmp == dev) {
739				/* For a VF use its original BDF# not that of the PF
740				 * which we used for the IOMMU lookup. Strictly speaking
741				 * we could do this for all PCI devices; we only need to
742				 * get the BDF# from the scope table for ACPI matches. */
743				if (pdev && pdev->is_virtfn)
744					goto got_pdev;
745
746				if (bus && devfn) {
747					*bus = drhd->devices[i].bus;
748					*devfn = drhd->devices[i].devfn;
749				}
750				goto out;
751			}
752
753			if (is_downstream_to_pci_bridge(dev, tmp))
754				goto got_pdev;
755		}
756
757		if (pdev && drhd->include_all) {
758got_pdev:
759			if (bus && devfn) {
760				*bus = pdev->bus->number;
761				*devfn = pdev->devfn;
762			}
763			goto out;
764		}
765	}
766	iommu = NULL;
767out:
768	if (iommu_is_dummy(iommu, dev))
769		iommu = NULL;
770
771	rcu_read_unlock();
772
773	return iommu;
774}
775
776static void domain_flush_cache(struct dmar_domain *domain,
777			       void *addr, int size)
778{
779	if (!domain->iommu_coherency)
780		clflush_cache_range(addr, size);
781}
782
783static void free_context_table(struct intel_iommu *iommu)
784{
785	struct context_entry *context;
786	int i;
787
788	if (!iommu->root_entry)
789		return;
790
791	for (i = 0; i < ROOT_ENTRY_NR; i++) {
792		context = iommu_context_addr(iommu, i, 0, 0);
793		if (context)
794			free_pgtable_page(context);
795
796		if (!sm_supported(iommu))
797			continue;
798
799		context = iommu_context_addr(iommu, i, 0x80, 0);
800		if (context)
801			free_pgtable_page(context);
802	}
803
804	free_pgtable_page(iommu->root_entry);
805	iommu->root_entry = NULL;
806}
807
808#ifdef CONFIG_DMAR_DEBUG
809static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
810			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
811{
812	struct dma_pte *pte;
813	int offset;
814
815	while (1) {
816		offset = pfn_level_offset(pfn, level);
817		pte = &parent[offset];
818		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
819			pr_info("PTE not present at level %d\n", level);
820			break;
821		}
822
823		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
824
825		if (level == 1)
826			break;
827
828		parent = phys_to_virt(dma_pte_addr(pte));
829		level--;
830	}
831}
832
833void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
834			  unsigned long long addr, u32 pasid)
835{
836	struct pasid_dir_entry *dir, *pde;
837	struct pasid_entry *entries, *pte;
838	struct context_entry *ctx_entry;
839	struct root_entry *rt_entry;
840	int i, dir_index, index, level;
841	u8 devfn = source_id & 0xff;
842	u8 bus = source_id >> 8;
843	struct dma_pte *pgtable;
844
845	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
846
847	/* root entry dump */
848	rt_entry = &iommu->root_entry[bus];
849	if (!rt_entry) {
850		pr_info("root table entry is not present\n");
851		return;
852	}
853
854	if (sm_supported(iommu))
855		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
856			rt_entry->hi, rt_entry->lo);
857	else
		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
859
860	/* context entry dump */
861	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
862	if (!ctx_entry) {
863		pr_info("context table entry is not present\n");
864		return;
865	}
866
867	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
868		ctx_entry->hi, ctx_entry->lo);
869
870	/* legacy mode does not require PASID entries */
871	if (!sm_supported(iommu)) {
872		level = agaw_to_level(ctx_entry->hi & 7);
873		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874		goto pgtable_walk;
875	}
876
877	/* get the pointer to pasid directory entry */
878	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
879	if (!dir) {
880		pr_info("pasid directory entry is not present\n");
881		return;
882	}
883	/* For request-without-pasid, get the pasid from context entry */
884	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
885		pasid = IOMMU_NO_PASID;
886
887	dir_index = pasid >> PASID_PDE_SHIFT;
888	pde = &dir[dir_index];
889	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
890
891	/* get the pointer to the pasid table entry */
892	entries = get_pasid_table_from_pde(pde);
893	if (!entries) {
894		pr_info("pasid table entry is not present\n");
895		return;
896	}
897	index = pasid & PASID_PTE_MASK;
898	pte = &entries[index];
899	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
900		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
901
902	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
903		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
904		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
905	} else {
906		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
907		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
908	}
909
910pgtable_walk:
911	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
912}
913#endif
914
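/*
 * Walk the page table to the PTE for @pfn. With a *target_level of 0 the
 * walk stops at the first superpage or non-present entry without
 * allocating anything, and *target_level is updated to the level reached;
 * otherwise missing levels are allocated down to *target_level.
 */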
915static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
916				      unsigned long pfn, int *target_level,
917				      gfp_t gfp)
918{
919	struct dma_pte *parent, *pte;
920	int level = agaw_to_level(domain->agaw);
921	int offset;
922
923	if (!domain_pfn_supported(domain, pfn))
924		/* Address beyond IOMMU's addressing capabilities. */
925		return NULL;
926
927	parent = domain->pgd;
928
929	while (1) {
930		void *tmp_page;
931
932		offset = pfn_level_offset(pfn, level);
933		pte = &parent[offset];
934		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
935			break;
936		if (level == *target_level)
937			break;
938
939		if (!dma_pte_present(pte)) {
940			uint64_t pteval;
941
942			tmp_page = alloc_pgtable_page(domain->nid, gfp);
943
944			if (!tmp_page)
945				return NULL;
946
947			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
948			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
949			if (domain->use_first_level)
950				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
951
952			if (cmpxchg64(&pte->val, 0ULL, pteval))
953				/* Someone else set it while we were thinking; use theirs. */
954				free_pgtable_page(tmp_page);
955			else
956				domain_flush_cache(domain, pte, sizeof(*pte));
957		}
958		if (level == 1)
959			break;
960
961		parent = phys_to_virt(dma_pte_addr(pte));
962		level--;
963	}
964
965	if (!*target_level)
966		*target_level = level;
967
968	return pte;
969}
970
971/* return address's pte at specific level */
972static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
973					 unsigned long pfn,
974					 int level, int *large_page)
975{
976	struct dma_pte *parent, *pte;
977	int total = agaw_to_level(domain->agaw);
978	int offset;
979
980	parent = domain->pgd;
981	while (level <= total) {
982		offset = pfn_level_offset(pfn, total);
983		pte = &parent[offset];
984		if (level == total)
985			return pte;
986
987		if (!dma_pte_present(pte)) {
988			*large_page = total;
989			break;
990		}
991
992		if (dma_pte_superpage(pte)) {
993			*large_page = total;
994			return pte;
995		}
996
997		parent = phys_to_virt(dma_pte_addr(pte));
998		total--;
999	}
1000	return NULL;
1001}
1002
1003/* clear last level pte, a tlb flush should be followed */
1004static void dma_pte_clear_range(struct dmar_domain *domain,
1005				unsigned long start_pfn,
1006				unsigned long last_pfn)
1007{
1008	unsigned int large_page;
1009	struct dma_pte *first_pte, *pte;
1010
1011	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1012	    WARN_ON(start_pfn > last_pfn))
1013		return;
1014
1015	/* we don't need lock here; nobody else touches the iova range */
1016	do {
1017		large_page = 1;
1018		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1019		if (!pte) {
1020			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1021			continue;
1022		}
1023		do {
1024			dma_clear_pte(pte);
1025			start_pfn += lvl_to_nr_pages(large_page);
1026			pte++;
1027		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1028
1029		domain_flush_cache(domain, first_pte,
1030				   (void *)pte - (void *)first_pte);
1031
1032	} while (start_pfn && start_pfn <= last_pfn);
1033}
1034
1035static void dma_pte_free_level(struct dmar_domain *domain, int level,
1036			       int retain_level, struct dma_pte *pte,
1037			       unsigned long pfn, unsigned long start_pfn,
1038			       unsigned long last_pfn)
1039{
1040	pfn = max(start_pfn, pfn);
1041	pte = &pte[pfn_level_offset(pfn, level)];
1042
1043	do {
1044		unsigned long level_pfn;
1045		struct dma_pte *level_pte;
1046
1047		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1048			goto next;
1049
1050		level_pfn = pfn & level_mask(level);
1051		level_pte = phys_to_virt(dma_pte_addr(pte));
1052
1053		if (level > 2) {
1054			dma_pte_free_level(domain, level - 1, retain_level,
1055					   level_pte, level_pfn, start_pfn,
1056					   last_pfn);
1057		}
1058
1059		/*
1060		 * Free the page table if we're below the level we want to
1061		 * retain and the range covers the entire table.
1062		 */
1063		if (level < retain_level && !(start_pfn > level_pfn ||
1064		      last_pfn < level_pfn + level_size(level) - 1)) {
1065			dma_clear_pte(pte);
1066			domain_flush_cache(domain, pte, sizeof(*pte));
1067			free_pgtable_page(level_pte);
1068		}
1069next:
1070		pfn += level_size(level);
1071	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1072}
1073
1074/*
1075 * clear last level (leaf) ptes and free page table pages below the
1076 * level we wish to keep intact.
1077 */
1078static void dma_pte_free_pagetable(struct dmar_domain *domain,
1079				   unsigned long start_pfn,
1080				   unsigned long last_pfn,
1081				   int retain_level)
1082{
1083	dma_pte_clear_range(domain, start_pfn, last_pfn);
1084
1085	/* We don't need lock here; nobody else touches the iova range */
1086	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1087			   domain->pgd, 0, start_pfn, last_pfn);
1088
1089	/* free pgd */
1090	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1091		free_pgtable_page(domain->pgd);
1092		domain->pgd = NULL;
1093	}
1094}
1095
1096/* When a page at a given level is being unlinked from its parent, we don't
1097   need to *modify* it at all. All we need to do is make a list of all the
1098   pages which can be freed just as soon as we've flushed the IOTLB and we
1099   know the hardware page-walk will no longer touch them.
1100   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1101   be freed. */
1102static void dma_pte_list_pagetables(struct dmar_domain *domain,
1103				    int level, struct dma_pte *pte,
1104				    struct list_head *freelist)
1105{
1106	struct page *pg;
1107
1108	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1109	list_add_tail(&pg->lru, freelist);
1110
1111	if (level == 1)
1112		return;
1113
1114	pte = page_address(pg);
1115	do {
1116		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1118		pte++;
1119	} while (!first_pte_in_page(pte));
1120}
1121
1122static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1123				struct dma_pte *pte, unsigned long pfn,
1124				unsigned long start_pfn, unsigned long last_pfn,
1125				struct list_head *freelist)
1126{
1127	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1128
1129	pfn = max(start_pfn, pfn);
1130	pte = &pte[pfn_level_offset(pfn, level)];
1131
1132	do {
1133		unsigned long level_pfn = pfn & level_mask(level);
1134
1135		if (!dma_pte_present(pte))
1136			goto next;
1137
1138		/* If range covers entire pagetable, free it */
1139		if (start_pfn <= level_pfn &&
1140		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
1143			if (level > 1 && !dma_pte_superpage(pte))
1144				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1145
1146			dma_clear_pte(pte);
1147			if (!first_pte)
1148				first_pte = pte;
1149			last_pte = pte;
1150		} else if (level > 1) {
1151			/* Recurse down into a level that isn't *entirely* obsolete */
1152			dma_pte_clear_level(domain, level - 1,
1153					    phys_to_virt(dma_pte_addr(pte)),
1154					    level_pfn, start_pfn, last_pfn,
1155					    freelist);
1156		}
1157next:
1158		pfn = level_pfn + level_size(level);
1159	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1160
1161	if (first_pte)
1162		domain_flush_cache(domain, first_pte,
1163				   (void *)++last_pte - (void *)first_pte);
1164}
1165
1166/* We can't just free the pages because the IOMMU may still be walking
1167   the page tables, and may have cached the intermediate levels. The
1168   pages can only be freed after the IOTLB flush has been done. */
1169static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1170			 unsigned long last_pfn, struct list_head *freelist)
1171{
1172	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1173	    WARN_ON(start_pfn > last_pfn))
1174		return;
1175
1176	/* we don't need lock here; nobody else touches the iova range */
1177	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1178			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1179
1180	/* free pgd */
1181	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1182		struct page *pgd_page = virt_to_page(domain->pgd);
1183		list_add_tail(&pgd_page->lru, freelist);
1184		domain->pgd = NULL;
1185	}
1186}
1187
1188/* iommu handling */
1189static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1190{
1191	struct root_entry *root;
1192
1193	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1194	if (!root) {
1195		pr_err("Allocating root entry for %s failed\n",
1196			iommu->name);
1197		return -ENOMEM;
1198	}
1199
1200	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1201	iommu->root_entry = root;
1202
1203	return 0;
1204}
1205
1206static void iommu_set_root_entry(struct intel_iommu *iommu)
1207{
1208	u64 addr;
1209	u32 sts;
1210	unsigned long flag;
1211
1212	addr = virt_to_phys(iommu->root_entry);
1213	if (sm_supported(iommu))
1214		addr |= DMA_RTADDR_SMT;
1215
1216	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1218
1219	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1220
	/* Make sure hardware completes it */
1222	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223		      readl, (sts & DMA_GSTS_RTPS), sts);
1224
1225	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1226
1227	/*
1228	 * Hardware invalidates all DMA remapping hardware translation
1229	 * caches as part of SRTP flow.
1230	 */
1231	if (cap_esrtps(iommu->cap))
1232		return;
1233
1234	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1235	if (sm_supported(iommu))
1236		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1237	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1238}
1239
1240void iommu_flush_write_buffer(struct intel_iommu *iommu)
1241{
1242	u32 val;
1243	unsigned long flag;
1244
1245	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1246		return;
1247
1248	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1249	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1250
	/* Make sure hardware completes it */
1252	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1253		      readl, (!(val & DMA_GSTS_WBFS)), val);
1254
1255	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1256}
1257
/* The return value determines whether we need a write buffer flush */
1259static void __iommu_flush_context(struct intel_iommu *iommu,
1260				  u16 did, u16 source_id, u8 function_mask,
1261				  u64 type)
1262{
1263	u64 val = 0;
1264	unsigned long flag;
1265
1266	switch (type) {
1267	case DMA_CCMD_GLOBAL_INVL:
1268		val = DMA_CCMD_GLOBAL_INVL;
1269		break;
1270	case DMA_CCMD_DOMAIN_INVL:
1271		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1272		break;
1273	case DMA_CCMD_DEVICE_INVL:
1274		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1275			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1276		break;
1277	default:
1278		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1279			iommu->name, type);
1280		return;
1281	}
1282	val |= DMA_CCMD_ICC;
1283
1284	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1286
	/* Make sure hardware completes it */
1288	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1289		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1290
1291	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292}
1293
/* The return value determines whether we need a write buffer flush */
1295static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1296				u64 addr, unsigned int size_order, u64 type)
1297{
1298	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1299	u64 val = 0, val_iva = 0;
1300	unsigned long flag;
1301
1302	switch (type) {
1303	case DMA_TLB_GLOBAL_FLUSH:
		/* A global flush doesn't need to set IVA_REG */
1305		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1306		break;
1307	case DMA_TLB_DSI_FLUSH:
1308		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309		break;
1310	case DMA_TLB_PSI_FLUSH:
1311		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1312		/* IH bit is passed in as part of address */
1313		val_iva = size_order | addr;
1314		break;
1315	default:
1316		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1317			iommu->name, type);
1318		return;
1319	}
1320
1321	if (cap_write_drain(iommu->cap))
1322		val |= DMA_TLB_WRITE_DRAIN;
1323
1324	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325	/* Note: Only uses first TLB reg currently */
1326	if (val_iva)
1327		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329
	/* Make sure hardware completes it */
1331	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333
1334	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335
1336	/* check IOTLB invalidation granularity */
1337	if (DMA_TLB_IAIG(val) == 0)
1338		pr_err("Flush IOTLB failed\n");
1339	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340		pr_debug("TLB flush request %Lx, actual %Lx\n",
1341			(unsigned long long)DMA_TLB_IIRG(type),
1342			(unsigned long long)DMA_TLB_IAIG(val));
1343}
1344
1345static struct device_domain_info *
1346domain_lookup_dev_info(struct dmar_domain *domain,
1347		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1348{
1349	struct device_domain_info *info;
1350	unsigned long flags;
1351
1352	spin_lock_irqsave(&domain->lock, flags);
1353	list_for_each_entry(info, &domain->devices, link) {
1354		if (info->iommu == iommu && info->bus == bus &&
1355		    info->devfn == devfn) {
1356			spin_unlock_irqrestore(&domain->lock, flags);
1357			return info;
1358		}
1359	}
1360	spin_unlock_irqrestore(&domain->lock, flags);
1361
1362	return NULL;
1363}
1364
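/*
 * Recompute domain->has_iotlb_device: it is set when at least one device
 * (or device/PASID pair) attached to the domain has ATS enabled, so that
 * device-TLB invalidations are only issued when necessary.
 */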
1365static void domain_update_iotlb(struct dmar_domain *domain)
1366{
1367	struct dev_pasid_info *dev_pasid;
1368	struct device_domain_info *info;
1369	bool has_iotlb_device = false;
1370	unsigned long flags;
1371
1372	spin_lock_irqsave(&domain->lock, flags);
1373	list_for_each_entry(info, &domain->devices, link) {
1374		if (info->ats_enabled) {
1375			has_iotlb_device = true;
1376			break;
1377		}
1378	}
1379
1380	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1381		info = dev_iommu_priv_get(dev_pasid->dev);
1382		if (info->ats_enabled) {
1383			has_iotlb_device = true;
1384			break;
1385		}
1386	}
1387	domain->has_iotlb_device = has_iotlb_device;
1388	spin_unlock_irqrestore(&domain->lock, flags);
1389}
1390
1391/*
1392 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1393 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1394 * check because it applies only to the built-in QAT devices and it doesn't
1395 * grant additional privileges.
1396 */
1397#define BUGGY_QAT_DEVID_MASK 0x4940
1398static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1399{
1400	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1401		return false;
1402
1403	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1404		return false;
1405
1406	return true;
1407}
1408
1409static void iommu_enable_pci_caps(struct device_domain_info *info)
1410{
1411	struct pci_dev *pdev;
1412
1413	if (!dev_is_pci(info->dev))
1414		return;
1415
1416	pdev = to_pci_dev(info->dev);
1417
1418	/* The PCIe spec, in its wisdom, declares that the behaviour of
1419	   the device if you enable PASID support after ATS support is
1420	   undefined. So always enable PASID support on devices which
1421	   have it, even if we can't yet know if we're ever going to
1422	   use it. */
1423	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1424		info->pasid_enabled = 1;
1425
1426	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1427	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1428		info->ats_enabled = 1;
1429		domain_update_iotlb(info->domain);
1430	}
1431}
1432
1433static void iommu_disable_pci_caps(struct device_domain_info *info)
1434{
1435	struct pci_dev *pdev;
1436
1437	if (!dev_is_pci(info->dev))
1438		return;
1439
1440	pdev = to_pci_dev(info->dev);
1441
1442	if (info->ats_enabled) {
1443		pci_disable_ats(pdev);
1444		info->ats_enabled = 0;
1445		domain_update_iotlb(info->domain);
1446	}
1447
1448	if (info->pasid_enabled) {
1449		pci_disable_pasid(pdev);
1450		info->pasid_enabled = 0;
1451	}
1452}
1453
1454static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1455				    u64 addr, unsigned int mask)
1456{
1457	u16 sid, qdep;
1458
1459	if (!info || !info->ats_enabled)
1460		return;
1461
1462	sid = info->bus << 8 | info->devfn;
1463	qdep = info->ats_qdep;
1464	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1465			   qdep, addr, mask);
1466	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1467}
1468
1469static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470				  u64 addr, unsigned mask)
1471{
1472	struct dev_pasid_info *dev_pasid;
1473	struct device_domain_info *info;
1474	unsigned long flags;
1475
1476	if (!domain->has_iotlb_device)
1477		return;
1478
1479	spin_lock_irqsave(&domain->lock, flags);
1480	list_for_each_entry(info, &domain->devices, link)
1481		__iommu_flush_dev_iotlb(info, addr, mask);
1482
1483	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1484		info = dev_iommu_priv_get(dev_pasid->dev);
1485
1486		if (!info->ats_enabled)
1487			continue;
1488
1489		qi_flush_dev_iotlb_pasid(info->iommu,
1490					 PCI_DEVID(info->bus, info->devfn),
1491					 info->pfsid, dev_pasid->pasid,
1492					 info->ats_qdep, addr,
1493					 mask);
1494	}
1495	spin_unlock_irqrestore(&domain->lock, flags);
1496}
1497
1498static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1499				     struct dmar_domain *domain, u64 addr,
1500				     unsigned long npages, bool ih)
1501{
1502	u16 did = domain_id_iommu(domain, iommu);
1503	struct dev_pasid_info *dev_pasid;
1504	unsigned long flags;
1505
1506	spin_lock_irqsave(&domain->lock, flags);
1507	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1508		qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1509
1510	if (!list_empty(&domain->devices))
1511		qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1512	spin_unlock_irqrestore(&domain->lock, flags);
1513}
1514
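/*
 * Flush the IOTLB of @domain on @iommu for @pages pages starting at @pfn.
 * @ih is the invalidation hint passed to hardware; @map indicates a
 * non-present to present change, for which the device-TLB does not need
 * to be flushed when caching mode is enabled.
 */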
1515static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1516				  struct dmar_domain *domain,
1517				  unsigned long pfn, unsigned int pages,
1518				  int ih, int map)
1519{
1520	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1521	unsigned int mask = ilog2(aligned_pages);
1522	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1523	u16 did = domain_id_iommu(domain, iommu);
1524
1525	if (WARN_ON(!pages))
1526		return;
1527
1528	if (ih)
1529		ih = 1 << 6;
1530
1531	if (domain->use_first_level) {
1532		domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1533	} else {
1534		unsigned long bitmask = aligned_pages - 1;
1535
1536		/*
1537		 * PSI masks the low order bits of the base address. If the
1538		 * address isn't aligned to the mask, then compute a mask value
1539		 * needed to ensure the target range is flushed.
1540		 */
1541		if (unlikely(bitmask & pfn)) {
1542			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1543
1544			/*
1545			 * Since end_pfn <= pfn + bitmask, the only way bits
1546			 * higher than bitmask can differ in pfn and end_pfn is
1547			 * by carrying. This means after masking out bitmask,
1548			 * high bits starting with the first set bit in
1549			 * shared_bits are all equal in both pfn and end_pfn.
1550			 */
1551			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1552			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1553		}
1554
		/*
		 * Fall back to a domain-selective flush if PSI is not
		 * supported or the size is too big.
		 */
1559		if (!cap_pgsel_inv(iommu->cap) ||
1560		    mask > cap_max_amask_val(iommu->cap))
1561			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1562							DMA_TLB_DSI_FLUSH);
1563		else
1564			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1565							DMA_TLB_PSI_FLUSH);
1566	}
1567
	/*
	 * In caching mode, changes of pages from non-present to present
	 * require a flush. However, the device IOTLB doesn't need to be
	 * flushed in this case.
	 */
1572	if (!cap_caching_mode(iommu->cap) || !map)
1573		iommu_flush_dev_iotlb(domain, addr, mask);
1574}
1575
1576/* Notification for newly created mappings */
1577static inline void __mapping_notify_one(struct intel_iommu *iommu,
1578					struct dmar_domain *domain,
1579					unsigned long pfn, unsigned int pages)
1580{
	/*
	 * It's a non-present to present mapping. Only flush if caching mode
	 * is enabled and the domain uses second-level translation.
	 */
1585	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1586		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1587	else
1588		iommu_flush_write_buffer(iommu);
1589}
1590
1591static void intel_flush_iotlb_all(struct iommu_domain *domain)
1592{
1593	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1594	struct iommu_domain_info *info;
1595	unsigned long idx;
1596
1597	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1598		struct intel_iommu *iommu = info->iommu;
1599		u16 did = domain_id_iommu(dmar_domain, iommu);
1600
1601		if (dmar_domain->use_first_level)
1602			domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1603		else
1604			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1605						 DMA_TLB_DSI_FLUSH);
1606
1607		if (!cap_caching_mode(iommu->cap))
1608			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1609	}
1610}
1611
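/*
 * Disable the protected memory regions (PLMR/PHMR), which may have been
 * left enabled by firmware, so that DMA to those ranges is not blocked.
 */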
1612static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1613{
1614	u32 pmen;
1615	unsigned long flags;
1616
1617	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1618		return;
1619
1620	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1621	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1622	pmen &= ~DMA_PMEN_EPM;
1623	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1624
1625	/* wait for the protected region status bit to clear */
1626	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1627		readl, !(pmen & DMA_PMEN_PRS), pmen);
1628
1629	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630}
1631
1632static void iommu_enable_translation(struct intel_iommu *iommu)
1633{
1634	u32 sts;
1635	unsigned long flags;
1636
1637	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1638	iommu->gcmd |= DMA_GCMD_TE;
1639	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1640
	/* Make sure hardware completes it */
1642	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1643		      readl, (sts & DMA_GSTS_TES), sts);
1644
1645	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1646}
1647
1648static void iommu_disable_translation(struct intel_iommu *iommu)
1649{
1650	u32 sts;
1651	unsigned long flag;
1652
1653	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1654	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1655		return;
1656
1657	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1658	iommu->gcmd &= ~DMA_GCMD_TE;
1659	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1660
	/* Make sure hardware completes it */
1662	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1663		      readl, (!(sts & DMA_GSTS_TES)), sts);
1664
1665	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1666}
1667
1668static int iommu_init_domains(struct intel_iommu *iommu)
1669{
1670	u32 ndomains;
1671
1672	ndomains = cap_ndoms(iommu->cap);
1673	pr_debug("%s: Number of Domains supported <%d>\n",
1674		 iommu->name, ndomains);
1675
1676	spin_lock_init(&iommu->lock);
1677
1678	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1679	if (!iommu->domain_ids)
1680		return -ENOMEM;
1681
1682	/*
1683	 * If Caching mode is set, then invalid translations are tagged
1684	 * with domain-id 0, hence we need to pre-allocate it. We also
1685	 * use domain-id 0 as a marker for non-allocated domain-id, so
1686	 * make sure it is not used for a real domain.
1687	 */
1688	set_bit(0, iommu->domain_ids);
1689
	/*
	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes be
	 * programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
1697	if (sm_supported(iommu))
1698		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1699
1700	return 0;
1701}
1702
1703static void disable_dmar_iommu(struct intel_iommu *iommu)
1704{
1705	if (!iommu->domain_ids)
1706		return;
1707
1708	/*
1709	 * All iommu domains must have been detached from the devices,
1710	 * hence there should be no domain IDs in use.
1711	 */
1712	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1713		    > NUM_RESERVED_DID))
1714		return;
1715
1716	if (iommu->gcmd & DMA_GCMD_TE)
1717		iommu_disable_translation(iommu);
1718}
1719
1720static void free_dmar_iommu(struct intel_iommu *iommu)
1721{
1722	if (iommu->domain_ids) {
1723		bitmap_free(iommu->domain_ids);
1724		iommu->domain_ids = NULL;
1725	}
1726
1727	if (iommu->copied_tables) {
1728		bitmap_free(iommu->copied_tables);
1729		iommu->copied_tables = NULL;
1730	}
1731
1732	/* free context mapping */
1733	free_context_table(iommu);
1734
1735#ifdef CONFIG_INTEL_IOMMU_SVM
1736	if (pasid_supported(iommu)) {
1737		if (ecap_prs(iommu->ecap))
1738			intel_svm_finish_prq(iommu);
1739	}
1740#endif
1741}
1742
1743/*
1744 * Check and return whether first level is used by default for
1745 * DMA translation.
1746 */
1747static bool first_level_by_default(unsigned int type)
1748{
1749	/* Only SL is available in legacy mode */
1750	if (!scalable_mode_support())
1751		return false;
1752
	/* Only one level (either FL or SL) is available, just use it */
1754	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1755		return intel_cap_flts_sanity();
1756
1757	/* Both levels are available, decide it based on domain type */
1758	return type != IOMMU_DOMAIN_UNMANAGED;
1759}
1760
1761static struct dmar_domain *alloc_domain(unsigned int type)
1762{
1763	struct dmar_domain *domain;
1764
1765	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1766	if (!domain)
1767		return NULL;
1768
1769	domain->nid = NUMA_NO_NODE;
1770	if (first_level_by_default(type))
1771		domain->use_first_level = true;
1772	domain->has_iotlb_device = false;
1773	INIT_LIST_HEAD(&domain->devices);
1774	INIT_LIST_HEAD(&domain->dev_pasids);
1775	spin_lock_init(&domain->lock);
1776	xa_init(&domain->iommu_array);
1777
1778	return domain;
1779}
1780
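/*
 * Attach the domain to @iommu: allocate a domain id on this IOMMU the
 * first time and refcount subsequent attachments via domain->iommu_array.
 */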
1781static int domain_attach_iommu(struct dmar_domain *domain,
1782			       struct intel_iommu *iommu)
1783{
1784	struct iommu_domain_info *info, *curr;
1785	unsigned long ndomains;
1786	int num, ret = -ENOSPC;
1787
1788	info = kzalloc(sizeof(*info), GFP_KERNEL);
1789	if (!info)
1790		return -ENOMEM;
1791
1792	spin_lock(&iommu->lock);
1793	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1794	if (curr) {
1795		curr->refcnt++;
1796		spin_unlock(&iommu->lock);
1797		kfree(info);
1798		return 0;
1799	}
1800
1801	ndomains = cap_ndoms(iommu->cap);
1802	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1803	if (num >= ndomains) {
1804		pr_err("%s: No free domain ids\n", iommu->name);
1805		goto err_unlock;
1806	}
1807
1808	set_bit(num, iommu->domain_ids);
1809	info->refcnt	= 1;
1810	info->did	= num;
1811	info->iommu	= iommu;
1812	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1813			  NULL, info, GFP_ATOMIC);
1814	if (curr) {
1815		ret = xa_err(curr) ? : -EBUSY;
1816		goto err_clear;
1817	}
1818	domain_update_iommu_cap(domain);
1819
1820	spin_unlock(&iommu->lock);
1821	return 0;
1822
1823err_clear:
1824	clear_bit(info->did, iommu->domain_ids);
1825err_unlock:
1826	spin_unlock(&iommu->lock);
1827	kfree(info);
1828	return ret;
1829}
1830
1831static void domain_detach_iommu(struct dmar_domain *domain,
1832				struct intel_iommu *iommu)
1833{
1834	struct iommu_domain_info *info;
1835
1836	spin_lock(&iommu->lock);
1837	info = xa_load(&domain->iommu_array, iommu->seq_id);
1838	if (--info->refcnt == 0) {
1839		clear_bit(info->did, iommu->domain_ids);
1840		xa_erase(&domain->iommu_array, iommu->seq_id);
1841		domain->nid = NUMA_NO_NODE;
1842		domain_update_iommu_cap(domain);
1843		kfree(info);
1844	}
1845	spin_unlock(&iommu->lock);
1846}
1847
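/*
 * Round a guest address width up to the nearest width representable by
 * the 9-bit-per-level page tables (12 + 9 * n), capped at 64 bits.
 * E.g. gaw 40 -> 48, gaw 48 -> 48.
 */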
1848static inline int guestwidth_to_adjustwidth(int gaw)
1849{
1850	int agaw;
1851	int r = (gaw - 12) % 9;
1852
1853	if (r == 0)
1854		agaw = gaw;
1855	else
1856		agaw = gaw + 9 - r;
1857	if (agaw > 64)
1858		agaw = 64;
1859	return agaw;
1860}
1861
1862static void domain_exit(struct dmar_domain *domain)
1863{
1864	if (domain->pgd) {
1865		LIST_HEAD(freelist);
1866
1867		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1868		put_pages_list(&freelist);
1869	}
1870
1871	if (WARN_ON(!list_empty(&domain->devices)))
1872		return;
1873
1874	kfree(domain);
1875}
1876
1877/*
1878 * Get the PASID directory size for scalable mode context entry.
1879 * Value of X in the PDTS field of a scalable mode context entry
1880 * indicates PASID directory with 2^(X + 7) entries.
1881 */
1882static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1883{
1884	unsigned long pds, max_pde;
1885
1886	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1887	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1888	if (pds < 7)
1889		return 0;
1890
1891	return pds - 7;
1892}
1893
1894/*
1895 * Set the RID_PASID field of a scalable mode context entry. The
1896 * IOMMU hardware will use the PASID value set in this field for
1897 * DMA translations of DMA requests without PASID.
1898 */
1899static inline void
1900context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1901{
1902	context->hi |= pasid & ((1 << 20) - 1);
1903}
1904
1905/*
1906 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1907 * entry.
1908 */
1909static inline void context_set_sm_dte(struct context_entry *context)
1910{
1911	context->lo |= BIT_ULL(2);
1912}
1913
1914/*
1915 * Set the PRE(Page Request Enable) field of a scalable mode context
1916 * entry.
1917 */
1918static inline void context_set_sm_pre(struct context_entry *context)
1919{
1920	context->lo |= BIT_ULL(4);
1921}
1922
1923/* Convert value to context PASID directory size field coding. */
1924#define context_pdts(pds)	(((pds) & 0x7) << 9)
1925
1926static int domain_context_mapping_one(struct dmar_domain *domain,
1927				      struct intel_iommu *iommu,
1928				      struct pasid_table *table,
1929				      u8 bus, u8 devfn)
1930{
1931	struct device_domain_info *info =
1932			domain_lookup_dev_info(domain, iommu, bus, devfn);
1933	u16 did = domain_id_iommu(domain, iommu);
1934	int translation = CONTEXT_TT_MULTI_LEVEL;
1935	struct context_entry *context;
1936	int ret;
1937
1938	if (hw_pass_through && domain_type_is_si(domain))
1939		translation = CONTEXT_TT_PASS_THROUGH;
1940
1941	pr_debug("Set context mapping for %02x:%02x.%d\n",
1942		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1943
1944	spin_lock(&iommu->lock);
1945	ret = -ENOMEM;
1946	context = iommu_context_addr(iommu, bus, devfn, 1);
1947	if (!context)
1948		goto out_unlock;
1949
1950	ret = 0;
1951	if (context_present(context) && !context_copied(iommu, bus, devfn))
1952		goto out_unlock;
1953
1954	/*
1955	 * In kdump cases, old valid entries may be cached due to in-flight
1956	 * DMA and the copied page table, but there is no unmapping
1957	 * behaviour for them; thus we need an explicit cache flush for
1958	 * the newly-mapped device. At this point the device is expected
1959	 * to have completed reset at its driver probe stage, so no
1960	 * in-flight DMA will exist, and we don't need to worry about it
1961	 * hereafter.
1962	 */
1963	if (context_copied(iommu, bus, devfn)) {
1964		u16 did_old = context_domain_id(context);
1965
1966		if (did_old < cap_ndoms(iommu->cap)) {
1967			iommu->flush.flush_context(iommu, did_old,
1968						   (((u16)bus) << 8) | devfn,
1969						   DMA_CCMD_MASK_NOBIT,
1970						   DMA_CCMD_DEVICE_INVL);
1971			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1972						 DMA_TLB_DSI_FLUSH);
1973		}
1974
1975		clear_context_copied(iommu, bus, devfn);
1976	}
1977
1978	context_clear_entry(context);
1979
1980	if (sm_supported(iommu)) {
1981		unsigned long pds;
1982
1983		/* Setup the PASID DIR pointer: */
1984		pds = context_get_sm_pds(table);
1985		context->lo = (u64)virt_to_phys(table->table) |
1986				context_pdts(pds);
1987
1988		/* Setup the RID_PASID field: */
1989		context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1990
1991		/*
1992		 * Setup the Device-TLB enable bit and Page request
1993		 * Enable bit:
1994		 */
1995		if (info && info->ats_supported)
1996			context_set_sm_dte(context);
1997		if (info && info->pri_supported)
1998			context_set_sm_pre(context);
1999		if (info && info->pasid_supported)
2000			context_set_pasid(context);
2001	} else {
2002		struct dma_pte *pgd = domain->pgd;
2003		int agaw;
2004
2005		context_set_domain_id(context, did);
2006
2007		if (translation != CONTEXT_TT_PASS_THROUGH) {
2008			/*
2009			 * Skip the top levels of the page tables for an IOMMU with
2010			 * a smaller AGAW than the default. Unnecessary for PT mode.
2011			 */
2012			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2013				ret = -ENOMEM;
2014				pgd = phys_to_virt(dma_pte_addr(pgd));
2015				if (!dma_pte_present(pgd))
2016					goto out_unlock;
2017			}
2018
2019			if (info && info->ats_supported)
2020				translation = CONTEXT_TT_DEV_IOTLB;
2021			else
2022				translation = CONTEXT_TT_MULTI_LEVEL;
2023
2024			context_set_address_root(context, virt_to_phys(pgd));
2025			context_set_address_width(context, agaw);
2026		} else {
2027			/*
2028			 * In pass through mode, AW must be programmed to
2029			 * indicate the largest AGAW value supported by
2030			 * hardware. And ASR is ignored by hardware.
2031			 */
2032			context_set_address_width(context, iommu->msagaw);
2033		}
2034
2035		context_set_translation_type(context, translation);
2036	}
2037
2038	context_set_fault_enable(context);
2039	context_set_present(context);
2040	if (!ecap_coherent(iommu->ecap))
2041		clflush_cache_range(context, sizeof(*context));
2042
2043	/*
2044	 * It's a non-present to present mapping. If the hardware doesn't cache
2045	 * non-present entries, we only need to flush the write-buffer. If it
2046	 * _does_ cache non-present entries, then it does so in the special
2047	 * domain #0, which we have to flush:
2048	 */
2049	if (cap_caching_mode(iommu->cap)) {
2050		iommu->flush.flush_context(iommu, 0,
2051					   (((u16)bus) << 8) | devfn,
2052					   DMA_CCMD_MASK_NOBIT,
2053					   DMA_CCMD_DEVICE_INVL);
2054		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2055	} else {
2056		iommu_flush_write_buffer(iommu);
2057	}
2058
2059	ret = 0;
2060
2061out_unlock:
2062	spin_unlock(&iommu->lock);
2063
2064	return ret;
2065}
2066
2067struct domain_context_mapping_data {
2068	struct dmar_domain *domain;
2069	struct intel_iommu *iommu;
2070	struct pasid_table *table;
2071};
2072
2073static int domain_context_mapping_cb(struct pci_dev *pdev,
2074				     u16 alias, void *opaque)
2075{
2076	struct domain_context_mapping_data *data = opaque;
2077
2078	return domain_context_mapping_one(data->domain, data->iommu,
2079					  data->table, PCI_BUS_NUM(alias),
2080					  alias & 0xff);
2081}
2082
2083static int
2084domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2085{
2086	struct domain_context_mapping_data data;
2087	struct pasid_table *table;
2088	struct intel_iommu *iommu;
2089	u8 bus, devfn;
2090
2091	iommu = device_to_iommu(dev, &bus, &devfn);
2092	if (!iommu)
2093		return -ENODEV;
2094
2095	table = intel_pasid_get_table(dev);
2096
2097	if (!dev_is_pci(dev))
2098		return domain_context_mapping_one(domain, iommu, table,
2099						  bus, devfn);
2100
2101	data.domain = domain;
2102	data.iommu = iommu;
2103	data.table = table;
2104
2105	return pci_for_each_dma_alias(to_pci_dev(dev),
2106				      &domain_context_mapping_cb, &data);
2107}
2108
2109/* Return the number of VT-d pages covering the range, rounded up to the MM page size */
2110static inline unsigned long aligned_nrpages(unsigned long host_addr,
2111					    size_t size)
2112{
2113	host_addr &= ~PAGE_MASK;
2114	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2115}
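
/*
 * For example (illustrative; assumes 4KiB MM and VT-d pages): host_addr =
 * 0x1234 and size = 0x2000 leave an in-page offset of 0x234, so
 * PAGE_ALIGN(0x2234) = 0x3000 and the mapping needs three VT-d pages.
 */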
2116
2117/* Return largest possible superpage level for a given mapping */
2118static inline int hardware_largepage_caps(struct dmar_domain *domain,
2119					  unsigned long iov_pfn,
2120					  unsigned long phy_pfn,
2121					  unsigned long pages)
2122{
2123	int support, level = 1;
2124	unsigned long pfnmerge;
2125
2126	support = domain->iommu_superpage;
2127
2128	/* To use a large page, the virtual *and* physical addresses
2129	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2130	   of them will mean we have to use smaller pages. So just
2131	   merge them and check both at once. */
2132	pfnmerge = iov_pfn | phy_pfn;
2133
2134	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2135		pages >>= VTD_STRIDE_SHIFT;
2136		if (!pages)
2137			break;
2138		pfnmerge >>= VTD_STRIDE_SHIFT;
2139		level++;
2140		support--;
2141	}
2142	return level;
2143}
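
/*
 * For example (illustrative): with iov_pfn = 0x200, phy_pfn = 0x400,
 * pages = 0x400 and one level of superpage support, pfnmerge = 0x600 has
 * its low 9 bits clear, so the loop runs once and level 2 is returned,
 * i.e. a 2MiB superpage can be used.
 */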
2144
2145/*
2146 * Ensure that old small page tables are removed to make room for superpage(s).
2147 * We're going to add new large pages, so make sure we don't remove their parent
2148 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2149 */
2150static void switch_to_super_page(struct dmar_domain *domain,
2151				 unsigned long start_pfn,
2152				 unsigned long end_pfn, int level)
2153{
2154	unsigned long lvl_pages = lvl_to_nr_pages(level);
2155	struct iommu_domain_info *info;
2156	struct dma_pte *pte = NULL;
2157	unsigned long i;
2158
2159	while (start_pfn <= end_pfn) {
2160		if (!pte)
2161			pte = pfn_to_dma_pte(domain, start_pfn, &level,
2162					     GFP_ATOMIC);
2163
2164		if (dma_pte_present(pte)) {
2165			dma_pte_free_pagetable(domain, start_pfn,
2166					       start_pfn + lvl_pages - 1,
2167					       level + 1);
2168
2169			xa_for_each(&domain->iommu_array, i, info)
2170				iommu_flush_iotlb_psi(info->iommu, domain,
2171						      start_pfn, lvl_pages,
2172						      0, 0);
2173		}
2174
2175		pte++;
2176		start_pfn += lvl_pages;
2177		if (first_pte_in_page(pte))
2178			pte = NULL;
2179	}
2180}
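
/*
 * For example (illustrative): promoting IOVA pfns 0x200-0x7ff to 2MiB
 * superpages (level 2, lvl_pages = 512) walks three 512-page chunks; any
 * pre-existing level-1 table under a chunk is freed and the IOTLB is
 * flushed for that range before the caller installs the large PTE.
 */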
2181
2182static int
2183__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2184		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2185		 gfp_t gfp)
2186{
2187	struct dma_pte *first_pte = NULL, *pte = NULL;
2188	unsigned int largepage_lvl = 0;
2189	unsigned long lvl_pages = 0;
2190	phys_addr_t pteval;
2191	u64 attr;
2192
2193	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2194		return -EINVAL;
2195
2196	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2197		return -EINVAL;
2198
2199	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2200	attr |= DMA_FL_PTE_PRESENT;
2201	if (domain->use_first_level) {
2202		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2203		if (prot & DMA_PTE_WRITE)
2204			attr |= DMA_FL_PTE_DIRTY;
2205	}
2206
2207	domain->has_mappings = true;
2208
2209	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2210
2211	while (nr_pages > 0) {
2212		uint64_t tmp;
2213
2214		if (!pte) {
2215			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2216					phys_pfn, nr_pages);
2217
2218			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2219					     gfp);
2220			if (!pte)
2221				return -ENOMEM;
2222			first_pte = pte;
2223
2224			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2225
2226			/* It is a large page */
2227			if (largepage_lvl > 1) {
2228				unsigned long end_pfn;
2229				unsigned long pages_to_remove;
2230
2231				pteval |= DMA_PTE_LARGE_PAGE;
2232				pages_to_remove = min_t(unsigned long, nr_pages,
2233							nr_pte_to_next_page(pte) * lvl_pages);
2234				end_pfn = iov_pfn + pages_to_remove - 1;
2235				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2236			} else {
2237				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2238			}
2239
2240		}
2241		/* We don't need a lock here; nobody else
2242		 * touches this IOVA range.
2243		 */
2244		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2245		if (tmp) {
2246			static int dumps = 5;
2247			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2248				iov_pfn, tmp, (unsigned long long)pteval);
2249			if (dumps) {
2250				dumps--;
2251				debug_dma_dump_mappings(NULL);
2252			}
2253			WARN_ON(1);
2254		}
2255
2256		nr_pages -= lvl_pages;
2257		iov_pfn += lvl_pages;
2258		phys_pfn += lvl_pages;
2259		pteval += lvl_pages * VTD_PAGE_SIZE;
2260
2261		/* If the next PTE would be the first in a new page, then we
2262		 * need to flush the cache on the entries we've just written.
2263		 * And then we'll need to recalculate 'pte', so clear it and
2264		 * let it get set again in the if (!pte) block above.
2265		 *
2266		 * If we're done (!nr_pages) we need to flush the cache too.
2267		 *
2268		 * Also if we've been setting superpages, we may need to
2269		 * recalculate 'pte' and switch back to smaller pages for the
2270		 * end of the mapping, if the trailing size is not enough to
2271		 * use another superpage (i.e. nr_pages < lvl_pages).
2272		 */
2273		pte++;
2274		if (!nr_pages || first_pte_in_page(pte) ||
2275		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2276			domain_flush_cache(domain, first_pte,
2277					   (void *)pte - (void *)first_pte);
2278			pte = NULL;
2279		}
2280	}
2281
2282	return 0;
2283}
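
/*
 * Usage sketch (illustrative): mapping 512 pages at a 2MiB-aligned IOVA
 * backed by 2MiB-aligned physical memory lets hardware_largepage_caps()
 * return level 2, so a single DMA_PTE_LARGE_PAGE entry is written and the
 * loop above completes in one iteration; without superpage support, 512
 * individual 4KiB PTEs are written instead and flushed (if required) once
 * the page-table page is full.
 */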
2284
2285static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2286{
2287	struct intel_iommu *iommu = info->iommu;
2288	struct context_entry *context;
2289	u16 did_old;
2290
2291	if (!iommu)
2292		return;
2293
2294	spin_lock(&iommu->lock);
2295	context = iommu_context_addr(iommu, bus, devfn, 0);
2296	if (!context) {
2297		spin_unlock(&iommu->lock);
2298		return;
2299	}
2300
2301	if (sm_supported(iommu)) {
2302		if (hw_pass_through && domain_type_is_si(info->domain))
2303			did_old = FLPT_DEFAULT_DID;
2304		else
2305			did_old = domain_id_iommu(info->domain, iommu);
2306	} else {
2307		did_old = context_domain_id(context);
2308	}
2309
2310	context_clear_entry(context);
2311	__iommu_flush_cache(iommu, context, sizeof(*context));
2312	spin_unlock(&iommu->lock);
2313	iommu->flush.flush_context(iommu,
2314				   did_old,
2315				   (((u16)bus) << 8) | devfn,
2316				   DMA_CCMD_MASK_NOBIT,
2317				   DMA_CCMD_DEVICE_INVL);
2318
2319	if (sm_supported(iommu))
2320		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2321
2322	iommu->flush.flush_iotlb(iommu,
2323				 did_old,
2324				 0,
2325				 0,
2326				 DMA_TLB_DSI_FLUSH);
2327
2328	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2329}
2330
2331static int domain_setup_first_level(struct intel_iommu *iommu,
2332				    struct dmar_domain *domain,
2333				    struct device *dev,
2334				    u32 pasid)
2335{
2336	struct dma_pte *pgd = domain->pgd;
2337	int agaw, level;
2338	int flags = 0;
2339
2340	/*
2341	 * Skip the top levels of the page tables for an IOMMU with
2342	 * a smaller AGAW than the default. Unnecessary for PT mode.
2343	 */
2344	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2345		pgd = phys_to_virt(dma_pte_addr(pgd));
2346		if (!dma_pte_present(pgd))
2347			return -ENOMEM;
2348	}
2349
2350	level = agaw_to_level(agaw);
2351	if (level != 4 && level != 5)
2352		return -EINVAL;
2353
2354	if (level == 5)
2355		flags |= PASID_FLAG_FL5LP;
2356
2357	if (domain->force_snooping)
2358		flags |= PASID_FLAG_PAGE_SNOOP;
2359
2360	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2361					     domain_id_iommu(domain, iommu),
2362					     flags);
2363}
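
/*
 * Illustrative: an adjusted AGAW of 2 corresponds to 4-level first-stage
 * paging (48-bit input addresses), while an AGAW of 3 selects 5-level
 * paging (57-bit) and sets PASID_FLAG_FL5LP; the function above rejects
 * any other width with -EINVAL.
 */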
2364
2365static bool dev_is_real_dma_subdevice(struct device *dev)
2366{
2367	return dev && dev_is_pci(dev) &&
2368	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2369}
2370
2371static int iommu_domain_identity_map(struct dmar_domain *domain,
2372				     unsigned long first_vpfn,
2373				     unsigned long last_vpfn)
2374{
2375	/*
2376	 * The RMRR range might overlap with a physical memory range,
2377	 * so clear it first.
2378	 */
2379	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2380
2381	return __domain_mapping(domain, first_vpfn,
2382				first_vpfn, last_vpfn - first_vpfn + 1,
2383				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2384}
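
/*
 * For example (illustrative; assumes 4KiB VT-d pages): identity-mapping
 * pfns 0x10000-0x1ffff makes IOVAs 0x10000000-0x1fffffff translate to the
 * same physical addresses with read/write permission.
 */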
2385
2386static int md_domain_init(struct dmar_domain *domain, int guest_width);
2387
2388static int __init si_domain_init(int hw)
2389{
2390	struct dmar_rmrr_unit *rmrr;
2391	struct device *dev;
2392	int i, nid, ret;
2393
2394	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2395	if (!si_domain)
2396		return -EFAULT;
2397
2398	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2399		domain_exit(si_domain);
2400		si_domain = NULL;
2401		return -EFAULT;
2402	}
2403
2404	if (hw)
2405		return 0;
2406
2407	for_each_online_node(nid) {
2408		unsigned long start_pfn, end_pfn;
2409		int i;
2410
2411		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2412			ret = iommu_domain_identity_map(si_domain,
2413					mm_to_dma_pfn_start(start_pfn),
2414					mm_to_dma_pfn_end(end_pfn));
2415			if (ret)
2416				return ret;
2417		}
2418	}
2419
2420	/*
2421	 * Identity map the RMRRs so that devices with RMRRs can also use
2422	 * the si_domain.
2423	 */
2424	for_each_rmrr_units(rmrr) {
2425		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2426					  i, dev) {
2427			unsigned long long start = rmrr->base_address;
2428			unsigned long long end = rmrr->end_address;
2429
2430			if (WARN_ON(end < start ||
2431				    end >> agaw_to_width(si_domain->agaw)))
2432				continue;
2433
2434			ret = iommu_domain_identity_map(si_domain,
2435					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2436					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2437			if (ret)
2438				return ret;
2439		}
2440	}
2441
2442	return 0;
2443}
2444
2445static int dmar_domain_attach_device(struct dmar_domain *domain,
2446				     struct device *dev)
2447{
2448	struct device_domain_info *info = dev_iommu_priv_get(dev);
2449	struct intel_iommu *iommu;
2450	unsigned long flags;
2451	u8 bus, devfn;
2452	int ret;
2453
2454	iommu = device_to_iommu(dev, &bus, &devfn);
2455	if (!iommu)
2456		return -ENODEV;
2457
2458	ret = domain_attach_iommu(domain, iommu);
2459	if (ret)
2460		return ret;
2461	info->domain = domain;
2462	spin_lock_irqsave(&domain->lock, flags);
2463	list_add(&info->link, &domain->devices);
2464	spin_unlock_irqrestore(&domain->lock, flags);
2465
2466	/* PASID table is mandatory for a PCI device in scalable mode. */
2467	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2468		/* Setup the PASID entry for requests without PASID: */
2469		if (hw_pass_through && domain_type_is_si(domain))
2470			ret = intel_pasid_setup_pass_through(iommu, domain,
2471					dev, IOMMU_NO_PASID);
2472		else if (domain->use_first_level)
2473			ret = domain_setup_first_level(iommu, domain, dev,
2474					IOMMU_NO_PASID);
2475		else
2476			ret = intel_pasid_setup_second_level(iommu, domain,
2477					dev, IOMMU_NO_PASID);
2478		if (ret) {
2479			dev_err(dev, "Setup RID2PASID failed\n");
2480			device_block_translation(dev);
2481			return ret;
2482		}
2483	}
2484
2485	ret = domain_context_mapping(domain, dev);
2486	if (ret) {
2487		dev_err(dev, "Domain context map failed\n");
2488		device_block_translation(dev);
2489		return ret;
2490	}
2491
2492	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2493		iommu_enable_pci_caps(info);
2494
2495	return 0;
2496}
2497
2498/**
2499 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2500 * is relaxable (i.e. it is allowed to be left unenforced under some conditions)
2501 * @dev: device handle
2502 *
2503 * We assume that PCI USB devices with RMRRs have them largely
2504 * for historical reasons and that the RMRR space is not actively used post
2505 * boot.  This exclusion may change if vendors begin to abuse it.
2506 *
2507 * The same exception is made for graphics devices, with the requirement that
2508 * any use of the RMRR regions will be torn down before assigning the device
2509 * to a guest.
2510 *
2511 * Return: true if the RMRR is relaxable, false otherwise
2512 */
2513static bool device_rmrr_is_relaxable(struct device *dev)
2514{
2515	struct pci_dev *pdev;
2516
2517	if (!dev_is_pci(dev))
2518		return false;
2519
2520	pdev = to_pci_dev(dev);
2521	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2522		return true;
2523	else
2524		return false;
2525}
2526
2527/*
2528 * Return the required default domain type for a specific device.
2529 *
2530 * @dev: the device in query
2532 *
2533 * Returns:
2534 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2535 *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2536 *  - 0: both identity and dynamic domains work for this device
2537 */
2538static int device_def_domain_type(struct device *dev)
2539{
2540	if (dev_is_pci(dev)) {
2541		struct pci_dev *pdev = to_pci_dev(dev);
2542
2543		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2544			return IOMMU_DOMAIN_IDENTITY;
2545
2546		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2547			return IOMMU_DOMAIN_IDENTITY;
2548	}
2549
2550	return 0;
2551}
2552
2553static void intel_iommu_init_qi(struct intel_iommu *iommu)
2554{
2555	/*
2556	 * Start from a sane iommu hardware state.
2557	 * If queued invalidation was already initialized by us
2558	 * (for example, while enabling interrupt remapping), then
2559	 * things are already rolling from a sane state.
2560	 */
2561	if (!iommu->qi) {
2562		/*
2563		 * Clear any previous faults.
2564		 */
2565		dmar_fault(-1, iommu);
2566		/*
2567		 * Disable queued invalidation if supported and already enabled
2568		 * before OS handover.
2569		 */
2570		dmar_disable_qi(iommu);
2571	}
2572
2573	if (dmar_enable_qi(iommu)) {
2574		/*
2575		 * Queued Invalidate not enabled, use Register Based Invalidate
2576		 */
2577		iommu->flush.flush_context = __iommu_flush_context;
2578		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2579		pr_info("%s: Using Register based invalidation\n",
2580			iommu->name);
2581	} else {
2582		iommu->flush.flush_context = qi_flush_context;
2583		iommu->flush.flush_iotlb = qi_flush_iotlb;
2584		pr_info("%s: Using Queued invalidation\n", iommu->name);
2585	}
2586}
2587
2588static int copy_context_table(struct intel_iommu *iommu,
2589			      struct root_entry *old_re,
2590			      struct context_entry **tbl,
2591			      int bus, bool ext)
2592{
2593	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2594	struct context_entry *new_ce = NULL, ce;
2595	struct context_entry *old_ce = NULL;
2596	struct root_entry re;
2597	phys_addr_t old_ce_phys;
2598
2599	tbl_idx = ext ? bus * 2 : bus;
2600	memcpy(&re, old_re, sizeof(re));
2601
2602	for (devfn = 0; devfn < 256; devfn++) {
2603		/* First calculate the correct index */
2604		idx = (ext ? devfn * 2 : devfn) % 256;
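		/*
		 * Illustrative: in extended (ext) mode each device uses two
		 * context_entry slots, so devfns 0x00-0x7f land in the table
		 * referenced by the lower context table pointer, while devfn
		 * 0x80 wraps idx back to 0 and continues in the upper table
		 * (tbl_idx + 1).
		 */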
2605
2606		if (idx == 0) {
2607			/* First save what we may have and clean up */
2608			if (new_ce) {
2609				tbl[tbl_idx] = new_ce;
2610				__iommu_flush_cache(iommu, new_ce,
2611						    VTD_PAGE_SIZE);
2612				pos = 1;
2613			}
2614
2615			if (old_ce)
2616				memunmap(old_ce);
2617
2618			ret = 0;
2619			if (devfn < 0x80)
2620				old_ce_phys = root_entry_lctp(&re);
2621			else
2622				old_ce_phys = root_entry_uctp(&re);
2623
2624			if (!old_ce_phys) {
2625				if (ext && devfn == 0) {
2626					/* No LCTP, try UCTP */
2627					devfn = 0x7f;
2628					continue;
2629				} else {
2630					goto out;
2631				}
2632			}
2633
2634			ret = -ENOMEM;
2635			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2636					MEMREMAP_WB);
2637			if (!old_ce)
2638				goto out;
2639
2640			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2641			if (!new_ce)
2642				goto out_unmap;
2643
2644			ret = 0;
2645		}
2646
2647		/* Now copy the context entry */
2648		memcpy(&ce, old_ce + idx, sizeof(ce));
2649
2650		if (!context_present(&ce))
2651			continue;
2652
2653		did = context_domain_id(&ce);
2654		if (did >= 0 && did < cap_ndoms(iommu->cap))
2655			set_bit(did, iommu->domain_ids);
2656
2657		set_context_copied(iommu, bus, devfn);
2658		new_ce[idx] = ce;
2659	}
2660
2661	tbl[tbl_idx + pos] = new_ce;
2662
2663	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2664
2665out_unmap:
2666	memunmap(old_ce);
2667
2668out:
2669	return ret;
2670}
2671
2672static int copy_translation_tables(struct intel_iommu *iommu)
2673{
2674	struct context_entry **ctxt_tbls;
2675	struct root_entry *old_rt;
2676	phys_addr_t old_rt_phys;
2677	int ctxt_table_entries;
2678	u64 rtaddr_reg;
2679	int bus, ret;
2680	bool new_ext, ext;
2681
2682	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2683	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2684	new_ext    = !!sm_supported(iommu);
2685
2686	/*
2687	 * The RTT bit can only be changed when translation is disabled,
2688	 * but disabling translation means opening a window for data
2689	 * corruption. So bail out and don't copy anything if we would
2690	 * have to change the bit.
2691	 */
2692	if (new_ext != ext)
2693		return -EINVAL;
2694
2695	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2696	if (!iommu->copied_tables)
2697		return -ENOMEM;
2698
2699	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2700	if (!old_rt_phys)
2701		return -EINVAL;
2702
2703	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2704	if (!old_rt)
2705		return -ENOMEM;
2706
2707	/* This is too big for the stack - allocate it from slab */
2708	ctxt_table_entries = ext ? 512 : 256;
2709	ret = -ENOMEM;
2710	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2711	if (!ctxt_tbls)
2712		goto out_unmap;
2713
2714	for (bus = 0; bus < 256; bus++) {
2715		ret = copy_context_table(iommu, &old_rt[bus],
2716					 ctxt_tbls, bus, ext);
2717		if (ret) {
2718			pr_err("%s: Failed to copy context table for bus %d\n",
2719				iommu->name, bus);
2720			continue;
2721		}
2722	}
2723
2724	spin_lock(&iommu->lock);
2725
2726	/* Context tables are copied, now write them to the root_entry table */
2727	for (bus = 0; bus < 256; bus++) {
2728		int idx = ext ? bus * 2 : bus;
2729		u64 val;
2730
2731		if (ctxt_tbls[idx]) {
2732			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2733			iommu->root_entry[bus].lo = val;
2734		}
2735
2736		if (!ext || !ctxt_tbls[idx + 1])
2737			continue;
2738
2739		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2740		iommu->root_entry[bus].hi = val;
2741	}
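
	/*
	 * Illustrative: in extended mode, bus 3 takes ctxt_tbls[6] into
	 * root_entry[3].lo and ctxt_tbls[7] into root_entry[3].hi; the
	 * "| 1" above sets the present bit of each root-entry half.
	 */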
2742
2743	spin_unlock(&iommu->lock);
2744
2745	kfree(ctxt_tbls);
2746
2747	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2748
2749	ret = 0;
2750
2751out_unmap:
2752	memunmap(old_rt);
2753
2754	return ret;
2755}
2756
2757static int __init init_dmars(void)
2758{
2759	struct dmar_drhd_unit *drhd;
2760	struct intel_iommu *iommu;
2761	int ret;
2762
2763	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2764	if (ret)
2765		goto free_iommu;
2766
2767	for_each_iommu(iommu, drhd) {
2768		if (drhd->ignored) {
2769			iommu_disable_translation(iommu);
2770			continue;
2771		}
2772
2773		/*
2774		 * Find the max PASID size of all IOMMUs in the system.
2775		 * We need to ensure the system PASID table is no bigger
2776		 * than the smallest supported size.
2777		 */
2778		if (pasid_supported(iommu)) {
2779			u32 temp = 2 << ecap_pss(iommu->ecap);
2780
2781			intel_pasid_max_id = min_t(u32, temp,
2782						   intel_pasid_max_id);
2783		}
2784
2785		intel_iommu_init_qi(iommu);
2786
2787		ret = iommu_init_domains(iommu);
2788		if (ret)
2789			goto free_iommu;
2790
2791		init_translation_status(iommu);
2792
2793		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2794			iommu_disable_translation(iommu);
2795			clear_translation_pre_enabled(iommu);
2796			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2797				iommu->name);
2798		}
2799
2800		/*
2801		 * TBD:
2802		 * we could share the same root & context tables
2803		 * among all IOMMUs. Need to split this out later.
2804		 */
2805		ret = iommu_alloc_root_entry(iommu);
2806		if (ret)
2807			goto free_iommu;
2808
2809		if (translation_pre_enabled(iommu)) {
2810			pr_info("Translation already enabled - trying to copy translation structures\n");
2811
2812			ret = copy_translation_tables(iommu);
2813			if (ret) {
2814				/*
2815				 * We found the IOMMU with translation
2816				 * enabled - but failed to copy over the
2817				 * old root-entry table. Try to proceed
2818				 * by disabling translation now and
2819				 * allocating a clean root-entry table.
2820				 * This might cause DMAR faults, but
2821				 * probably the dump will still succeed.
2822				 */
2823				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2824				       iommu->name);
2825				iommu_disable_translation(iommu);
2826				clear_translation_pre_enabled(iommu);
2827			} else {
2828				pr_info("Copied translation tables from previous kernel for %s\n",
2829					iommu->name);
2830			}
2831		}
2832
2833		if (!ecap_pass_through(iommu->ecap))
2834			hw_pass_through = 0;
2835		intel_svm_check(iommu);
2836	}
2837
2838	/*
2839	 * Now that qi is enabled on all iommus, set the root entry and flush
2840	 * caches. This is required on some Intel X58 chipsets, otherwise the
2841	 * flush_context function will loop forever and the boot hangs.
2842	 */
2843	for_each_active_iommu(iommu, drhd) {
2844		iommu_flush_write_buffer(iommu);
2845		iommu_set_root_entry(iommu);
2846	}
2847
2848#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2849	dmar_map_gfx = 0;
2850#endif
2851
2852	if (!dmar_map_gfx)
2853		iommu_identity_mapping |= IDENTMAP_GFX;
2854
2855	check_tylersburg_isoch();
2856
2857	ret = si_domain_init(hw_pass_through);
2858	if (ret)
2859		goto free_iommu;
2860
2861	/*
2862	 * for each drhd
2863	 *   enable fault log
2864	 *   global invalidate context cache
2865	 *   global invalidate iotlb
2866	 *   enable translation
2867	 */
2868	for_each_iommu(iommu, drhd) {
2869		if (drhd->ignored) {
2870			/*
2871			 * we always have to disable PMRs or DMA may fail on
2872			 * this device
2873			 */
2874			if (force_on)
2875				iommu_disable_protect_mem_regions(iommu);
2876			continue;
2877		}
2878
2879		iommu_flush_write_buffer(iommu);
2880
2881#ifdef CONFIG_INTEL_IOMMU_SVM
2882		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2883			/*
2884			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2885			 * could cause a lock ordering problem, so drop the lock
			 * around intel_svm_enable_prq().
2886			 */
2887			up_write(&dmar_global_lock);
2888			ret = intel_svm_enable_prq(iommu);
2889			down_write(&dmar_global_lock);
2890			if (ret)
2891				goto free_iommu;
2892		}
2893#endif
2894		ret = dmar_set_interrupt(iommu);
2895		if (ret)
2896			goto free_iommu;
2897	}
2898
2899	return 0;
2900
2901free_iommu:
2902	for_each_active_iommu(iommu, drhd) {
2903		disable_dmar_iommu(iommu);
2904		free_dmar_iommu(iommu);
2905	}
2906	if (si_domain) {
2907		domain_exit(si_domain);
2908		si_domain = NULL;
2909	}
2910
2911	return ret;
2912}
2913
2914static void __init init_no_remapping_devices(void)
2915{
2916	struct dmar_drhd_unit *drhd;
2917	struct device *dev;
2918	int i;
2919
2920	for_each_drhd_unit(drhd) {
2921		if (!drhd->include_all) {
2922			for_each_active_dev_scope(drhd->devices,
2923						  drhd->devices_cnt, i, dev)
2924				break;
2925			/* ignore DMAR unit if no devices exist */
2926			if (i == drhd->devices_cnt)
2927				drhd->ignored = 1;
2928		}
2929	}
2930
2931	for_each_active_drhd_unit(drhd) {
2932		if (drhd->include_all)
2933			continue;
2934
2935		for_each_active_dev_scope(drhd->devices,
2936					  drhd->devices_cnt, i, dev)
2937			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2938				break;
2939		if (i < drhd->devices_cnt)
2940			continue;
2941
2942		/* This IOMMU has *only* gfx devices. Either bypass it or
2943		   set the gfx_dedicated flag, as appropriate */
2944		drhd->gfx_dedicated = 1;
2945		if (!dmar_map_gfx)
2946			drhd->ignored = 1;
2947	}
2948}
2949
2950#ifdef CONFIG_SUSPEND
2951static int init_iommu_hw(void)
2952{
2953	struct dmar_drhd_unit *drhd;
2954	struct intel_iommu *iommu = NULL;
2955	int ret;
2956
2957	for_each_active_iommu(iommu, drhd) {
2958		if (iommu->qi) {
2959			ret = dmar_reenable_qi(iommu);
2960			if (ret)
2961				return ret;
2962		}
2963	}
2964
2965	for_each_iommu(iommu, drhd) {
2966		if (drhd->ignored) {
2967			/*
2968			 * we always have to disable PMRs or DMA may fail on
2969			 * this device
2970			 */
2971			if (force_on)
2972				iommu_disable_protect_mem_regions(iommu);
2973			continue;
2974		}
2975
2976		iommu_flush_write_buffer(iommu);
2977		iommu_set_root_entry(iommu);
2978		iommu_enable_translation(iommu);
2979		iommu_disable_protect_mem_regions(iommu);
2980	}
2981
2982	return 0;
2983}
2984
2985static void iommu_flush_all(void)
2986{
2987	struct dmar_drhd_unit *drhd;
2988	struct intel_iommu *iommu;
2989
2990	for_each_active_iommu(iommu, drhd) {
2991		iommu->flush.flush_context(iommu, 0, 0, 0,
2992					   DMA_CCMD_GLOBAL_INVL);
2993		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2994					 DMA_TLB_GLOBAL_FLUSH);
2995	}
2996}
2997
2998static int iommu_suspend(void)
2999{
3000	struct dmar_drhd_unit *drhd;
3001	struct intel_iommu *iommu = NULL;
3002	unsigned long flag;
3003
3004	iommu_flush_all();
3005
3006	for_each_active_iommu(iommu, drhd) {
3007		iommu_disable_translation(iommu);
3008
3009		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3010
3011		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3012			readl(iommu->reg + DMAR_FECTL_REG);
3013		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3014			readl(iommu->reg + DMAR_FEDATA_REG);
3015		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3016			readl(iommu->reg + DMAR_FEADDR_REG);
3017		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3018			readl(iommu->reg + DMAR_FEUADDR_REG);
3019
3020		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3021	}
3022	return 0;
3023}
3024
3025static void iommu_resume(void)
3026{
3027	struct dmar_drhd_unit *drhd;
3028	struct intel_iommu *iommu = NULL;
3029	unsigned long flag;
3030
3031	if (init_iommu_hw()) {
3032		if (force_on)
3033			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3034		else
3035			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3036		return;
3037	}
3038
3039	for_each_active_iommu(iommu, drhd) {
3040
3041		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3042
3043		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3044			iommu->reg + DMAR_FECTL_REG);
3045		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3046			iommu->reg + DMAR_FEDATA_REG);
3047		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3048			iommu->reg + DMAR_FEADDR_REG);
3049		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3050			iommu->reg + DMAR_FEUADDR_REG);
3051
3052		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3053	}
3054}
3055
3056static struct syscore_ops iommu_syscore_ops = {
3057	.resume		= iommu_resume,
3058	.suspend	= iommu_suspend,
3059};
3060
3061static void __init init_iommu_pm_ops(void)
3062{
3063	register_syscore_ops(&iommu_syscore_ops);
3064}
3065
3066#else
3067static inline void init_iommu_pm_ops(void) {}
3068#endif	/* CONFIG_SUSPEND */
3069
3070static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3071{
3072	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3073	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3074	    rmrr->end_address <= rmrr->base_address ||
3075	    arch_rmrr_sanity_check(rmrr))
3076		return -EINVAL;
3077
3078	return 0;
3079}
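
/*
 * For example (illustrative): an RMRR of [0xe0000, 0xfffff] passes -- its
 * base and end + 1 are both 4KiB aligned and end > base -- while
 * [0xe0000, 0xeffef] is rejected for misalignment. arch_rmrr_sanity_check()
 * may add further platform-specific constraints.
 */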
3080
3081int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3082{
3083	struct acpi_dmar_reserved_memory *rmrr;
3084	struct dmar_rmrr_unit *rmrru;
3085
3086	rmrr = (struct acpi_dmar_reserved_memory *)header;
3087	if (rmrr_sanity_check(rmrr)) {
3088		pr_warn(FW_BUG
3089			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3090			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3091			   rmrr->base_address, rmrr->end_address,
3092			   dmi_get_system_info(DMI_BIOS_VENDOR),
3093			   dmi_get_system_info(DMI_BIOS_VERSION),
3094			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3095		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3096	}
3097
3098	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3099	if (!rmrru)
3100		goto out;
3101
3102	rmrru->hdr = header;
3103
3104	rmrru->base_address = rmrr->base_address;
3105	rmrru->end_address = rmrr->end_address;
3106
3107	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3108				((void *)rmrr) + rmrr->header.length,
3109				&rmrru->devices_cnt);
3110	if (rmrru->devices_cnt && rmrru->devices == NULL)
3111		goto free_rmrru;
3112
3113	list_add(&rmrru->list, &dmar_rmrr_units);
3114
3115	return 0;
3116free_rmrru:
3117	kfree(rmrru);
3118out:
3119	return -ENOMEM;
3120}
3121
3122static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3123{
3124	struct dmar_atsr_unit *atsru;
3125	struct acpi_dmar_atsr *tmp;
3126
3127	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3128				dmar_rcu_check()) {
3129		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3130		if (atsr->segment != tmp->segment)
3131			continue;
3132		if (atsr->header.length != tmp->header.length)
3133			continue;
3134		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3135			return atsru;
3136	}
3137
3138	return NULL;
3139}
3140
3141int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3142{
3143	struct acpi_dmar_atsr *atsr;
3144	struct dmar_atsr_unit *atsru;
3145
3146	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3147		return 0;
3148
3149	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3150	atsru = dmar_find_atsr(atsr);
3151	if (atsru)
3152		return 0;
3153
3154	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3155	if (!atsru)
3156		return -ENOMEM;
3157
3158	/*
3159	 * If memory is allocated from slab by ACPI _DSM method, we need to
3160	 * copy the memory content because the memory buffer will be freed
3161	 * on return.
3162	 */
3163	atsru->hdr = (void *)(atsru + 1);
3164	memcpy(atsru->hdr, hdr, hdr->length);
3165	atsru->include_all = atsr->flags & 0x1;
3166	if (!atsru->include_all) {
3167		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3168				(void *)atsr + atsr->header.length,
3169				&atsru->devices_cnt);
3170		if (atsru->devices_cnt && atsru->devices == NULL) {
3171			kfree(atsru);
3172			return -ENOMEM;
3173		}
3174	}
3175
3176	list_add_rcu(&atsru->list, &dmar_atsr_units);
3177
3178	return 0;
3179}
3180
3181static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3182{
3183	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3184	kfree(atsru);
3185}
3186
3187int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3188{
3189	struct acpi_dmar_atsr *atsr;
3190	struct dmar_atsr_unit *atsru;
3191
3192	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3193	atsru = dmar_find_atsr(atsr);
3194	if (atsru) {
3195		list_del_rcu(&atsru->list);
3196		synchronize_rcu();
3197		intel_iommu_free_atsr(atsru);
3198	}
3199
3200	return 0;
3201}
3202
3203int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3204{
3205	int i;
3206	struct device *dev;
3207	struct acpi_dmar_atsr *atsr;
3208	struct dmar_atsr_unit *atsru;
3209
3210	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3211	atsru = dmar_find_atsr(atsr);
3212	if (!atsru)
3213		return 0;
3214
3215	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3216		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3217					  i, dev)
3218			return -EBUSY;
3219	}
3220
3221	return 0;
3222}
3223
3224static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3225{
3226	struct dmar_satc_unit *satcu;
3227	struct acpi_dmar_satc *tmp;
3228
3229	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3230				dmar_rcu_check()) {
3231		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3232		if (satc->segment != tmp->segment)
3233			continue;
3234		if (satc->header.length != tmp->header.length)
3235			continue;
3236		if (memcmp(satc, tmp, satc->header.length) == 0)
3237			return satcu;
3238	}
3239
3240	return NULL;
3241}
3242
3243int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3244{
3245	struct acpi_dmar_satc *satc;
3246	struct dmar_satc_unit *satcu;
3247
3248	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3249		return 0;
3250
3251	satc = container_of(hdr, struct acpi_dmar_satc, header);
3252	satcu = dmar_find_satc(satc);
3253	if (satcu)
3254		return 0;
3255
3256	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3257	if (!satcu)
3258		return -ENOMEM;
3259
3260	satcu->hdr = (void *)(satcu + 1);
3261	memcpy(satcu->hdr, hdr, hdr->length);
3262	satcu->atc_required = satc->flags & 0x1;
3263	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3264					      (void *)satc + satc->header.length,
3265					      &satcu->devices_cnt);
3266	if (satcu->devices_cnt && !satcu->devices) {
3267		kfree(satcu);
3268		return -ENOMEM;
3269	}
3270	list_add_rcu(&satcu->list, &dmar_satc_units);
3271
3272	return 0;
3273}
3274
3275static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3276{
3277	int sp, ret;
3278	struct intel_iommu *iommu = dmaru->iommu;
3279
3280	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3281	if (ret)
3282		goto out;
3283
3284	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3285		pr_warn("%s: Doesn't support hardware pass through.\n",
3286			iommu->name);
3287		return -ENXIO;
3288	}
3289
3290	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3291	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3292		pr_warn("%s: Doesn't support large page.\n",
3293			iommu->name);
3294		return -ENXIO;
3295	}
3296
3297	/*
3298	 * Disable translation if already enabled prior to OS handover.
3299	 */
3300	if (iommu->gcmd & DMA_GCMD_TE)
3301		iommu_disable_translation(iommu);
3302
3303	ret = iommu_init_domains(iommu);
3304	if (ret == 0)
3305		ret = iommu_alloc_root_entry(iommu);
3306	if (ret)
3307		goto out;
3308
3309	intel_svm_check(iommu);
3310
3311	if (dmaru->ignored) {
3312		/*
3313		 * we always have to disable PMRs or DMA may fail on this device
3314		 */
3315		if (force_on)
3316			iommu_disable_protect_mem_regions(iommu);
3317		return 0;
3318	}
3319
3320	intel_iommu_init_qi(iommu);
3321	iommu_flush_write_buffer(iommu);
3322
3323#ifdef CONFIG_INTEL_IOMMU_SVM
3324	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3325		ret = intel_svm_enable_prq(iommu);
3326		if (ret)
3327			goto disable_iommu;
3328	}
3329#endif
3330	ret = dmar_set_interrupt(iommu);
3331	if (ret)
3332		goto disable_iommu;
3333
3334	iommu_set_root_entry(iommu);
3335	iommu_enable_translation(iommu);
3336
3337	iommu_disable_protect_mem_regions(iommu);
3338	return 0;
3339
3340disable_iommu:
3341	disable_dmar_iommu(iommu);
3342out:
3343	free_dmar_iommu(iommu);
3344	return ret;
3345}
3346
3347int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3348{
3349	int ret = 0;
3350	struct intel_iommu *iommu = dmaru->iommu;
3351
3352	if (!intel_iommu_enabled)
3353		return 0;
3354	if (iommu == NULL)
3355		return -EINVAL;
3356
3357	if (insert) {
3358		ret = intel_iommu_add(dmaru);
3359	} else {
3360		disable_dmar_iommu(iommu);
3361		free_dmar_iommu(iommu);
3362	}
3363
3364	return ret;
3365}
3366
3367static void intel_iommu_free_dmars(void)
3368{
3369	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3370	struct dmar_atsr_unit *atsru, *atsr_n;
3371	struct dmar_satc_unit *satcu, *satc_n;
3372
3373	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3374		list_del(&rmrru->list);
3375		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3376		kfree(rmrru);
3377	}
3378
3379	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3380		list_del(&atsru->list);
3381		intel_iommu_free_atsr(atsru);
3382	}
3383	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3384		list_del(&satcu->list);
3385		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3386		kfree(satcu);
3387	}
3388}
3389
3390static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3391{
3392	struct dmar_satc_unit *satcu;
3393	struct acpi_dmar_satc *satc;
3394	struct device *tmp;
3395	int i;
3396
3397	dev = pci_physfn(dev);
3398	rcu_read_lock();
3399
3400	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3401		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3402		if (satc->segment != pci_domain_nr(dev->bus))
3403			continue;
3404		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3405			if (to_pci_dev(tmp) == dev)
3406				goto out;
3407	}
3408	satcu = NULL;
3409out:
3410	rcu_read_unlock();
3411	return satcu;
3412}
3413
3414static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3415{
3416	int i, ret = 1;
3417	struct pci_bus *bus;
3418	struct pci_dev *bridge = NULL;
3419	struct device *tmp;
3420	struct acpi_dmar_atsr *atsr;
3421	struct dmar_atsr_unit *atsru;
3422	struct dmar_satc_unit *satcu;
3423
3424	dev = pci_physfn(dev);
3425	satcu = dmar_find_matched_satc_unit(dev);
3426	if (satcu)
3427		/*
3428		 * This device supports ATS because it is listed in a SATC
3429		 * table. When the IOMMU is in legacy mode, the hardware
3430		 * enables ATS automatically for a device that requires it,
3431		 * so the OS should not enable ATS on the device again, to
3432		 * avoid duplicated TLB invalidations.
3433		 */
3434		return !(satcu->atc_required && !sm_supported(iommu));
3435
3436	for (bus = dev->bus; bus; bus = bus->parent) {
3437		bridge = bus->self;
3438		/* If it's an integrated device, allow ATS */
3439		if (!bridge)
3440			return 1;
3441		/* Connected via non-PCIe: no ATS */
3442		if (!pci_is_pcie(bridge) ||
3443		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3444			return 0;
3445		/* If we found the root port, look it up in the ATSR */
3446		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3447			break;
3448	}
3449
3450	rcu_read_lock();
3451	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3452		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3453		if (atsr->segment != pci_domain_nr(dev->bus))
3454			continue;
3455
3456		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3457			if (tmp == &bridge->dev)
3458				goto out;
3459
3460		if (atsru->include_all)
3461			goto out;
3462	}
3463	ret = 0;
3464out:
3465	rcu_read_unlock();
3466
3467	return ret;
3468}
3469
3470int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3471{
3472	int ret;
3473	struct dmar_rmrr_unit *rmrru;
3474	struct dmar_atsr_unit *atsru;
3475	struct dmar_satc_unit *satcu;
3476	struct acpi_dmar_atsr *atsr;
3477	struct acpi_dmar_reserved_memory *rmrr;
3478	struct acpi_dmar_satc *satc;
3479
3480	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3481		return 0;
3482
3483	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3484		rmrr = container_of(rmrru->hdr,
3485				    struct acpi_dmar_reserved_memory, header);
3486		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3487			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3488				((void *)rmrr) + rmrr->header.length,
3489				rmrr->segment, rmrru->devices,
3490				rmrru->devices_cnt);
3491			if (ret < 0)
3492				return ret;
3493		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3494			dmar_remove_dev_scope(info, rmrr->segment,
3495				rmrru->devices, rmrru->devices_cnt);
3496		}
3497	}
3498
3499	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3500		if (atsru->include_all)
3501			continue;
3502
3503		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3504		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3505			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3506					(void *)atsr + atsr->header.length,
3507					atsr->segment, atsru->devices,
3508					atsru->devices_cnt);
3509			if (ret > 0)
3510				break;
3511			else if (ret < 0)
3512				return ret;
3513		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3514			if (dmar_remove_dev_scope(info, atsr->segment,
3515					atsru->devices, atsru->devices_cnt))
3516				break;
3517		}
3518	}
3519	list_for_each_entry(satcu, &dmar_satc_units, list) {
3520		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3521		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3522			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3523					(void *)satc + satc->header.length,
3524					satc->segment, satcu->devices,
3525					satcu->devices_cnt);
3526			if (ret > 0)
3527				break;
3528			else if (ret < 0)
3529				return ret;
3530		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3531			if (dmar_remove_dev_scope(info, satc->segment,
3532					satcu->devices, satcu->devices_cnt))
3533				break;
3534		}
3535	}
3536
3537	return 0;
3538}
3539
3540static int intel_iommu_memory_notifier(struct notifier_block *nb,
3541				       unsigned long val, void *v)
3542{
3543	struct memory_notify *mhp = v;
3544	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3545	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3546			mhp->nr_pages - 1);
3547
3548	switch (val) {
3549	case MEM_GOING_ONLINE:
3550		if (iommu_domain_identity_map(si_domain,
3551					      start_vpfn, last_vpfn)) {
3552			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3553				start_vpfn, last_vpfn);
3554			return NOTIFY_BAD;
3555		}
3556		break;
3557
3558	case MEM_OFFLINE:
3559	case MEM_CANCEL_ONLINE:
3560		{
3561			struct dmar_drhd_unit *drhd;
3562			struct intel_iommu *iommu;
3563			LIST_HEAD(freelist);
3564
3565			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3566
3567			rcu_read_lock();
3568			for_each_active_iommu(iommu, drhd)
3569				iommu_flush_iotlb_psi(iommu, si_domain,
3570					start_vpfn, mhp->nr_pages,
3571					list_empty(&freelist), 0);
3572			rcu_read_unlock();
3573			put_pages_list(&freelist);
3574		}
3575		break;
3576	}
3577
3578	return NOTIFY_OK;
3579}
3580
3581static struct notifier_block intel_iommu_memory_nb = {
3582	.notifier_call = intel_iommu_memory_notifier,
3583	.priority = 0
3584};
3585
3586static void intel_disable_iommus(void)
3587{
3588	struct intel_iommu *iommu = NULL;
3589	struct dmar_drhd_unit *drhd;
3590
3591	for_each_iommu(iommu, drhd)
3592		iommu_disable_translation(iommu);
3593}
3594
3595void intel_iommu_shutdown(void)
3596{
3597	struct dmar_drhd_unit *drhd;
3598	struct intel_iommu *iommu = NULL;
3599
3600	if (no_iommu || dmar_disabled)
3601		return;
3602
3603	down_write(&dmar_global_lock);
3604
3605	/* Disable PMRs explicitly here. */
3606	for_each_iommu(iommu, drhd)
3607		iommu_disable_protect_mem_regions(iommu);
3608
3609	/* Make sure the IOMMUs are switched off */
3610	intel_disable_iommus();
3611
3612	up_write(&dmar_global_lock);
3613}
3614
3615static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3616{
3617	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3618
3619	return container_of(iommu_dev, struct intel_iommu, iommu);
3620}
3621
3622static ssize_t version_show(struct device *dev,
3623			    struct device_attribute *attr, char *buf)
3624{
3625	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3626	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3627	return sysfs_emit(buf, "%d:%d\n",
3628			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3629}
3630static DEVICE_ATTR_RO(version);
3631
3632static ssize_t address_show(struct device *dev,
3633			    struct device_attribute *attr, char *buf)
3634{
3635	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3636	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3637}
3638static DEVICE_ATTR_RO(address);
3639
3640static ssize_t cap_show(struct device *dev,
3641			struct device_attribute *attr, char *buf)
3642{
3643	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3644	return sysfs_emit(buf, "%llx\n", iommu->cap);
3645}
3646static DEVICE_ATTR_RO(cap);
3647
3648static ssize_t ecap_show(struct device *dev,
3649			 struct device_attribute *attr, char *buf)
3650{
3651	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3652	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3653}
3654static DEVICE_ATTR_RO(ecap);
3655
3656static ssize_t domains_supported_show(struct device *dev,
3657				      struct device_attribute *attr, char *buf)
3658{
3659	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3660	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3661}
3662static DEVICE_ATTR_RO(domains_supported);
3663
3664static ssize_t domains_used_show(struct device *dev,
3665				 struct device_attribute *attr, char *buf)
3666{
3667	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3668	return sysfs_emit(buf, "%d\n",
3669			  bitmap_weight(iommu->domain_ids,
3670					cap_ndoms(iommu->cap)));
3671}
3672static DEVICE_ATTR_RO(domains_used);
3673
3674static struct attribute *intel_iommu_attrs[] = {
3675	&dev_attr_version.attr,
3676	&dev_attr_address.attr,
3677	&dev_attr_cap.attr,
3678	&dev_attr_ecap.attr,
3679	&dev_attr_domains_supported.attr,
3680	&dev_attr_domains_used.attr,
3681	NULL,
3682};
3683
3684static struct attribute_group intel_iommu_group = {
3685	.name = "intel-iommu",
3686	.attrs = intel_iommu_attrs,
3687};
3688
3689const struct attribute_group *intel_iommu_groups[] = {
3690	&intel_iommu_group,
3691	NULL,
3692};
3693
3694static inline bool has_external_pci(void)
3695{
3696	struct pci_dev *pdev = NULL;
3697
3698	for_each_pci_dev(pdev)
3699		if (pdev->external_facing) {
3700			pci_dev_put(pdev);
3701			return true;
3702		}
3703
3704	return false;
3705}
3706
3707static int __init platform_optin_force_iommu(void)
3708{
3709	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3710		return 0;
3711
3712	if (no_iommu || dmar_disabled)
3713		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3714
3715	/*
3716	 * If Intel-IOMMU is disabled by default, we will apply identity
3717	 * map for all devices except those marked as being untrusted.
3718	 */
3719	if (dmar_disabled)
3720		iommu_set_default_passthrough(false);
3721
3722	dmar_disabled = 0;
3723	no_iommu = 0;
3724
3725	return 1;
3726}
3727
3728static int __init probe_acpi_namespace_devices(void)
3729{
3730	struct dmar_drhd_unit *drhd;
3731	/* To avoid a -Wunused-but-set-variable warning. */
3732	struct intel_iommu *iommu __maybe_unused;
3733	struct device *dev;
3734	int i, ret = 0;
3735
3736	for_each_active_iommu(iommu, drhd) {
3737		for_each_active_dev_scope(drhd->devices,
3738					  drhd->devices_cnt, i, dev) {
3739			struct acpi_device_physical_node *pn;
3740			struct acpi_device *adev;
3741
3742			if (dev->bus != &acpi_bus_type)
3743				continue;
3744
3745			adev = to_acpi_device(dev);
3746			mutex_lock(&adev->physical_node_lock);
3747			list_for_each_entry(pn,
3748					    &adev->physical_node_list, node) {
3749				ret = iommu_probe_device(pn->dev);
3750				if (ret)
3751					break;
3752			}
3753			mutex_unlock(&adev->physical_node_lock);
3754
3755			if (ret)
3756				return ret;
3757		}
3758	}
3759
3760	return 0;
3761}
3762
3763static __init int tboot_force_iommu(void)
3764{
3765	if (!tboot_enabled())
3766		return 0;
3767
3768	if (no_iommu || dmar_disabled)
3769		pr_warn("Forcing Intel-IOMMU to enabled\n");
3770
3771	dmar_disabled = 0;
3772	no_iommu = 0;
3773
3774	return 1;
3775}
3776
3777int __init intel_iommu_init(void)
3778{
3779	int ret = -ENODEV;
3780	struct dmar_drhd_unit *drhd;
3781	struct intel_iommu *iommu;
3782
3783	/*
3784	 * Intel IOMMU is required for a TXT/tboot launch or platform
3785	 * opt in, so enforce that.
3786	 */
3787	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3788		    platform_optin_force_iommu();
3789
3790	down_write(&dmar_global_lock);
3791	if (dmar_table_init()) {
3792		if (force_on)
3793			panic("tboot: Failed to initialize DMAR table\n");
3794		goto out_free_dmar;
3795	}
3796
3797	if (dmar_dev_scope_init() < 0) {
3798		if (force_on)
3799			panic("tboot: Failed to initialize DMAR device scope\n");
3800		goto out_free_dmar;
3801	}
3802
3803	up_write(&dmar_global_lock);
3804
3805	/*
3806	 * The bus notifier takes the dmar_global_lock, so lockdep will
3807	 * complain later when we register it under the lock.
3808	 */
3809	dmar_register_bus_notifier();
3810
3811	down_write(&dmar_global_lock);
3812
3813	if (!no_iommu)
3814		intel_iommu_debugfs_init();
3815
3816	if (no_iommu || dmar_disabled) {
3817		/*
3818		 * We exit the function here to ensure the IOMMU's remapping and
3819		 * mempool aren't set up, which means that the IOMMU's PMRs
3820		 * won't be disabled via the call to init_dmars(). So disable
3821		 * them explicitly here. The PMRs were set up by tboot prior to
3822		 * calling SENTER, but the kernel is expected to reset/tear
3823		 * down the PMRs.
3824		 */
3825		if (intel_iommu_tboot_noforce) {
3826			for_each_iommu(iommu, drhd)
3827				iommu_disable_protect_mem_regions(iommu);
3828		}
3829
3830		/*
3831		 * Make sure the IOMMUs are switched off, even when we
3832		 * boot into a kexec kernel and the previous kernel left
3833		 * them enabled
3834		 */
3835		intel_disable_iommus();
3836		goto out_free_dmar;
3837	}
3838
3839	if (list_empty(&dmar_rmrr_units))
3840		pr_info("No RMRR found\n");
3841
3842	if (list_empty(&dmar_atsr_units))
3843		pr_info("No ATSR found\n");
3844
3845	if (list_empty(&dmar_satc_units))
3846		pr_info("No SATC found\n");
3847
3848	init_no_remapping_devices();
3849
3850	ret = init_dmars();
3851	if (ret) {
3852		if (force_on)
3853			panic("tboot: Failed to initialize DMARs\n");
3854		pr_err("Initialization failed\n");
3855		goto out_free_dmar;
3856	}
3857	up_write(&dmar_global_lock);
3858
3859	init_iommu_pm_ops();
3860
3861	down_read(&dmar_global_lock);
3862	for_each_active_iommu(iommu, drhd) {
3863		/*
3864		 * The flush queue implementation does not perform
3865		 * page-selective invalidations that are required for efficient
3866		 * TLB flushes in virtual environments.  The benefit of batching
3867		 * is likely to be much lower than the overhead of synchronizing
3868		 * the virtual and physical IOMMU page-tables.
3869		 */
3870		if (cap_caching_mode(iommu->cap) &&
3871		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3872			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3873			iommu_set_dma_strict();
3874		}
3875		iommu_device_sysfs_add(&iommu->iommu, NULL,
3876				       intel_iommu_groups,
3877				       "%s", iommu->name);
3878		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3879
3880		iommu_pmu_register(iommu);
3881	}
3882	up_read(&dmar_global_lock);
3883
3884	if (si_domain && !hw_pass_through)
3885		register_memory_notifier(&intel_iommu_memory_nb);
3886
3887	down_read(&dmar_global_lock);
3888	if (probe_acpi_namespace_devices())
3889		pr_warn("ACPI name space devices didn't probe correctly\n");
3890
3891	/* Finally, we enable the DMA remapping hardware. */
3892	for_each_iommu(iommu, drhd) {
3893		if (!drhd->ignored && !translation_pre_enabled(iommu))
3894			iommu_enable_translation(iommu);
3895
3896		iommu_disable_protect_mem_regions(iommu);
3897	}
3898	up_read(&dmar_global_lock);
3899
3900	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3901
3902	intel_iommu_enabled = 1;
3903
3904	return 0;
3905
3906out_free_dmar:
3907	intel_iommu_free_dmars();
3908	up_write(&dmar_global_lock);
3909	return ret;
3910}
3911
3912static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3913{
3914	struct device_domain_info *info = opaque;
3915
3916	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3917	return 0;
3918}
3919
3920/*
3921 * NB - intel-iommu lacks any sort of reference counting for the users of
3922 * dependent devices.  If multiple endpoints have intersecting dependent
3923 * devices, unbinding the driver from any one of them will possibly leave
3924 * the others unable to operate.
3925 */
3926static void domain_context_clear(struct device_domain_info *info)
3927{
	if (!dev_is_pci(info->dev)) {
		domain_context_clear_one(info, info->bus, info->devfn);
		return;
	}

3931	pci_for_each_dma_alias(to_pci_dev(info->dev),
3932			       &domain_context_clear_one_cb, info);
3933}
3934
3935static void dmar_remove_one_dev_info(struct device *dev)
3936{
3937	struct device_domain_info *info = dev_iommu_priv_get(dev);
3938	struct dmar_domain *domain = info->domain;
3939	struct intel_iommu *iommu = info->iommu;
3940	unsigned long flags;
3941
3942	if (!dev_is_real_dma_subdevice(info->dev)) {
3943		if (dev_is_pci(info->dev) && sm_supported(iommu))
3944			intel_pasid_tear_down_entry(iommu, info->dev,
3945					IOMMU_NO_PASID, false);
3946
3947		iommu_disable_pci_caps(info);
3948		domain_context_clear(info);
3949	}
3950
3951	spin_lock_irqsave(&domain->lock, flags);
3952	list_del(&info->link);
3953	spin_unlock_irqrestore(&domain->lock, flags);
3954
3955	domain_detach_iommu(domain, iommu);
3956	info->domain = NULL;
3957}
3958
3959/*
3960 * Clear the page table pointer in context or pasid table entries so that
3961 * all DMA requests without PASID from the device are blocked. If the page
3962 * table has been set, clean up the data structures.
3963 */
3964static void device_block_translation(struct device *dev)
3965{
3966	struct device_domain_info *info = dev_iommu_priv_get(dev);
3967	struct intel_iommu *iommu = info->iommu;
3968	unsigned long flags;
3969
3970	iommu_disable_pci_caps(info);
3971	if (!dev_is_real_dma_subdevice(dev)) {
3972		if (sm_supported(iommu))
3973			intel_pasid_tear_down_entry(iommu, dev,
3974						    IOMMU_NO_PASID, false);
3975		else
3976			domain_context_clear(info);
3977	}
3978
3979	if (!info->domain)
3980		return;
3981
3982	spin_lock_irqsave(&info->domain->lock, flags);
3983	list_del(&info->link);
3984	spin_unlock_irqrestore(&info->domain->lock, flags);
3985
3986	domain_detach_iommu(info->domain, iommu);
3987	info->domain = NULL;
3988}
3989
3990static int md_domain_init(struct dmar_domain *domain, int guest_width)
3991{
3992	int adjust_width;
3993
3994	/* calculate AGAW */
3995	domain->gaw = guest_width;
3996	adjust_width = guestwidth_to_adjustwidth(guest_width);
3997	domain->agaw = width_to_agaw(adjust_width);
3998
3999	domain->iommu_coherency = false;
4000	domain->iommu_superpage = 0;
4001	domain->max_addr = 0;
4002
4003	/* always allocate the top pgd */
4004	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4005	if (!domain->pgd)
4006		return -ENOMEM;
4007	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4008	return 0;
4009}
4010
4011static int blocking_domain_attach_dev(struct iommu_domain *domain,
4012				      struct device *dev)
4013{
4014	device_block_translation(dev);
4015	return 0;
4016}
4017
4018static struct iommu_domain blocking_domain = {
4019	.ops = &(const struct iommu_domain_ops) {
4020		.attach_dev	= blocking_domain_attach_dev,
4021		.free		= intel_iommu_domain_free
4022	}
4023};
4024
4025static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4026{
4027	struct dmar_domain *dmar_domain;
4028	struct iommu_domain *domain;
4029
4030	switch (type) {
4031	case IOMMU_DOMAIN_BLOCKED:
4032		return &blocking_domain;
4033	case IOMMU_DOMAIN_DMA:
4034	case IOMMU_DOMAIN_UNMANAGED:
4035		dmar_domain = alloc_domain(type);
4036		if (!dmar_domain) {
4037			pr_err("Can't allocate dmar_domain\n");
4038			return NULL;
4039		}
4040		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4041			pr_err("Domain initialization failed\n");
4042			domain_exit(dmar_domain);
4043			return NULL;
4044		}
4045
4046		domain = &dmar_domain->domain;
4047		domain->geometry.aperture_start = 0;
4048		domain->geometry.aperture_end   =
4049				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4050		domain->geometry.force_aperture = true;
4051
4052		return domain;
4053	case IOMMU_DOMAIN_IDENTITY:
4054		return &si_domain->domain;
4055	case IOMMU_DOMAIN_SVA:
4056		return intel_svm_domain_alloc();
4057	default:
4058		return NULL;
4059	}
4060
4061	return NULL;
4062}
4063
4064static void intel_iommu_domain_free(struct iommu_domain *domain)
4065{
4066	if (domain != &si_domain->domain && domain != &blocking_domain)
4067		domain_exit(to_dmar_domain(domain));
4068}
4069
4070static int prepare_domain_attach_device(struct iommu_domain *domain,
4071					struct device *dev)
4072{
4073	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4074	struct intel_iommu *iommu;
4075	int addr_width;
4076
4077	iommu = device_to_iommu(dev, NULL, NULL);
4078	if (!iommu)
4079		return -ENODEV;
4080
4081	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4082		return -EINVAL;
4083
4084	/* check if this iommu agaw is sufficient for max mapped address */
4085	addr_width = agaw_to_width(iommu->agaw);
4086	if (addr_width > cap_mgaw(iommu->cap))
4087		addr_width = cap_mgaw(iommu->cap);
4088
4089	if (dmar_domain->max_addr > (1LL << addr_width))
4090		return -EINVAL;
4091	dmar_domain->gaw = addr_width;
4092
4093	/*
4094	 * Knock out extra levels of page tables if necessary
4095	 */
4096	while (iommu->agaw < dmar_domain->agaw) {
4097		struct dma_pte *pte;
4098
4099		pte = dmar_domain->pgd;
4100		if (dma_pte_present(pte)) {
4101			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4102			free_pgtable_page(pte);
4103		}
4104		dmar_domain->agaw--;
4105	}
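	/*
	 * Example (illustrative): a domain built with agaw 2 (4-level,
	 * 48-bit) attached to an IOMMU with agaw 1 (3-level, 39-bit) runs
	 * the loop above once: the old top-level table is freed and the
	 * table referenced by its first entry becomes the new pgd, leaving
	 * a 3-level page table the hardware can walk.
	 */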
4106
4107	return 0;
4108}
4109
4110static int intel_iommu_attach_device(struct iommu_domain *domain,
4111				     struct device *dev)
4112{
4113	struct device_domain_info *info = dev_iommu_priv_get(dev);
4114	int ret;
4115
4116	if (info->domain)
4117		device_block_translation(dev);
4118
4119	ret = prepare_domain_attach_device(domain, dev);
4120	if (ret)
4121		return ret;
4122
4123	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4124}
4125
4126static int intel_iommu_map(struct iommu_domain *domain,
4127			   unsigned long iova, phys_addr_t hpa,
4128			   size_t size, int iommu_prot, gfp_t gfp)
4129{
4130	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4131	u64 max_addr;
4132	int prot = 0;
4133
4134	if (iommu_prot & IOMMU_READ)
4135		prot |= DMA_PTE_READ;
4136	if (iommu_prot & IOMMU_WRITE)
4137		prot |= DMA_PTE_WRITE;
4138	if (dmar_domain->set_pte_snp)
4139		prot |= DMA_PTE_SNP;
4140
4141	max_addr = iova + size;
4142	if (dmar_domain->max_addr < max_addr) {
4143		u64 end;
4144
4145		/* check if minimum agaw is sufficient for mapped address */
4146		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4147		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
4151			return -EFAULT;
4152		}
4153		dmar_domain->max_addr = max_addr;
4154	}
4155	/* Round up size to next multiple of PAGE_SIZE, if it and
4156	   the low bits of hpa would take us onto the next page */
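	/*
	 * Example (illustrative): hpa = 0x1ff0 with size = 0x20 straddles a
	 * 4K boundary, so aligned_nrpages() returns 2 pages even though the
	 * raw size is far below VTD_PAGE_SIZE.
	 */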
4157	size = aligned_nrpages(hpa, size);
4158	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4159				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4160}
4161
4162static int intel_iommu_map_pages(struct iommu_domain *domain,
4163				 unsigned long iova, phys_addr_t paddr,
4164				 size_t pgsize, size_t pgcount,
4165				 int prot, gfp_t gfp, size_t *mapped)
4166{
4167	unsigned long pgshift = __ffs(pgsize);
4168	size_t size = pgcount << pgshift;
4169	int ret;
4170
4171	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4172		return -EINVAL;
4173
4174	if (!IS_ALIGNED(iova | paddr, pgsize))
4175		return -EINVAL;
4176
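	/*
	 * Example (illustrative): pgsize = SZ_2M and pgcount = 4 give
	 * pgshift = 21 and size = 8M for the single intel_iommu_map()
	 * call below.
	 */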
4177	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4178	if (!ret && mapped)
4179		*mapped = size;
4180
4181	return ret;
4182}
4183
4184static size_t intel_iommu_unmap(struct iommu_domain *domain,
4185				unsigned long iova, size_t size,
4186				struct iommu_iotlb_gather *gather)
4187{
4188	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4189	unsigned long start_pfn, last_pfn;
4190	int level = 0;
4191
4192	/* Cope with horrid API which requires us to unmap more than the
4193	   size argument if it happens to be a large-page mapping. */
4194	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4195				     &level, GFP_ATOMIC)))
4196		return 0;
4197
4198	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4199		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
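	/*
	 * Example (illustrative): if the IOVA falls within a 2M superpage
	 * mapping, pfn_to_dma_pte() reports level 2 and the size is rounded
	 * up here to 4K << 9 = 2M, even if the caller asked for only 4K.
	 */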
4200
4201	start_pfn = iova >> VTD_PAGE_SHIFT;
4202	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4203
4204	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4205
4206	if (dmar_domain->max_addr == iova + size)
4207		dmar_domain->max_addr = iova;
4208
4209	/*
	 * We do not use page-selective IOTLB invalidation in the flush
	 * queue, so there is no need to track pages and sync the IOTLB here.
4212	 */
4213	if (!iommu_iotlb_gather_queued(gather))
4214		iommu_iotlb_gather_add_page(domain, gather, iova, size);
4215
4216	return size;
4217}
4218
4219static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4220				      unsigned long iova,
4221				      size_t pgsize, size_t pgcount,
4222				      struct iommu_iotlb_gather *gather)
4223{
4224	unsigned long pgshift = __ffs(pgsize);
4225	size_t size = pgcount << pgshift;
4226
4227	return intel_iommu_unmap(domain, iova, size, gather);
4228}
4229
4230static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4231				 struct iommu_iotlb_gather *gather)
4232{
4233	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4234	unsigned long iova_pfn = IOVA_PFN(gather->start);
4235	size_t size = gather->end - gather->start;
4236	struct iommu_domain_info *info;
4237	unsigned long start_pfn;
4238	unsigned long nrpages;
4239	unsigned long i;
4240
4241	nrpages = aligned_nrpages(gather->start, size);
4242	start_pfn = mm_to_dma_pfn_start(iova_pfn);
4243
4244	xa_for_each(&dmar_domain->iommu_array, i, info)
4245		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4246				      start_pfn, nrpages,
4247				      list_empty(&gather->freelist), 0);
4248
4249	put_pages_list(&gather->freelist);
4250}
4251
4252static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4253					    dma_addr_t iova)
4254{
4255	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4256	struct dma_pte *pte;
4257	int level = 0;
4258	u64 phys = 0;
4259
4260	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4261			     GFP_ATOMIC);
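	/*
	 * Example (illustrative): for an IOVA covered by a 2M superpage PTE,
	 * level comes back as 2, so the low 21 bits of the IOVA are kept
	 * below as the offset within the superpage.
	 */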
4262	if (pte && dma_pte_present(pte))
4263		phys = dma_pte_addr(pte) +
4264			(iova & (BIT_MASK(level_to_offset_bits(level) +
4265						VTD_PAGE_SHIFT) - 1));
4266
4267	return phys;
4268}
4269
4270static bool domain_support_force_snooping(struct dmar_domain *domain)
4271{
4272	struct device_domain_info *info;
4273	bool support = true;
4274
4275	assert_spin_locked(&domain->lock);
4276	list_for_each_entry(info, &domain->devices, link) {
4277		if (!ecap_sc_support(info->iommu->ecap)) {
4278			support = false;
4279			break;
4280		}
4281	}
4282
4283	return support;
4284}
4285
4286static void domain_set_force_snooping(struct dmar_domain *domain)
4287{
4288	struct device_domain_info *info;
4289
4290	assert_spin_locked(&domain->lock);
4291	/*
	 * The second-level page table supports per-PTE snoop control. The
	 * iommu_map() interface will handle this by setting the SNP bit.
4294	 */
4295	if (!domain->use_first_level) {
4296		domain->set_pte_snp = true;
4297		return;
4298	}
4299
4300	list_for_each_entry(info, &domain->devices, link)
4301		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4302						     IOMMU_NO_PASID);
4303}
4304
4305static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4306{
4307	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4308	unsigned long flags;
4309
4310	if (dmar_domain->force_snooping)
4311		return true;
4312
4313	spin_lock_irqsave(&dmar_domain->lock, flags);
4314	if (!domain_support_force_snooping(dmar_domain) ||
4315	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4316		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4317		return false;
4318	}
4319
4320	domain_set_force_snooping(dmar_domain);
4321	dmar_domain->force_snooping = true;
4322	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4323
4324	return true;
4325}
4326
4327static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4328{
4329	struct device_domain_info *info = dev_iommu_priv_get(dev);
4330
4331	switch (cap) {
4332	case IOMMU_CAP_CACHE_COHERENCY:
4333	case IOMMU_CAP_DEFERRED_FLUSH:
4334		return true;
4335	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4336		return dmar_platform_optin();
4337	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4338		return ecap_sc_support(info->iommu->ecap);
4339	default:
4340		return false;
4341	}
4342}
4343
4344static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4345{
4346	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4347	struct device_domain_info *info;
4348	struct intel_iommu *iommu;
4349	u8 bus, devfn;
4350	int ret;
4351
4352	iommu = device_to_iommu(dev, &bus, &devfn);
4353	if (!iommu || !iommu->iommu.ops)
4354		return ERR_PTR(-ENODEV);
4355
4356	info = kzalloc(sizeof(*info), GFP_KERNEL);
4357	if (!info)
4358		return ERR_PTR(-ENOMEM);
4359
4360	if (dev_is_real_dma_subdevice(dev)) {
4361		info->bus = pdev->bus->number;
4362		info->devfn = pdev->devfn;
4363		info->segment = pci_domain_nr(pdev->bus);
4364	} else {
4365		info->bus = bus;
4366		info->devfn = devfn;
4367		info->segment = iommu->segment;
4368	}
4369
4370	info->dev = dev;
4371	info->iommu = iommu;
4372	if (dev_is_pci(dev)) {
4373		if (ecap_dev_iotlb_support(iommu->ecap) &&
4374		    pci_ats_supported(pdev) &&
4375		    dmar_ats_supported(pdev, iommu)) {
4376			info->ats_supported = 1;
4377			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4378
4379			/*
			 * For an IOMMU that supports device IOTLB throttling
			 * (DIT), we assign the PFSID to the invalidation
			 * descriptor of a VF so that the IOMMU HW can gauge
			 * queue depth at the PF level. If DIT is not set, the
			 * PFSID field is treated as reserved and should be
			 * set to 0.
4385			 */
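			/*
			 * Example (illustrative): for a VF at 0000:00:02.1
			 * whose PF is 0000:00:02.0,
			 * pci_dev_id(pci_physfn(pdev)) yields the PF's
			 * requester ID, PCI_DEVID(0x00, PCI_DEVFN(2, 0)).
			 */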
4386			if (ecap_dit(iommu->ecap))
4387				info->pfsid = pci_dev_id(pci_physfn(pdev));
4388			info->ats_qdep = pci_ats_queue_depth(pdev);
4389		}
4390		if (sm_supported(iommu)) {
4391			if (pasid_supported(iommu)) {
4392				int features = pci_pasid_features(pdev);
4393
4394				if (features >= 0)
4395					info->pasid_supported = features | 1;
4396			}
4397
4398			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4399			    pci_pri_supported(pdev))
4400				info->pri_supported = 1;
4401		}
4402	}
4403
4404	dev_iommu_priv_set(dev, info);
4405
4406	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4407		ret = intel_pasid_alloc_table(dev);
4408		if (ret) {
4409			dev_err(dev, "PASID table allocation failed\n");
4410			dev_iommu_priv_set(dev, NULL);
4411			kfree(info);
4412			return ERR_PTR(ret);
4413		}
4414	}
4415
4416	return &iommu->iommu;
4417}
4418
4419static void intel_iommu_release_device(struct device *dev)
4420{
4421	struct device_domain_info *info = dev_iommu_priv_get(dev);
4422
4423	dmar_remove_one_dev_info(dev);
4424	intel_pasid_free_table(dev);
4425	dev_iommu_priv_set(dev, NULL);
4426	kfree(info);
4427	set_dma_ops(dev, NULL);
4428}
4429
4430static void intel_iommu_probe_finalize(struct device *dev)
4431{
4432	set_dma_ops(dev, NULL);
4433	iommu_setup_dma_ops(dev, 0, U64_MAX);
4434}
4435
4436static void intel_iommu_get_resv_regions(struct device *device,
4437					 struct list_head *head)
4438{
4439	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4440	struct iommu_resv_region *reg;
4441	struct dmar_rmrr_unit *rmrr;
4442	struct device *i_dev;
4443	int i;
4444
4445	rcu_read_lock();
4446	for_each_rmrr_units(rmrr) {
4447		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4448					  i, i_dev) {
4449			struct iommu_resv_region *resv;
4450			enum iommu_resv_type type;
4451			size_t length;
4452
4453			if (i_dev != device &&
4454			    !is_downstream_to_pci_bridge(device, i_dev))
4455				continue;
4456
4457			length = rmrr->end_address - rmrr->base_address + 1;
4458
4459			type = device_rmrr_is_relaxable(device) ?
4460				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4461
4462			resv = iommu_alloc_resv_region(rmrr->base_address,
4463						       length, prot, type,
4464						       GFP_ATOMIC);
4465			if (!resv)
4466				break;
4467
4468			list_add_tail(&resv->list, head);
4469		}
4470	}
4471	rcu_read_unlock();
4472
4473#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4474	if (dev_is_pci(device)) {
4475		struct pci_dev *pdev = to_pci_dev(device);
4476
4477		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4478			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4479					IOMMU_RESV_DIRECT_RELAXABLE,
4480					GFP_KERNEL);
4481			if (reg)
4482				list_add_tail(&reg->list, head);
4483		}
4484	}
4485#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4486
4487	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4488				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4489				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4490	if (!reg)
4491		return;
4492	list_add_tail(&reg->list, head);
4493}
4494
4495static struct iommu_group *intel_iommu_device_group(struct device *dev)
4496{
4497	if (dev_is_pci(dev))
4498		return pci_device_group(dev);
4499	return generic_device_group(dev);
4500}
4501
4502static int intel_iommu_enable_sva(struct device *dev)
4503{
4504	struct device_domain_info *info = dev_iommu_priv_get(dev);
4505	struct intel_iommu *iommu;
4506
4507	if (!info || dmar_disabled)
4508		return -EINVAL;
4509
4510	iommu = info->iommu;
4511	if (!iommu)
4512		return -EINVAL;
4513
4514	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4515		return -ENODEV;
4516
4517	if (!info->pasid_enabled || !info->ats_enabled)
4518		return -EINVAL;
4519
4520	/*
	 * Devices with device-specific I/O fault handling should not
	 * support PCI/PRI. The IOMMU side has no means to check whether
	 * a device implements device-specific IOPF. Therefore, the IOMMU
	 * can only assume that if the device driver enables SVA on a
	 * non-PRI device, the driver will handle IOPF in its own way.
4526	 */
4527	if (!info->pri_supported)
4528		return 0;
4529
4530	/* Devices supporting PRI should have it enabled. */
4531	if (!info->pri_enabled)
4532		return -EINVAL;
4533
4534	return 0;
4535}
4536
4537static int intel_iommu_enable_iopf(struct device *dev)
4538{
4539	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4540	struct device_domain_info *info = dev_iommu_priv_get(dev);
4541	struct intel_iommu *iommu;
4542	int ret;
4543
4544	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4545		return -ENODEV;
4546
4547	if (info->pri_enabled)
4548		return -EBUSY;
4549
4550	iommu = info->iommu;
4551	if (!iommu)
4552		return -EINVAL;
4553
4554	/* PASID is required in PRG Response Message. */
4555	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4556		return -EINVAL;
4557
4558	ret = pci_reset_pri(pdev);
4559	if (ret)
4560		return ret;
4561
4562	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4563	if (ret)
4564		return ret;
4565
4566	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4567	if (ret)
4568		goto iopf_remove_device;
4569
4570	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4571	if (ret)
4572		goto iopf_unregister_handler;
4573	info->pri_enabled = 1;
4574
4575	return 0;
4576
4577iopf_unregister_handler:
4578	iommu_unregister_device_fault_handler(dev);
4579iopf_remove_device:
4580	iopf_queue_remove_device(iommu->iopf_queue, dev);
4581
4582	return ret;
4583}
4584
4585static int intel_iommu_disable_iopf(struct device *dev)
4586{
4587	struct device_domain_info *info = dev_iommu_priv_get(dev);
4588	struct intel_iommu *iommu = info->iommu;
4589
4590	if (!info->pri_enabled)
4591		return -EINVAL;
4592
4593	/*
	 * The PCIe spec states that when the PRI enable bit is cleared, the
	 * Page Request Interface will not issue new page requests, but may
	 * still have outstanding page requests that have been transmitted
	 * or are queued for transmission. This is supposed to be called
	 * after the device driver has stopped DMA, all PASIDs have been
	 * unbound and the outstanding PRQs have been drained.
4600	 */
4601	pci_disable_pri(to_pci_dev(dev));
4602	info->pri_enabled = 0;
4603
	 * With PRI disabled and outstanding PRQs drained, unregistering the
	 * fault handler and removing the device from the iopf queue should
	 * never fail.
4607	 * fail.
4608	 */
4609	WARN_ON(iommu_unregister_device_fault_handler(dev));
4610	WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4611
4612	return 0;
4613}
4614
4615static int
4616intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4617{
4618	switch (feat) {
4619	case IOMMU_DEV_FEAT_IOPF:
4620		return intel_iommu_enable_iopf(dev);
4621
4622	case IOMMU_DEV_FEAT_SVA:
4623		return intel_iommu_enable_sva(dev);
4624
4625	default:
4626		return -ENODEV;
4627	}
4628}
4629
4630static int
4631intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4632{
4633	switch (feat) {
4634	case IOMMU_DEV_FEAT_IOPF:
4635		return intel_iommu_disable_iopf(dev);
4636
4637	case IOMMU_DEV_FEAT_SVA:
4638		return 0;
4639
4640	default:
4641		return -ENODEV;
4642	}
4643}
4644
4645static bool intel_iommu_is_attach_deferred(struct device *dev)
4646{
4647	struct device_domain_info *info = dev_iommu_priv_get(dev);
4648
4649	return translation_pre_enabled(info->iommu) && !info->domain;
4650}
4651
4652/*
 * Check that the device does not live on an external-facing PCI port that is
 * marked as untrusted. Such devices should not be allowed to apply quirks and
 * thereby bypass the IOMMU restrictions.
4656 */
4657static bool risky_device(struct pci_dev *pdev)
4658{
4659	if (pdev->untrusted) {
4660		pci_info(pdev,
4661			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4662			 pdev->vendor, pdev->device);
4663		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4664		return true;
4665	}
4666	return false;
4667}
4668
4669static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4670				       unsigned long iova, size_t size)
4671{
4672	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4673	unsigned long pages = aligned_nrpages(iova, size);
4674	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4675	struct iommu_domain_info *info;
4676	unsigned long i;
4677
4678	xa_for_each(&dmar_domain->iommu_array, i, info)
4679		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4680}
4681
4682static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4683{
4684	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4685	struct dev_pasid_info *curr, *dev_pasid = NULL;
4686	struct dmar_domain *dmar_domain;
4687	struct iommu_domain *domain;
4688	unsigned long flags;
4689
4690	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4691	if (WARN_ON_ONCE(!domain))
4692		goto out_tear_down;
4693
4694	/*
	 * The SVA implementation needs to handle its own details, such as the
	 * mm notification. Before that code is consolidated into the iommu
	 * core, let the intel sva code handle it.
4698	 */
4699	if (domain->type == IOMMU_DOMAIN_SVA) {
4700		intel_svm_remove_dev_pasid(dev, pasid);
4701		goto out_tear_down;
4702	}
4703
4704	dmar_domain = to_dmar_domain(domain);
4705	spin_lock_irqsave(&dmar_domain->lock, flags);
4706	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4707		if (curr->dev == dev && curr->pasid == pasid) {
4708			list_del(&curr->link_domain);
4709			dev_pasid = curr;
4710			break;
4711		}
4712	}
4713	WARN_ON_ONCE(!dev_pasid);
4714	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4715
4716	domain_detach_iommu(dmar_domain, iommu);
4717	kfree(dev_pasid);
4718out_tear_down:
4719	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4720	intel_drain_pasid_prq(dev, pasid);
4721}
4722
4723static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4724				     struct device *dev, ioasid_t pasid)
4725{
4726	struct device_domain_info *info = dev_iommu_priv_get(dev);
4727	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4728	struct intel_iommu *iommu = info->iommu;
4729	struct dev_pasid_info *dev_pasid;
4730	unsigned long flags;
4731	int ret;
4732
4733	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4734		return -EOPNOTSUPP;
4735
4736	if (context_copied(iommu, info->bus, info->devfn))
4737		return -EBUSY;
4738
4739	ret = prepare_domain_attach_device(domain, dev);
4740	if (ret)
4741		return ret;
4742
4743	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4744	if (!dev_pasid)
4745		return -ENOMEM;
4746
4747	ret = domain_attach_iommu(dmar_domain, iommu);
4748	if (ret)
4749		goto out_free;
4750
4751	if (domain_type_is_si(dmar_domain))
4752		ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4753						     dev, pasid);
4754	else if (dmar_domain->use_first_level)
4755		ret = domain_setup_first_level(iommu, dmar_domain,
4756					       dev, pasid);
4757	else
4758		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4759						     dev, pasid);
4760	if (ret)
4761		goto out_detach_iommu;
4762
4763	dev_pasid->dev = dev;
4764	dev_pasid->pasid = pasid;
4765	spin_lock_irqsave(&dmar_domain->lock, flags);
4766	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4767	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4768
4769	return 0;
4770out_detach_iommu:
4771	domain_detach_iommu(dmar_domain, iommu);
4772out_free:
4773	kfree(dev_pasid);
4774	return ret;
4775}
4776
4777static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4778{
4779	struct device_domain_info *info = dev_iommu_priv_get(dev);
4780	struct intel_iommu *iommu = info->iommu;
4781	struct iommu_hw_info_vtd *vtd;
4782
4783	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4784	if (!vtd)
4785		return ERR_PTR(-ENOMEM);
4786
4787	vtd->cap_reg = iommu->cap;
4788	vtd->ecap_reg = iommu->ecap;
4789	*length = sizeof(*vtd);
4790	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4791	return vtd;
4792}
4793
4794const struct iommu_ops intel_iommu_ops = {
4795	.capable		= intel_iommu_capable,
4796	.hw_info		= intel_iommu_hw_info,
4797	.domain_alloc		= intel_iommu_domain_alloc,
4798	.probe_device		= intel_iommu_probe_device,
4799	.probe_finalize		= intel_iommu_probe_finalize,
4800	.release_device		= intel_iommu_release_device,
4801	.get_resv_regions	= intel_iommu_get_resv_regions,
4802	.device_group		= intel_iommu_device_group,
4803	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4804	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4805	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4806	.def_domain_type	= device_def_domain_type,
4807	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4808	.pgsize_bitmap		= SZ_4K,
4809#ifdef CONFIG_INTEL_IOMMU_SVM
4810	.page_response		= intel_svm_page_response,
4811#endif
4812	.default_domain_ops = &(const struct iommu_domain_ops) {
4813		.attach_dev		= intel_iommu_attach_device,
4814		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4815		.map_pages		= intel_iommu_map_pages,
4816		.unmap_pages		= intel_iommu_unmap_pages,
4817		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4818		.flush_iotlb_all        = intel_flush_iotlb_all,
4819		.iotlb_sync		= intel_iommu_tlb_sync,
4820		.iova_to_phys		= intel_iommu_iova_to_phys,
4821		.free			= intel_iommu_domain_free,
4822		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4823	}
4824};
4825
4826static void quirk_iommu_igfx(struct pci_dev *dev)
4827{
4828	if (risky_device(dev))
4829		return;
4830
4831	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4832	dmar_map_gfx = 0;
4833}
4834
4835/* G4x/GM45 integrated gfx dmar support is totally busted. */
4836DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4837DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4838DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4839DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4840DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4841DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4842DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4843
4844/* Broadwell igfx malfunctions with dmar */
4845DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4846DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4847DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4848DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4849DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4850DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4851DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4852DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4853DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4854DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4855DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4856DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4857DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4858DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4859DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4860DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4861DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4862DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4863DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4864DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4865DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4866DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4867DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4868DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4869
4870static void quirk_iommu_rwbf(struct pci_dev *dev)
4871{
4872	if (risky_device(dev))
4873		return;
4874
4875	/*
4876	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4877	 * but needs it. Same seems to hold for the desktop versions.
4878	 */
4879	pci_info(dev, "Forcing write-buffer flush capability\n");
4880	rwbf_quirk = 1;
4881}
4882
4883DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4884DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4885DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4886DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4887DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4888DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4889DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4890
4891#define GGC 0x52
4892#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4893#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4894#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4895#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4896#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4897#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4898#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4899#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
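
/*
 * Example (illustrative): bits 11:8 of GGC hold the graphics/VT memory
 * select. GGC_MEMORY_SIZE_2M_VT (0x9 << 8) has GGC_MEMORY_VT_ENABLED set,
 * so a shadow GTT was allocated; GGC_MEMORY_SIZE_2M (0x3 << 8) does not,
 * and the quirk below disables the IOMMU for graphics in that case.
 */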
4900
4901static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4902{
4903	unsigned short ggc;
4904
4905	if (risky_device(dev))
4906		return;
4907
4908	if (pci_read_config_word(dev, GGC, &ggc))
4909		return;
4910
4911	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4912		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4913		dmar_map_gfx = 0;
4914	} else if (dmar_map_gfx) {
4915		/* we have to ensure the gfx device is idle before we flush */
4916		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4917		iommu_set_dma_strict();
4918	}
4919}
4920DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4921DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4922DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4923DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4924
4925static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4926{
4927	unsigned short ver;
4928
4929	if (!IS_GFX_DEVICE(dev))
4930		return;
4931
4932	ver = (dev->device >> 8) & 0xff;
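	/*
	 * Example (illustrative): a graphics device ID of 0x9a49 yields
	 * ver = 0x9a, which is in the list below, so the quirk applies.
	 */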
4933	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4934	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4935	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4936		return;
4937
4938	if (risky_device(dev))
4939		return;
4940
4941	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4942	iommu_skip_te_disable = 1;
4943}
4944DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4945
4946/* On Tylersburg chipsets, some BIOSes have been known to enable the
4947   ISOCH DMAR unit for the Azalia sound device, but not give it any
4948   TLB entries, which causes it to deadlock. Check for that.  We do
4949   this in a function called from init_dmars(), instead of in a PCI
4950   quirk, because we don't want to print the obnoxious "BIOS broken"
4951   message if VT-d is actually disabled.
4952*/
4953static void __init check_tylersburg_isoch(void)
4954{
4955	struct pci_dev *pdev;
4956	uint32_t vtisochctrl;
4957
4958	/* If there's no Azalia in the system anyway, forget it. */
4959	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4960	if (!pdev)
4961		return;
4962
4963	if (risky_device(pdev)) {
4964		pci_dev_put(pdev);
4965		return;
4966	}
4967
4968	pci_dev_put(pdev);
4969
4970	/* System Management Registers. Might be hidden, in which case
4971	   we can't do the sanity check. But that's OK, because the
4972	   known-broken BIOSes _don't_ actually hide it, so far. */
4973	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4974	if (!pdev)
4975		return;
4976
4977	if (risky_device(pdev)) {
4978		pci_dev_put(pdev);
4979		return;
4980	}
4981
4982	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4983		pci_dev_put(pdev);
4984		return;
4985	}
4986
4987	pci_dev_put(pdev);
4988
4989	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4990	if (vtisochctrl & 1)
4991		return;
4992
4993	/* Drop all bits other than the number of TLB entries */
4994	vtisochctrl &= 0x1c;
4995
4996	/* If we have the recommended number of TLB entries (16), fine. */
4997	if (vtisochctrl == 0x10)
4998		return;
4999
	/* Zero TLB entries? That is guaranteed to deadlock. */
5001	if (!vtisochctrl) {
5002		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5003		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5004		     dmi_get_system_info(DMI_BIOS_VENDOR),
5005		     dmi_get_system_info(DMI_BIOS_VERSION),
5006		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5007		iommu_identity_mapping |= IDENTMAP_AZALIA;
5008		return;
5009	}
5010
	pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
5012	       vtisochctrl);
5013}
5014
5015/*
 * Here we deal with a device TLB defect where the device may inadvertently
 * issue an ATS invalidation completion before posted writes that were
 * initiated with a translated address matching the invalidation address
 * range, violating the invalidation completion ordering.
 * Therefore, any use case that cannot guarantee DMA is stopped before unmap
 * is vulnerable to this defect. In other words, any dTLB invalidation that is
 * not initiated under the control of the trusted/privileged host device
 * driver must use this quirk.
5024 * Device TLBs are invalidated under the following six conditions:
5025 * 1. Device driver does DMA API unmap IOVA
5026 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5027 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5028 *    exit_mmap() due to crash
5029 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5030 *    VM has to free pages that were unmapped
5031 * 5. Userspace driver unmaps a DMA buffer
5032 * 6. Cache invalidation in vSVA usage (upcoming)
5033 *
 * For #1 and #2, device drivers are responsible for stopping DMA traffic
 * before unmap/unbind. For #3, the iommu driver uses the mmu_notifier to
 * invalidate the TLB the same way as a normal user unmap, which will use this
 * quirk. The dTLB invalidation after the PASID cache flush does not need this
 * quirk.
5038 *
5039 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5040 */
5041void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5042			       unsigned long address, unsigned long mask,
5043			       u32 pasid, u16 qdep)
5044{
5045	u16 sid;
5046
5047	if (likely(!info->dtlb_extra_inval))
5048		return;
5049
5050	sid = PCI_DEVID(info->bus, info->devfn);
5051	if (pasid == IOMMU_NO_PASID) {
5052		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5053				   qdep, address, mask);
5054	} else {
5055		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5056					 pasid, qdep, address, mask);
5057	}
5058}
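
/*
 * Example (illustrative): for a device at 0000:00:02.0, PCI_DEVID() above
 * yields sid = 0x0010. Callers are expected (an assumption, not shown here)
 * to invoke this helper right after the corresponding architectural
 * dev-IOTLB flush, with the same address, mask and queue depth.
 */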
5059
5060#define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
5061
5062/*
5063 * Function to submit a command to the enhanced command interface. The
5064 * valid enhanced command descriptions are defined in Table 47 of the
5065 * VT-d spec. The VT-d hardware implementation may support some but not
5066 * all commands, which can be determined by checking the Enhanced
5067 * Command Capability Register.
5068 *
5069 * Return values:
5070 *  - 0: Command successful without any error;
5071 *  - Negative: software error value;
5072 *  - Nonzero positive: failure status code defined in Table 48.
5073 */
5074int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5075{
5076	unsigned long flags;
5077	u64 res;
5078	int ret;
5079
5080	if (!cap_ecmds(iommu->cap))
5081		return -ENODEV;
5082
5083	raw_spin_lock_irqsave(&iommu->register_lock, flags);
5084
5085	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5086	if (res & DMA_ECMD_ECRSP_IP) {
5087		ret = -EBUSY;
5088		goto err;
5089	}
5090
5091	/*
	 * Unconditionally write operand B, because:
	 * - There is no side effect if an ecmd doesn't require an
	 *   operand B, but we set the register to some value anyway.
	 * - This function is not invoked in any critical path, so the
	 *   extra MMIO write raises no performance concerns.
5097	 */
5098	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5099	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5100
5101	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5102		      !(res & DMA_ECMD_ECRSP_IP), res);
5103
5104	if (res & DMA_ECMD_ECRSP_IP) {
5105		ret = -ETIMEDOUT;
5106		goto err;
5107	}
5108
5109	ret = ecmd_get_status_code(res);
5110err:
5111	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5112
5113	return ret;
5114}
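
/*
 * Hypothetical caller sketch (illustrative only): per the return convention
 * documented above, 0 means success, negative values are software errors
 * and positive values are hardware status codes from Table 48:
 *
 *	ret = ecmd_submit_sync(iommu, ecmd, oa, 0);
 *	if (ret < 0)
 *		return ret;	// -ENODEV, -EBUSY or -ETIMEDOUT
 *	if (ret)
 *		return -EIO;	// hardware reported a failure status
 */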
5115