1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 *          Ashok Raj <ashok.raj@intel.com>,
7 *          Shaohua Li <shaohua.li@intel.com>,
8 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 *          Fenghua Yu <fenghua.yu@intel.com>
10 *          Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt)     "DMAR: " fmt
14#define dev_fmt(fmt)    pr_fmt(fmt)
15
16#include <linux/init.h>
17#include <linux/bitmap.h>
18#include <linux/debugfs.h>
19#include <linux/export.h>
20#include <linux/slab.h>
21#include <linux/irq.h>
22#include <linux/interrupt.h>
23#include <linux/spinlock.h>
24#include <linux/pci.h>
25#include <linux/dmar.h>
26#include <linux/dma-map-ops.h>
27#include <linux/mempool.h>
28#include <linux/memory.h>
29#include <linux/cpu.h>
30#include <linux/timer.h>
31#include <linux/io.h>
32#include <linux/iova.h>
33#include <linux/iommu.h>
34#include <linux/intel-iommu.h>
35#include <linux/syscore_ops.h>
36#include <linux/tboot.h>
37#include <linux/dmi.h>
38#include <linux/pci-ats.h>
39#include <linux/memblock.h>
41#include <linux/dma-direct.h>
42#include <linux/crash_dump.h>
43#include <linux/numa.h>
44#include <linux/swiotlb.h>
45#include <asm/irq_remapping.h>
46#include <asm/cacheflush.h>
47#include <asm/iommu.h>
48#include <trace/events/intel_iommu.h>
49
50#include "../irq_remapping.h"
51#include "pasid.h"
52
53#define ROOT_SIZE		VTD_PAGE_SIZE
54#define CONTEXT_SIZE		VTD_PAGE_SIZE
55
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61#define IOAPIC_RANGE_START	(0xfee00000)
62#define IOAPIC_RANGE_END	(0xfeefffff)
63#define IOVA_START_ADDR		(0x1000)
64
65#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67#define MAX_AGAW_WIDTH 64
68#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
71#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
72
73/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
76				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79/* IO virtual address start page frame number */
80#define IOVA_START_PFN		(1)
81
82#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
83
84/* page table handling */
85#define LEVEL_STRIDE		(9)
86#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
87
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is a power-of-two multiple of 4KiB and
 * that the mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are a power-of-two multiple of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
104#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
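/*
 * Illustration: in the IOMMU core's pgsize_bitmap convention a set bit k
 * advertises support for a page size of 2^k bytes, so ~0xFFFUL sets every
 * bit from 12 upwards and claims support for 4KiB, 8KiB, 16KiB, ... This
 * keeps the core from splitting naturally aligned regions before handing
 * them to us.
 */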
105
106static inline int agaw_to_level(int agaw)
107{
108	return agaw + 2;
109}
110
111static inline int agaw_to_width(int agaw)
112{
113	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114}
115
116static inline int width_to_agaw(int width)
117{
118	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119}
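/*
 * For reference, with LEVEL_STRIDE == 9 the helpers above map:
 *	agaw 1 -> 39-bit address width, 3-level page table
 *	agaw 2 -> 48-bit address width, 4-level page table
 *	agaw 3 -> 57-bit address width, 5-level page table
 * e.g. width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH) == width_to_agaw(57) == 3.
 */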
120
121static inline unsigned int level_to_offset_bits(int level)
122{
123	return (level - 1) * LEVEL_STRIDE;
124}
125
126static inline int pfn_level_offset(u64 pfn, int level)
127{
128	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129}
130
131static inline u64 level_mask(int level)
132{
133	return -1ULL << level_to_offset_bits(level);
134}
135
136static inline u64 level_size(int level)
137{
138	return 1ULL << level_to_offset_bits(level);
139}
140
141static inline u64 align_to_level(u64 pfn, int level)
142{
143	return (pfn + level_size(level) - 1) & level_mask(level);
144}
145
146static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147{
148	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149}
150
/* VT-d pages must never be _larger_ than MM pages; otherwise the PFN
   conversions below cannot work. */
153static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154{
155	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156}
157
158static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159{
160	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161}
162static inline unsigned long page_to_dma_pfn(struct page *pg)
163{
164	return mm_to_dma_pfn(page_to_pfn(pg));
165}
166static inline unsigned long virt_to_dma_pfn(void *p)
167{
168	return page_to_dma_pfn(virt_to_page(p));
169}
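/*
 * Example: with 4KiB MM pages, PAGE_SHIFT == VTD_PAGE_SHIFT and the two
 * conversions above are the identity. If MM pages were larger (say a
 * hypothetical 64KiB configuration), each MM pfn would cover 16 VT-d pfns,
 * which is what the shifts account for.
 */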
170
171/* global iommu list, set NULL for ignored DMAR units */
172static struct intel_iommu **g_iommus;
173
174static void __init check_tylersburg_isoch(void);
175static int rwbf_quirk;
176
/*
 * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
 * (used when the kernel is launched with TXT).
 */
181static int force_on = 0;
182static int intel_iommu_tboot_noforce;
183static int no_platform_optin;
184
185#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187/*
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 * if marked present.
190 */
191static phys_addr_t root_entry_lctp(struct root_entry *re)
192{
193	if (!(re->lo & 1))
194		return 0;
195
196	return re->lo & VTD_PAGE_MASK;
197}
198
199/*
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 * if marked present.
202 */
203static phys_addr_t root_entry_uctp(struct root_entry *re)
204{
205	if (!(re->hi & 1))
206		return 0;
207
208	return re->hi & VTD_PAGE_MASK;
209}
210
211static inline void context_clear_pasid_enable(struct context_entry *context)
212{
213	context->lo &= ~(1ULL << 11);
214}
215
216static inline bool context_pasid_enabled(struct context_entry *context)
217{
218	return !!(context->lo & (1ULL << 11));
219}
220
221static inline void context_set_copied(struct context_entry *context)
222{
223	context->hi |= (1ull << 3);
224}
225
226static inline bool context_copied(struct context_entry *context)
227{
228	return !!(context->hi & (1ULL << 3));
229}
230
231static inline bool __context_present(struct context_entry *context)
232{
233	return (context->lo & 1);
234}
235
236bool context_present(struct context_entry *context)
237{
238	return context_pasid_enabled(context) ?
239	     __context_present(context) :
240	     __context_present(context) && !context_copied(context);
241}
242
243static inline void context_set_present(struct context_entry *context)
244{
245	context->lo |= 1;
246}
247
248static inline void context_set_fault_enable(struct context_entry *context)
249{
250	context->lo &= (((u64)-1) << 2) | 1;
251}
252
253static inline void context_set_translation_type(struct context_entry *context,
254						unsigned long value)
255{
256	context->lo &= (((u64)-1) << 4) | 3;
257	context->lo |= (value & 3) << 2;
258}
259
260static inline void context_set_address_root(struct context_entry *context,
261					    unsigned long value)
262{
263	context->lo &= ~VTD_PAGE_MASK;
264	context->lo |= value & VTD_PAGE_MASK;
265}
266
267static inline void context_set_address_width(struct context_entry *context,
268					     unsigned long value)
269{
270	context->hi |= value & 7;
271}
272
273static inline void context_set_domain_id(struct context_entry *context,
274					 unsigned long value)
275{
276	context->hi |= (value & ((1 << 16) - 1)) << 8;
277}
278
279static inline int context_domain_id(struct context_entry *c)
280{
	return (c->hi >> 8) & 0xffff;
282}
283
284static inline void context_clear_entry(struct context_entry *context)
285{
286	context->lo = 0;
287	context->hi = 0;
288}
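/*
 * The helpers above are typically combined when a legacy-mode context
 * entry is programmed. A rough, illustrative sketch (not verbatim from
 * this file):
 *
 *	context_clear_entry(context);
 *	context_set_domain_id(context, did);
 *	context_set_address_root(context, virt_to_phys(pgd));
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_address_width(context, agaw);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 *	domain_flush_cache(domain, context, sizeof(*context));
 */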
289
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping of all usable memory.
 *	2. It is attached to each IOMMU if that succeeds.
 *	3. Each IOMMU maps to this domain if that succeeds.
 */
296static struct dmar_domain *si_domain;
297static int hw_pass_through = 1;
298
299#define for_each_domain_iommu(idx, domain)			\
300	for (idx = 0; idx < g_num_of_iommus; idx++)		\
301		if (domain->iommu_refcnt[idx])
302
303struct dmar_rmrr_unit {
304	struct list_head list;		/* list of rmrr units	*/
305	struct acpi_dmar_header *hdr;	/* ACPI header		*/
306	u64	base_address;		/* reserved base address*/
307	u64	end_address;		/* reserved end address */
308	struct dmar_dev_scope *devices;	/* target devices */
309	int	devices_cnt;		/* target device count */
310};
311
312struct dmar_atsr_unit {
313	struct list_head list;		/* list of ATSR units */
314	struct acpi_dmar_header *hdr;	/* ACPI header */
315	struct dmar_dev_scope *devices;	/* target devices */
316	int devices_cnt;		/* target device count */
317	u8 include_all:1;		/* include all ports */
318};
319
320static LIST_HEAD(dmar_atsr_units);
321static LIST_HEAD(dmar_rmrr_units);
322
323#define for_each_rmrr_units(rmrr) \
324	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325
/* number of registered IOMMUs, used to size and index g_iommus */
327static int g_num_of_iommus;
328
329static void domain_exit(struct dmar_domain *domain);
330static void domain_remove_dev_info(struct dmar_domain *domain);
331static void dmar_remove_one_dev_info(struct device *dev);
332static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333static int intel_iommu_attach_device(struct iommu_domain *domain,
334				     struct device *dev);
335static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336					    dma_addr_t iova);
337
338#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339int dmar_disabled = 0;
340#else
341int dmar_disabled = 1;
342#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343
344#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345int intel_iommu_sm = 1;
346#else
347int intel_iommu_sm;
348#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349
350int intel_iommu_enabled = 0;
351EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352
353static int dmar_map_gfx = 1;
354static int dmar_forcedac;
355static int intel_iommu_strict;
356static int intel_iommu_superpage = 1;
357static int iommu_identity_mapping;
358static int intel_no_bounce;
359static int iommu_skip_te_disable;
360
361#define IDENTMAP_GFX		2
362#define IDENTMAP_AZALIA		4
363
364int intel_iommu_gfx_mapped;
365EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366
367#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368struct device_domain_info *get_domain_info(struct device *dev)
369{
370	struct device_domain_info *info;
371
372	if (!dev)
373		return NULL;
374
375	info = dev_iommu_priv_get(dev);
376	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
377		return NULL;
378
379	return info;
380}
381
382DEFINE_SPINLOCK(device_domain_lock);
383static LIST_HEAD(device_domain_list);
384
385#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&	\
386				to_pci_dev(d)->untrusted)
387
388/*
389 * Iterate over elements in device_domain_list and call the specified
390 * callback @fn against each element.
391 */
392int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393				     void *data), void *data)
394{
395	int ret = 0;
396	unsigned long flags;
397	struct device_domain_info *info;
398
399	spin_lock_irqsave(&device_domain_lock, flags);
400	list_for_each_entry(info, &device_domain_list, global) {
401		ret = fn(info, data);
402		if (ret) {
403			spin_unlock_irqrestore(&device_domain_lock, flags);
404			return ret;
405		}
406	}
407	spin_unlock_irqrestore(&device_domain_lock, flags);
408
409	return 0;
410}
411
412const struct iommu_ops intel_iommu_ops;
413
414static bool translation_pre_enabled(struct intel_iommu *iommu)
415{
416	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417}
418
419static void clear_translation_pre_enabled(struct intel_iommu *iommu)
420{
421	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422}
423
424static void init_translation_status(struct intel_iommu *iommu)
425{
426	u32 gsts;
427
428	gsts = readl(iommu->reg + DMAR_GSTS_REG);
429	if (gsts & DMA_GSTS_TES)
430		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431}
432
433static int __init intel_iommu_setup(char *str)
434{
435	if (!str)
436		return -EINVAL;
437	while (*str) {
438		if (!strncmp(str, "on", 2)) {
439			dmar_disabled = 0;
440			pr_info("IOMMU enabled\n");
441		} else if (!strncmp(str, "off", 3)) {
442			dmar_disabled = 1;
443			no_platform_optin = 1;
444			pr_info("IOMMU disabled\n");
445		} else if (!strncmp(str, "igfx_off", 8)) {
446			dmar_map_gfx = 0;
447			pr_info("Disable GFX device mapping\n");
448		} else if (!strncmp(str, "forcedac", 8)) {
449			pr_info("Forcing DAC for PCI devices\n");
450			dmar_forcedac = 1;
451		} else if (!strncmp(str, "strict", 6)) {
452			pr_info("Disable batched IOTLB flush\n");
453			intel_iommu_strict = 1;
454		} else if (!strncmp(str, "sp_off", 6)) {
455			pr_info("Disable supported super page\n");
456			intel_iommu_superpage = 0;
457		} else if (!strncmp(str, "sm_on", 5)) {
458			pr_info("Intel-IOMMU: scalable mode supported\n");
459			intel_iommu_sm = 1;
460		} else if (!strncmp(str, "tboot_noforce", 13)) {
461			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462			intel_iommu_tboot_noforce = 1;
463		} else if (!strncmp(str, "nobounce", 8)) {
464			pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
465			intel_no_bounce = 1;
466		}
467
468		str += strcspn(str, ",");
469		while (*str == ',')
470			str++;
471	}
472	return 0;
473}
474__setup("intel_iommu=", intel_iommu_setup);
475
476static struct kmem_cache *iommu_domain_cache;
477static struct kmem_cache *iommu_devinfo_cache;
478
static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
480{
481	struct dmar_domain **domains;
482	int idx = did >> 8;
483
484	domains = iommu->domains[idx];
485	if (!domains)
486		return NULL;
487
488	return domains[did & 0xff];
489}
490
491static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492			     struct dmar_domain *domain)
493{
494	struct dmar_domain **domains;
495	int idx = did >> 8;
496
497	if (!iommu->domains[idx]) {
498		size_t size = 256 * sizeof(struct dmar_domain *);
499		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500	}
501
502	domains = iommu->domains[idx];
	if (WARN_ON(!domains))
		return;

	domains[did & 0xff] = domain;
507}
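/*
 * Example of the two-level lookup used above: domain-id 0x1234 lands in
 * the on-demand page iommu->domains[0x12] (256 pointers per page) at
 * slot 0x34.
 */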
508
509void *alloc_pgtable_page(int node)
510{
511	struct page *page;
512	void *vaddr = NULL;
513
514	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515	if (page)
516		vaddr = page_address(page);
517	return vaddr;
518}
519
520void free_pgtable_page(void *vaddr)
521{
522	free_page((unsigned long)vaddr);
523}
524
525static inline void *alloc_domain_mem(void)
526{
527	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528}
529
530static void free_domain_mem(void *vaddr)
531{
532	kmem_cache_free(iommu_domain_cache, vaddr);
533}
534
static inline void *alloc_devinfo_mem(void)
536{
537	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538}
539
540static inline void free_devinfo_mem(void *vaddr)
541{
542	kmem_cache_free(iommu_devinfo_cache, vaddr);
543}
544
545static inline int domain_type_is_si(struct dmar_domain *domain)
546{
547	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
548}
549
550static inline bool domain_use_first_level(struct dmar_domain *domain)
551{
552	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
553}
554
555static inline int domain_pfn_supported(struct dmar_domain *domain,
556				       unsigned long pfn)
557{
558	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559
560	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561}
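/*
 * e.g. for a domain with a 48-bit address width (agaw 2), addr_width above
 * is 48 - 12 = 36, so any DMA pfn that fits in 36 bits is supported.
 */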
562
563/*
564 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
565 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
566 * the returned SAGAW.
567 */
568static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
569{
570	unsigned long fl_sagaw, sl_sagaw;
571
572	fl_sagaw = BIT(2) | (cap_5lp_support(iommu->cap) ? BIT(3) : 0);
573	sl_sagaw = cap_sagaw(iommu->cap);
574
575	/* Second level only. */
576	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
577		return sl_sagaw;
578
579	/* First level only. */
580	if (!ecap_slts(iommu->ecap))
581		return fl_sagaw;
582
583	return fl_sagaw & sl_sagaw;
584}
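/*
 * Worked example: if cap_sagaw() reports BIT(2) | BIT(3) (4- and 5-level
 * second-level tables) but the IOMMU lacks 5-level paging for first-level
 * translation, then in scalable mode with both FLTS and SLTS available the
 * intersection is BIT(2) only, so a 4-level (48-bit) AGAW gets chosen.
 */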
585
586static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
587{
588	unsigned long sagaw;
589	int agaw = -1;
590
591	sagaw = __iommu_calculate_sagaw(iommu);
592	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
593		if (test_bit(agaw, &sagaw))
594			break;
595	}
596
597	return agaw;
598}
599
600/*
601 * Calculate max SAGAW for each iommu.
602 */
603int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
604{
605	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
606}
607
/*
 * Calculate the AGAW for each IOMMU.
 * "SAGAW" may differ across IOMMUs, so start from a default AGAW and
 * fall back to a smaller supported AGAW for IOMMUs that don't support
 * the default one.
 */
613int iommu_calculate_agaw(struct intel_iommu *iommu)
614{
615	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
616}
617
/* This function only returns a single IOMMU in a domain */
619struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
620{
621	int iommu_id;
622
623	/* si_domain and vm domain should not get here. */
624	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
625		return NULL;
626
627	for_each_domain_iommu(iommu_id, domain)
628		break;
629
630	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
631		return NULL;
632
633	return g_iommus[iommu_id];
634}
635
636static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
637{
638	return sm_supported(iommu) ?
639			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
640}
641
642static void domain_update_iommu_coherency(struct dmar_domain *domain)
643{
644	struct dmar_drhd_unit *drhd;
645	struct intel_iommu *iommu;
646	bool found = false;
647	int i;
648
649	domain->iommu_coherency = 1;
650
651	for_each_domain_iommu(i, domain) {
652		found = true;
653		if (!iommu_paging_structure_coherency(g_iommus[i])) {
654			domain->iommu_coherency = 0;
655			break;
656		}
657	}
658	if (found)
659		return;
660
661	/* No hardware attached; use lowest common denominator */
662	rcu_read_lock();
663	for_each_active_iommu(iommu, drhd) {
664		if (!iommu_paging_structure_coherency(iommu)) {
665			domain->iommu_coherency = 0;
666			break;
667		}
668	}
669	rcu_read_unlock();
670}
671
672static int domain_update_iommu_snooping(struct intel_iommu *skip)
673{
674	struct dmar_drhd_unit *drhd;
675	struct intel_iommu *iommu;
676	int ret = 1;
677
678	rcu_read_lock();
679	for_each_active_iommu(iommu, drhd) {
680		if (iommu != skip) {
681			/*
682			 * If the hardware is operating in the scalable mode,
683			 * the snooping control is always supported since we
684			 * always set PASID-table-entry.PGSNP bit if the domain
685			 * is managed outside (UNMANAGED).
686			 */
687			if (!sm_supported(iommu) &&
688			    !ecap_sc_support(iommu->ecap)) {
689				ret = 0;
690				break;
691			}
692		}
693	}
694	rcu_read_unlock();
695
696	return ret;
697}
698
699static int domain_update_iommu_superpage(struct dmar_domain *domain,
700					 struct intel_iommu *skip)
701{
702	struct dmar_drhd_unit *drhd;
703	struct intel_iommu *iommu;
704	int mask = 0x3;
705
	if (!intel_iommu_superpage)
		return 0;
709
710	/* set iommu_superpage to the smallest common denominator */
711	rcu_read_lock();
712	for_each_active_iommu(iommu, drhd) {
713		if (iommu != skip) {
714			if (domain && domain_use_first_level(domain)) {
715				if (!cap_fl1gp_support(iommu->cap))
716					mask = 0x1;
717			} else {
718				mask &= cap_super_page_val(iommu->cap);
719			}
720
721			if (!mask)
722				break;
723		}
724	}
725	rcu_read_unlock();
726
727	return fls(mask);
728}
729
730static int domain_update_device_node(struct dmar_domain *domain)
731{
732	struct device_domain_info *info;
733	int nid = NUMA_NO_NODE;
734
735	assert_spin_locked(&device_domain_lock);
736
737	if (list_empty(&domain->devices))
738		return NUMA_NO_NODE;
739
740	list_for_each_entry(info, &domain->devices, link) {
741		if (!info->dev)
742			continue;
743
		/*
		 * There could be multiple device NUMA nodes, as devices within
		 * the same domain may sit behind different IOMMUs. There is no
		 * perfect answer in such a situation, so we use a first-come,
		 * first-served policy.
		 */
750		nid = dev_to_node(info->dev);
751		if (nid != NUMA_NO_NODE)
752			break;
753	}
754
755	return nid;
756}
757
758/* Some capabilities may be different across iommus */
759static void domain_update_iommu_cap(struct dmar_domain *domain)
760{
761	domain_update_iommu_coherency(domain);
762	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
763	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
764
	/*
	 * If the RHSA structure is missing, default to the device's NUMA
	 * node as a fallback.
	 */
769	if (domain->nid == NUMA_NO_NODE)
770		domain->nid = domain_update_device_node(domain);
771
772	/*
773	 * First-level translation restricts the input-address to a
774	 * canonical address (i.e., address bits 63:N have the same
775	 * value as address bit [N-1], where N is 48-bits with 4-level
776	 * paging and 57-bits with 5-level paging). Hence, skip bit
777	 * [N-1].
778	 */
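	/*
	 * e.g. with gaw == 48, first-level translation gets an aperture end
	 * of __DOMAIN_MAX_ADDR(47) == 2^47 - 1, while second-level keeps the
	 * full 2^48 - 1.
	 */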
779	if (domain_use_first_level(domain))
780		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
781	else
782		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
783}
784
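/*
 * In scalable mode each root entry is split in two: the low half covers
 * devfn 0x00-0x7f and the high half covers devfn 0x80-0xff, and each
 * scalable-mode context entry is 256 bits wide, i.e. two legacy-sized
 * slots (hence the "devfn *= 2" below). For example, devfn 0x83 is looked
 * up via root->hi at index (0x83 - 0x80) * 2 == 6.
 */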
785struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
786					 u8 devfn, int alloc)
787{
788	struct root_entry *root = &iommu->root_entry[bus];
789	struct context_entry *context;
790	u64 *entry;
791
792	entry = &root->lo;
793	if (sm_supported(iommu)) {
794		if (devfn >= 0x80) {
795			devfn -= 0x80;
796			entry = &root->hi;
797		}
798		devfn *= 2;
799	}
800	if (*entry & 1)
801		context = phys_to_virt(*entry & VTD_PAGE_MASK);
802	else {
803		unsigned long phy_addr;
804		if (!alloc)
805			return NULL;
806
807		context = alloc_pgtable_page(iommu->node);
808		if (!context)
809			return NULL;
810
811		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
812		phy_addr = virt_to_phys((void *)context);
813		*entry = phy_addr | 1;
814		__iommu_flush_cache(iommu, entry, sizeof(*entry));
815	}
816	return &context[devfn];
817}
818
819static bool attach_deferred(struct device *dev)
820{
821	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
822}
823
824/**
825 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
826 *				 sub-hierarchy of a candidate PCI-PCI bridge
827 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
828 * @bridge: the candidate PCI-PCI bridge
829 *
830 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
831 */
832static bool
833is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
834{
835	struct pci_dev *pdev, *pbridge;
836
837	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
838		return false;
839
840	pdev = to_pci_dev(dev);
841	pbridge = to_pci_dev(bridge);
842
843	if (pbridge->subordinate &&
844	    pbridge->subordinate->number <= pdev->bus->number &&
845	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
846		return true;
847
848	return false;
849}
850
851static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
852{
853	struct dmar_drhd_unit *drhd;
854	u32 vtbar;
855	int rc;
856
857	/* We know that this device on this chipset has its own IOMMU.
858	 * If we find it under a different IOMMU, then the BIOS is lying
859	 * to us. Hope that the IOMMU for this device is actually
860	 * disabled, and it needs no translation...
861	 */
862	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
863	if (rc) {
864		/* "can't" happen */
865		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
866		return false;
867	}
868	vtbar &= 0xffff0000;
869
	/* we know that this IOMMU should be at offset 0xa000 from vtbar */
871	drhd = dmar_find_matched_drhd_unit(pdev);
872	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
873		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
874		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
875		return true;
876	}
877
878	return false;
879}
880
881static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
882{
883	if (!iommu || iommu->drhd->ignored)
884		return true;
885
886	if (dev_is_pci(dev)) {
887		struct pci_dev *pdev = to_pci_dev(dev);
888
889		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
890		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
891		    quirk_ioat_snb_local_iommu(pdev))
892			return true;
893	}
894
895	return false;
896}
897
898struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
899{
900	struct dmar_drhd_unit *drhd = NULL;
901	struct pci_dev *pdev = NULL;
902	struct intel_iommu *iommu;
903	struct device *tmp;
904	u16 segment = 0;
905	int i;
906
907	if (!dev)
908		return NULL;
909
910	if (dev_is_pci(dev)) {
911		struct pci_dev *pf_pdev;
912
913		pdev = pci_real_dma_dev(to_pci_dev(dev));
914
915		/* VFs aren't listed in scope tables; we need to look up
916		 * the PF instead to find the IOMMU. */
917		pf_pdev = pci_physfn(pdev);
918		dev = &pf_pdev->dev;
919		segment = pci_domain_nr(pdev->bus);
920	} else if (has_acpi_companion(dev))
921		dev = &ACPI_COMPANION(dev)->dev;
922
923	rcu_read_lock();
924	for_each_iommu(iommu, drhd) {
925		if (pdev && segment != drhd->segment)
926			continue;
927
928		for_each_active_dev_scope(drhd->devices,
929					  drhd->devices_cnt, i, tmp) {
930			if (tmp == dev) {
931				/* For a VF use its original BDF# not that of the PF
932				 * which we used for the IOMMU lookup. Strictly speaking
933				 * we could do this for all PCI devices; we only need to
934				 * get the BDF# from the scope table for ACPI matches. */
935				if (pdev && pdev->is_virtfn)
936					goto got_pdev;
937
938				if (bus && devfn) {
939					*bus = drhd->devices[i].bus;
940					*devfn = drhd->devices[i].devfn;
941				}
942				goto out;
943			}
944
945			if (is_downstream_to_pci_bridge(dev, tmp))
946				goto got_pdev;
947		}
948
949		if (pdev && drhd->include_all) {
950		got_pdev:
951			if (bus && devfn) {
952				*bus = pdev->bus->number;
953				*devfn = pdev->devfn;
954			}
955			goto out;
956		}
957	}
958	iommu = NULL;
959 out:
960	if (iommu_is_dummy(iommu, dev))
961		iommu = NULL;
962
963	rcu_read_unlock();
964
965	return iommu;
966}
967
968static void domain_flush_cache(struct dmar_domain *domain,
969			       void *addr, int size)
970{
971	if (!domain->iommu_coherency)
972		clflush_cache_range(addr, size);
973}
974
975static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
976{
977	struct context_entry *context;
978	int ret = 0;
979	unsigned long flags;
980
981	spin_lock_irqsave(&iommu->lock, flags);
982	context = iommu_context_addr(iommu, bus, devfn, 0);
983	if (context)
984		ret = context_present(context);
985	spin_unlock_irqrestore(&iommu->lock, flags);
986	return ret;
987}
988
989static void free_context_table(struct intel_iommu *iommu)
990{
991	int i;
992	unsigned long flags;
993	struct context_entry *context;
994
995	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry)
		goto out;
999	for (i = 0; i < ROOT_ENTRY_NR; i++) {
1000		context = iommu_context_addr(iommu, i, 0, 0);
1001		if (context)
1002			free_pgtable_page(context);
1003
1004		if (!sm_supported(iommu))
1005			continue;
1006
1007		context = iommu_context_addr(iommu, i, 0x80, 0);
1008		if (context)
1009			free_pgtable_page(context);
1010
1011	}
1012	free_pgtable_page(iommu->root_entry);
1013	iommu->root_entry = NULL;
1014out:
1015	spin_unlock_irqrestore(&iommu->lock, flags);
1016}
1017
1018static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1019				      unsigned long pfn, int *target_level)
1020{
1021	struct dma_pte *parent, *pte;
1022	int level = agaw_to_level(domain->agaw);
1023	int offset;
1024
1025	BUG_ON(!domain->pgd);
1026
1027	if (!domain_pfn_supported(domain, pfn))
1028		/* Address beyond IOMMU's addressing capabilities. */
1029		return NULL;
1030
1031	parent = domain->pgd;
1032
1033	while (1) {
1034		void *tmp_page;
1035
1036		offset = pfn_level_offset(pfn, level);
1037		pte = &parent[offset];
1038		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1039			break;
1040		if (level == *target_level)
1041			break;
1042
1043		if (!dma_pte_present(pte)) {
1044			uint64_t pteval;
1045
1046			tmp_page = alloc_pgtable_page(domain->nid);
1047
1048			if (!tmp_page)
1049				return NULL;
1050
1051			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1052			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1053			if (domain_use_first_level(domain)) {
1054				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1055				if (domain->domain.type == IOMMU_DOMAIN_DMA)
1056					pteval |= DMA_FL_PTE_ACCESS;
1057			}
1058			if (cmpxchg64(&pte->val, 0ULL, pteval))
1059				/* Someone else set it while we were thinking; use theirs. */
1060				free_pgtable_page(tmp_page);
1061			else
1062				domain_flush_cache(domain, pte, sizeof(*pte));
1063		}
1064		if (level == 1)
1065			break;
1066
1067		parent = phys_to_virt(dma_pte_addr(pte));
1068		level--;
1069	}
1070
1071	if (!*target_level)
1072		*target_level = level;
1073
1074	return pte;
1075}
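/*
 * Usage note for the lookup above: with *target_level == 0 the walk stops
 * at the first non-present or superpage entry (or the 4KiB leaf) without
 * allocating anything, and reports the stop level back through
 * *target_level. With *target_level == 2, intermediate tables are
 * allocated as needed and the level-2 (2MiB-capable) PTE slot is returned.
 */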
1076
1077/* return address's pte at specific level */
1078static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1079					 unsigned long pfn,
1080					 int level, int *large_page)
1081{
1082	struct dma_pte *parent, *pte;
1083	int total = agaw_to_level(domain->agaw);
1084	int offset;
1085
1086	parent = domain->pgd;
1087	while (level <= total) {
1088		offset = pfn_level_offset(pfn, total);
1089		pte = &parent[offset];
1090		if (level == total)
1091			return pte;
1092
1093		if (!dma_pte_present(pte)) {
1094			*large_page = total;
1095			break;
1096		}
1097
1098		if (dma_pte_superpage(pte)) {
1099			*large_page = total;
1100			return pte;
1101		}
1102
1103		parent = phys_to_virt(dma_pte_addr(pte));
1104		total--;
1105	}
1106	return NULL;
1107}
1108
/* clear last-level (leaf) PTEs; a TLB flush should follow */
1110static void dma_pte_clear_range(struct dmar_domain *domain,
1111				unsigned long start_pfn,
1112				unsigned long last_pfn)
1113{
1114	unsigned int large_page;
1115	struct dma_pte *first_pte, *pte;
1116
1117	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1118	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1119	BUG_ON(start_pfn > last_pfn);
1120
1121	/* we don't need lock here; nobody else touches the iova range */
1122	do {
1123		large_page = 1;
1124		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1125		if (!pte) {
1126			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1127			continue;
1128		}
1129		do {
1130			dma_clear_pte(pte);
1131			start_pfn += lvl_to_nr_pages(large_page);
1132			pte++;
1133		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1134
1135		domain_flush_cache(domain, first_pte,
1136				   (void *)pte - (void *)first_pte);
1137
1138	} while (start_pfn && start_pfn <= last_pfn);
1139}
1140
1141static void dma_pte_free_level(struct dmar_domain *domain, int level,
1142			       int retain_level, struct dma_pte *pte,
1143			       unsigned long pfn, unsigned long start_pfn,
1144			       unsigned long last_pfn)
1145{
1146	pfn = max(start_pfn, pfn);
1147	pte = &pte[pfn_level_offset(pfn, level)];
1148
1149	do {
1150		unsigned long level_pfn;
1151		struct dma_pte *level_pte;
1152
1153		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1154			goto next;
1155
1156		level_pfn = pfn & level_mask(level);
1157		level_pte = phys_to_virt(dma_pte_addr(pte));
1158
1159		if (level > 2) {
1160			dma_pte_free_level(domain, level - 1, retain_level,
1161					   level_pte, level_pfn, start_pfn,
1162					   last_pfn);
1163		}
1164
1165		/*
1166		 * Free the page table if we're below the level we want to
1167		 * retain and the range covers the entire table.
1168		 */
1169		if (level < retain_level && !(start_pfn > level_pfn ||
1170		      last_pfn < level_pfn + level_size(level) - 1)) {
1171			dma_clear_pte(pte);
1172			domain_flush_cache(domain, pte, sizeof(*pte));
1173			free_pgtable_page(level_pte);
1174		}
1175next:
1176		pfn += level_size(level);
1177	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1178}
1179
1180/*
1181 * clear last level (leaf) ptes and free page table pages below the
1182 * level we wish to keep intact.
1183 */
1184static void dma_pte_free_pagetable(struct dmar_domain *domain,
1185				   unsigned long start_pfn,
1186				   unsigned long last_pfn,
1187				   int retain_level)
1188{
1189	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1190	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1191	BUG_ON(start_pfn > last_pfn);
1192
1193	dma_pte_clear_range(domain, start_pfn, last_pfn);
1194
1195	/* We don't need lock here; nobody else touches the iova range */
1196	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1197			   domain->pgd, 0, start_pfn, last_pfn);
1198
1199	/* free pgd */
1200	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1201		free_pgtable_page(domain->pgd);
1202		domain->pgd = NULL;
1203	}
1204}
1205
1206/* When a page at a given level is being unlinked from its parent, we don't
1207   need to *modify* it at all. All we need to do is make a list of all the
1208   pages which can be freed just as soon as we've flushed the IOTLB and we
1209   know the hardware page-walk will no longer touch them.
1210   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1211   be freed. */
1212static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1213					    int level, struct dma_pte *pte,
1214					    struct page *freelist)
1215{
1216	struct page *pg;
1217
1218	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1219	pg->freelist = freelist;
1220	freelist = pg;
1221
1222	if (level == 1)
1223		return freelist;
1224
1225	pte = page_address(pg);
1226	do {
1227		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1228			freelist = dma_pte_list_pagetables(domain, level - 1,
1229							   pte, freelist);
1230		pte++;
1231	} while (!first_pte_in_page(pte));
1232
1233	return freelist;
1234}
1235
1236static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1237					struct dma_pte *pte, unsigned long pfn,
1238					unsigned long start_pfn,
1239					unsigned long last_pfn,
1240					struct page *freelist)
1241{
1242	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1243
1244	pfn = max(start_pfn, pfn);
1245	pte = &pte[pfn_level_offset(pfn, level)];
1246
1247	do {
1248		unsigned long level_pfn;
1249
1250		if (!dma_pte_present(pte))
1251			goto next;
1252
1253		level_pfn = pfn & level_mask(level);
1254
1255		/* If range covers entire pagetable, free it */
1256		if (start_pfn <= level_pfn &&
1257		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
1259			   bother to clear them; we're just going to *free* them. */
1260			if (level > 1 && !dma_pte_superpage(pte))
1261				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1262
1263			dma_clear_pte(pte);
1264			if (!first_pte)
1265				first_pte = pte;
1266			last_pte = pte;
1267		} else if (level > 1) {
1268			/* Recurse down into a level that isn't *entirely* obsolete */
1269			freelist = dma_pte_clear_level(domain, level - 1,
1270						       phys_to_virt(dma_pte_addr(pte)),
1271						       level_pfn, start_pfn, last_pfn,
1272						       freelist);
1273		}
1274next:
1275		pfn += level_size(level);
1276	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1277
1278	if (first_pte)
1279		domain_flush_cache(domain, first_pte,
1280				   (void *)++last_pte - (void *)first_pte);
1281
1282	return freelist;
1283}
1284
1285/* We can't just free the pages because the IOMMU may still be walking
1286   the page tables, and may have cached the intermediate levels. The
1287   pages can only be freed after the IOTLB flush has been done. */
1288static struct page *domain_unmap(struct dmar_domain *domain,
1289				 unsigned long start_pfn,
1290				 unsigned long last_pfn)
1291{
1292	struct page *freelist;
1293
1294	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1295	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1296	BUG_ON(start_pfn > last_pfn);
1297
1298	/* we don't need lock here; nobody else touches the iova range */
1299	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1300				       domain->pgd, 0, start_pfn, last_pfn, NULL);
1301
1302	/* free pgd */
1303	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1304		struct page *pgd_page = virt_to_page(domain->pgd);
1305		pgd_page->freelist = freelist;
1306		freelist = pgd_page;
1307
1308		domain->pgd = NULL;
1309	}
1310
1311	return freelist;
1312}
1313
1314static void dma_free_pagelist(struct page *freelist)
1315{
1316	struct page *pg;
1317
1318	while ((pg = freelist)) {
1319		freelist = pg->freelist;
1320		free_pgtable_page(page_address(pg));
1321	}
1322}
1323
1324static void iova_entry_free(unsigned long data)
1325{
1326	struct page *freelist = (struct page *)data;
1327
1328	dma_free_pagelist(freelist);
1329}
1330
1331/* iommu handling */
1332static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1333{
1334	struct root_entry *root;
1335	unsigned long flags;
1336
1337	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1338	if (!root) {
1339		pr_err("Allocating root entry for %s failed\n",
1340			iommu->name);
1341		return -ENOMEM;
1342	}
1343
1344	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1345
1346	spin_lock_irqsave(&iommu->lock, flags);
1347	iommu->root_entry = root;
1348	spin_unlock_irqrestore(&iommu->lock, flags);
1349
1350	return 0;
1351}
1352
1353static void iommu_set_root_entry(struct intel_iommu *iommu)
1354{
1355	u64 addr;
1356	u32 sts;
1357	unsigned long flag;
1358
1359	addr = virt_to_phys(iommu->root_entry);
1360	if (sm_supported(iommu))
1361		addr |= DMA_RTADDR_SMT;
1362
1363	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1364	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1365
1366	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1367
	/* Make sure the hardware completes it */
1369	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1370		      readl, (sts & DMA_GSTS_RTPS), sts);
1371
1372	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1373
1374	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1375	if (sm_supported(iommu))
1376		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1377	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1378}
1379
1380void iommu_flush_write_buffer(struct intel_iommu *iommu)
1381{
1382	u32 val;
1383	unsigned long flag;
1384
1385	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1386		return;
1387
1388	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1389	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1390
	/* Make sure the hardware completes it */
1392	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1393		      readl, (!(val & DMA_GSTS_WBFS)), val);
1394
1395	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1396}
1397
/* The return value determines whether we need a write buffer flush */
1399static void __iommu_flush_context(struct intel_iommu *iommu,
1400				  u16 did, u16 source_id, u8 function_mask,
1401				  u64 type)
1402{
1403	u64 val = 0;
1404	unsigned long flag;
1405
1406	switch (type) {
1407	case DMA_CCMD_GLOBAL_INVL:
1408		val = DMA_CCMD_GLOBAL_INVL;
1409		break;
1410	case DMA_CCMD_DOMAIN_INVL:
1411		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1412		break;
1413	case DMA_CCMD_DEVICE_INVL:
1414		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1415			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1416		break;
1417	default:
1418		BUG();
1419	}
1420	val |= DMA_CCMD_ICC;
1421
1422	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1423	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1424
	/* Make sure the hardware completes it */
1426	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1427		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1428
1429	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1430}
1431
/* The return value determines whether we need a write buffer flush */
1433static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1434				u64 addr, unsigned int size_order, u64 type)
1435{
1436	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1437	u64 val = 0, val_iva = 0;
1438	unsigned long flag;
1439
1440	switch (type) {
1441	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
1443		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1444		break;
1445	case DMA_TLB_DSI_FLUSH:
1446		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1447		break;
1448	case DMA_TLB_PSI_FLUSH:
1449		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1450		/* IH bit is passed in as part of address */
1451		val_iva = size_order | addr;
1452		break;
1453	default:
1454		BUG();
1455	}
1456	/* Note: set drain read/write */
1457#if 0
1458	/*
	 * This is probably just to be extra safe; it looks like we can
	 * ignore it without any impact.
1461	 */
1462	if (cap_read_drain(iommu->cap))
1463		val |= DMA_TLB_READ_DRAIN;
1464#endif
1465	if (cap_write_drain(iommu->cap))
1466		val |= DMA_TLB_WRITE_DRAIN;
1467
1468	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1469	/* Note: Only uses first TLB reg currently */
1470	if (val_iva)
1471		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1472	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1473
	/* Make sure the hardware completes it */
1475	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1476		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1477
1478	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1479
1480	/* check IOTLB invalidation granularity */
1481	if (DMA_TLB_IAIG(val) == 0)
1482		pr_err("Flush IOTLB failed\n");
1483	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1484		pr_debug("TLB flush request %Lx, actual %Lx\n",
1485			(unsigned long long)DMA_TLB_IIRG(type),
1486			(unsigned long long)DMA_TLB_IAIG(val));
1487}
1488
1489static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1491			 u8 bus, u8 devfn)
1492{
1493	struct device_domain_info *info;
1494
1495	assert_spin_locked(&device_domain_lock);
1496
1497	if (!iommu->qi)
1498		return NULL;
1499
1500	list_for_each_entry(info, &domain->devices, link)
1501		if (info->iommu == iommu && info->bus == bus &&
1502		    info->devfn == devfn) {
1503			if (info->ats_supported && info->dev)
1504				return info;
1505			break;
1506		}
1507
1508	return NULL;
1509}
1510
1511static void domain_update_iotlb(struct dmar_domain *domain)
1512{
1513	struct device_domain_info *info;
1514	bool has_iotlb_device = false;
1515
1516	assert_spin_locked(&device_domain_lock);
1517
1518	list_for_each_entry(info, &domain->devices, link) {
1519		struct pci_dev *pdev;
1520
1521		if (!info->dev || !dev_is_pci(info->dev))
1522			continue;
1523
1524		pdev = to_pci_dev(info->dev);
1525		if (pdev->ats_enabled) {
1526			has_iotlb_device = true;
1527			break;
1528		}
1529	}
1530
1531	domain->has_iotlb_device = has_iotlb_device;
1532}
1533
1534static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1535{
1536	struct pci_dev *pdev;
1537
1538	assert_spin_locked(&device_domain_lock);
1539
1540	if (!info || !dev_is_pci(info->dev))
1541		return;
1542
1543	pdev = to_pci_dev(info->dev);
	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
	 * a PFSID to the invalidation descriptor of a VF so that the IOMMU HW
	 * can gauge queue depth at the PF level. If DIT is not supported, the
	 * PFSID field is treated as reserved and should be set to 0.
	 */
1549	if (!ecap_dit(info->iommu->ecap))
1550		info->pfsid = 0;
1551	else {
1552		struct pci_dev *pf_pdev;
1553
		/* pci_physfn() returns pdev itself if the device is not a VF */
1555		pf_pdev = pci_physfn(pdev);
1556		info->pfsid = pci_dev_id(pf_pdev);
1557	}
1558
1559#ifdef CONFIG_INTEL_IOMMU_SVM
1560	/* The PCIe spec, in its wisdom, declares that the behaviour of
1561	   the device if you enable PASID support after ATS support is
1562	   undefined. So always enable PASID support on devices which
1563	   have it, even if we can't yet know if we're ever going to
1564	   use it. */
1565	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1566		info->pasid_enabled = 1;
1567
1568	if (info->pri_supported &&
1569	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1570	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1571		info->pri_enabled = 1;
1572#endif
1573	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1574	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1575		info->ats_enabled = 1;
1576		domain_update_iotlb(info->domain);
1577		info->ats_qdep = pci_ats_queue_depth(pdev);
1578	}
1579}
1580
1581static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1582{
1583	struct pci_dev *pdev;
1584
1585	assert_spin_locked(&device_domain_lock);
1586
1587	if (!dev_is_pci(info->dev))
1588		return;
1589
1590	pdev = to_pci_dev(info->dev);
1591
1592	if (info->ats_enabled) {
1593		pci_disable_ats(pdev);
1594		info->ats_enabled = 0;
1595		domain_update_iotlb(info->domain);
1596	}
1597#ifdef CONFIG_INTEL_IOMMU_SVM
1598	if (info->pri_enabled) {
1599		pci_disable_pri(pdev);
1600		info->pri_enabled = 0;
1601	}
1602	if (info->pasid_enabled) {
1603		pci_disable_pasid(pdev);
1604		info->pasid_enabled = 0;
1605	}
1606#endif
1607}
1608
1609static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1610				  u64 addr, unsigned mask)
1611{
1612	u16 sid, qdep;
1613	unsigned long flags;
1614	struct device_domain_info *info;
1615
1616	if (!domain->has_iotlb_device)
1617		return;
1618
1619	spin_lock_irqsave(&device_domain_lock, flags);
1620	list_for_each_entry(info, &domain->devices, link) {
1621		if (!info->ats_enabled)
1622			continue;
1623
1624		sid = info->bus << 8 | info->devfn;
1625		qdep = info->ats_qdep;
1626		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1627				qdep, addr, mask);
1628	}
1629	spin_unlock_irqrestore(&device_domain_lock, flags);
1630}
1631
1632static void domain_flush_piotlb(struct intel_iommu *iommu,
1633				struct dmar_domain *domain,
1634				u64 addr, unsigned long npages, bool ih)
1635{
1636	u16 did = domain->iommu_did[iommu->seq_id];
1637
1638	if (domain->default_pasid)
1639		qi_flush_piotlb(iommu, did, domain->default_pasid,
1640				addr, npages, ih);
1641
1642	if (!list_empty(&domain->devices))
1643		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1644}
1645
1646static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1647				  struct dmar_domain *domain,
1648				  unsigned long pfn, unsigned int pages,
1649				  int ih, int map)
1650{
1651	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1652	unsigned int mask = ilog2(aligned_pages);
1653	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1654	u16 did = domain->iommu_did[iommu->seq_id];
1655
1656	BUG_ON(pages == 0);
1657
1658	if (ih)
1659		ih = 1 << 6;
1660
1661	if (domain_use_first_level(domain)) {
1662		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1663	} else {
1664		unsigned long bitmask = aligned_pages - 1;
1665
1666		/*
1667		 * PSI masks the low order bits of the base address. If the
1668		 * address isn't aligned to the mask, then compute a mask value
1669		 * needed to ensure the target range is flushed.
1670		 */
1671		if (unlikely(bitmask & pfn)) {
1672			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1673
1674			/*
1675			 * Since end_pfn <= pfn + bitmask, the only way bits
1676			 * higher than bitmask can differ in pfn and end_pfn is
1677			 * by carrying. This means after masking out bitmask,
1678			 * high bits starting with the first set bit in
1679			 * shared_bits are all equal in both pfn and end_pfn.
1680			 */
1681			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1682			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1683		}
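		/*
		 * Worked example: pfn == 0x3ff and pages == 2 give bitmask == 1
		 * and end_pfn == 0x400. pfn ^ end_pfn == 0x7ff, so the lowest
		 * set bit of shared_bits is bit 11 and mask becomes 11: one
		 * 2^11-page aligned flush covers both pages.
		 */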
1684
1685		/*
1686		 * Fallback to domain selective flush if no PSI support or
1687		 * the size is too big.
1688		 */
1689		if (!cap_pgsel_inv(iommu->cap) ||
1690		    mask > cap_max_amask_val(iommu->cap))
1691			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1692							DMA_TLB_DSI_FLUSH);
1693		else
1694			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1695							DMA_TLB_PSI_FLUSH);
1696	}
1697
	/*
	 * In caching mode, changing a page mapping from non-present to present
	 * requires a flush. However, the device IOTLB doesn't need to be
	 * flushed in this case.
	 */
1702	if (!cap_caching_mode(iommu->cap) || !map)
1703		iommu_flush_dev_iotlb(domain, addr, mask);
1704}
1705
1706/* Notification for newly created mappings */
1707static inline void __mapping_notify_one(struct intel_iommu *iommu,
1708					struct dmar_domain *domain,
1709					unsigned long pfn, unsigned int pages)
1710{
	/*
	 * It's a non-present to present mapping. Only flush if caching mode
	 * is enabled and second-level translation is in use.
	 */
1715	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1716		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1717	else
1718		iommu_flush_write_buffer(iommu);
1719}
1720
1721static void iommu_flush_iova(struct iova_domain *iovad)
1722{
1723	struct dmar_domain *domain;
1724	int idx;
1725
1726	domain = container_of(iovad, struct dmar_domain, iovad);
1727
1728	for_each_domain_iommu(idx, domain) {
1729		struct intel_iommu *iommu = g_iommus[idx];
1730		u16 did = domain->iommu_did[iommu->seq_id];
1731
1732		if (domain_use_first_level(domain))
1733			domain_flush_piotlb(iommu, domain, 0, -1, 0);
1734		else
1735			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1736						 DMA_TLB_DSI_FLUSH);
1737
1738		if (!cap_caching_mode(iommu->cap))
1739			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1740					      0, MAX_AGAW_PFN_WIDTH);
1741	}
1742}
1743
1744static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1745{
1746	u32 pmen;
1747	unsigned long flags;
1748
1749	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1750		return;
1751
1752	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1753	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1754	pmen &= ~DMA_PMEN_EPM;
1755	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1756
1757	/* wait for the protected region status bit to clear */
1758	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1759		readl, !(pmen & DMA_PMEN_PRS), pmen);
1760
1761	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1762}
1763
1764static void iommu_enable_translation(struct intel_iommu *iommu)
1765{
1766	u32 sts;
1767	unsigned long flags;
1768
1769	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1770	iommu->gcmd |= DMA_GCMD_TE;
1771	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1772
	/* Make sure the hardware completes it */
1774	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1775		      readl, (sts & DMA_GSTS_TES), sts);
1776
1777	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1778}
1779
1780static void iommu_disable_translation(struct intel_iommu *iommu)
1781{
1782	u32 sts;
1783	unsigned long flag;
1784
1785	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1786	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1787		return;
1788
1789	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1790	iommu->gcmd &= ~DMA_GCMD_TE;
1791	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1792
	/* Make sure the hardware completes it */
1794	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1795		      readl, (!(sts & DMA_GSTS_TES)), sts);
1796
1797	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1798}
1799
1800static int iommu_init_domains(struct intel_iommu *iommu)
1801{
1802	u32 ndomains, nlongs;
1803	size_t size;
1804
1805	ndomains = cap_ndoms(iommu->cap);
1806	pr_debug("%s: Number of Domains supported <%d>\n",
1807		 iommu->name, ndomains);
1808	nlongs = BITS_TO_LONGS(ndomains);
1809
1810	spin_lock_init(&iommu->lock);
1811
1812	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1813	if (!iommu->domain_ids) {
1814		pr_err("%s: Allocating domain id array failed\n",
1815		       iommu->name);
1816		return -ENOMEM;
1817	}
1818
1819	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1820	iommu->domains = kzalloc(size, GFP_KERNEL);
1821
1822	if (iommu->domains) {
1823		size = 256 * sizeof(struct dmar_domain *);
1824		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1825	}
1826
1827	if (!iommu->domains || !iommu->domains[0]) {
1828		pr_err("%s: Allocating domain array failed\n",
1829		       iommu->name);
1830		kfree(iommu->domain_ids);
1831		kfree(iommu->domains);
1832		iommu->domain_ids = NULL;
1833		iommu->domains    = NULL;
1834		return -ENOMEM;
1835	}
1836
1837	/*
1838	 * If Caching mode is set, then invalid translations are tagged
1839	 * with domain-id 0, hence we need to pre-allocate it. We also
1840	 * use domain-id 0 as a marker for non-allocated domain-id, so
1841	 * make sure it is not used for a real domain.
1842	 */
1843	set_bit(0, iommu->domain_ids);
1844
	/*
	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
	 * entry for first-level or pass-through translation modes be
	 * programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
1852	if (sm_supported(iommu))
1853		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1854
1855	return 0;
1856}
1857
1858static void disable_dmar_iommu(struct intel_iommu *iommu)
1859{
1860	struct device_domain_info *info, *tmp;
1861	unsigned long flags;
1862
1863	if (!iommu->domains || !iommu->domain_ids)
1864		return;
1865
1866	spin_lock_irqsave(&device_domain_lock, flags);
1867	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1868		if (info->iommu != iommu)
1869			continue;
1870
1871		if (!info->dev || !info->domain)
1872			continue;
1873
1874		__dmar_remove_one_dev_info(info);
1875	}
1876	spin_unlock_irqrestore(&device_domain_lock, flags);
1877
1878	if (iommu->gcmd & DMA_GCMD_TE)
1879		iommu_disable_translation(iommu);
1880}
1881
1882static void free_dmar_iommu(struct intel_iommu *iommu)
1883{
1884	if ((iommu->domains) && (iommu->domain_ids)) {
1885		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1886		int i;
1887
1888		for (i = 0; i < elems; i++)
1889			kfree(iommu->domains[i]);
1890		kfree(iommu->domains);
1891		kfree(iommu->domain_ids);
1892		iommu->domains = NULL;
1893		iommu->domain_ids = NULL;
1894	}
1895
1896	g_iommus[iommu->seq_id] = NULL;
1897
1898	/* free context mapping */
1899	free_context_table(iommu);
1900
1901#ifdef CONFIG_INTEL_IOMMU_SVM
1902	if (pasid_supported(iommu)) {
1903		if (ecap_prs(iommu->ecap))
1904			intel_svm_finish_prq(iommu);
1905	}
1906	if (vccap_pasid(iommu->vccap))
1907		ioasid_unregister_allocator(&iommu->pasid_allocator);
1908
1909#endif
1910}
1911
1912/*
1913 * Check and return whether first level is used by default for
1914 * DMA translation.
1915 */
1916static bool first_level_by_default(void)
1917{
1918	struct dmar_drhd_unit *drhd;
1919	struct intel_iommu *iommu;
1920	static int first_level_support = -1;
1921
1922	if (likely(first_level_support != -1))
1923		return first_level_support;
1924
1925	first_level_support = 1;
1926
1927	rcu_read_lock();
1928	for_each_active_iommu(iommu, drhd) {
1929		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1930			first_level_support = 0;
1931			break;
1932		}
1933	}
1934	rcu_read_unlock();
1935
1936	return first_level_support;
1937}
1938
1939static struct dmar_domain *alloc_domain(int flags)
1940{
1941	struct dmar_domain *domain;
1942
1943	domain = alloc_domain_mem();
1944	if (!domain)
1945		return NULL;
1946
1947	memset(domain, 0, sizeof(*domain));
1948	domain->nid = NUMA_NO_NODE;
1949	domain->flags = flags;
1950	if (first_level_by_default())
1951		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1952	domain->has_iotlb_device = false;
1953	INIT_LIST_HEAD(&domain->devices);
1954
1955	return domain;
1956}
1957
1958/* Must be called with device_domain_lock and iommu->lock held */
1959static int domain_attach_iommu(struct dmar_domain *domain,
1960			       struct intel_iommu *iommu)
1961{
1962	unsigned long ndomains;
1963	int num;
1964
1965	assert_spin_locked(&device_domain_lock);
1966	assert_spin_locked(&iommu->lock);
1967
1968	domain->iommu_refcnt[iommu->seq_id] += 1;
1969	domain->iommu_count += 1;
1970	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1971		ndomains = cap_ndoms(iommu->cap);
1972		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1973
1974		if (num >= ndomains) {
1975			pr_err("%s: No free domain ids\n", iommu->name);
1976			domain->iommu_refcnt[iommu->seq_id] -= 1;
1977			domain->iommu_count -= 1;
1978			return -ENOSPC;
1979		}
1980
1981		set_bit(num, iommu->domain_ids);
1982		set_iommu_domain(iommu, num, domain);
1983
1984		domain->iommu_did[iommu->seq_id] = num;
1985		domain->nid			 = iommu->node;
1986
1987		domain_update_iommu_cap(domain);
1988	}
1989
1990	return 0;
1991}
1992
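/*
 * Drop the domain's attach reference for @iommu. When the last
 * reference on this IOMMU goes away, release the domain id that was
 * allocated from it. Returns the domain's remaining attach count.
 * Must be called with device_domain_lock and iommu->lock held.
 */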
1993static int domain_detach_iommu(struct dmar_domain *domain,
1994			       struct intel_iommu *iommu)
1995{
1996	int num, count;
1997
1998	assert_spin_locked(&device_domain_lock);
1999	assert_spin_locked(&iommu->lock);
2000
2001	domain->iommu_refcnt[iommu->seq_id] -= 1;
2002	count = --domain->iommu_count;
2003	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
2004		num = domain->iommu_did[iommu->seq_id];
2005		clear_bit(num, iommu->domain_ids);
2006		set_iommu_domain(iommu, num, NULL);
2007
2008		domain_update_iommu_cap(domain);
2009		domain->iommu_did[iommu->seq_id] = 0;
2010	}
2011
2012	return count;
2013}
2014
2015static struct iova_domain reserved_iova_list;
2016static struct lock_class_key reserved_rbtree_key;
2017
2018static int dmar_init_reserved_ranges(void)
2019{
2020	struct pci_dev *pdev = NULL;
2021	struct iova *iova;
2022	int i;
2023
2024	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
2025
2026	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
2027		&reserved_rbtree_key);
2028
2029	/* IOAPIC ranges shouldn't be accessed by DMA */
2030	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
2031		IOVA_PFN(IOAPIC_RANGE_END));
2032	if (!iova) {
2033		pr_err("Reserve IOAPIC range failed\n");
2034		return -ENODEV;
2035	}
2036
2037	/* Reserve all PCI MMIO to avoid peer-to-peer access */
2038	for_each_pci_dev(pdev) {
2039		struct resource *r;
2040
2041		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
2042			r = &pdev->resource[i];
2043			if (!r->flags || !(r->flags & IORESOURCE_MEM))
2044				continue;
2045			iova = reserve_iova(&reserved_iova_list,
2046					    IOVA_PFN(r->start),
2047					    IOVA_PFN(r->end));
2048			if (!iova) {
2049				pci_err(pdev, "Reserve iova for %pR failed\n", r);
2050				return -ENODEV;
2051			}
2052		}
2053	}
2054	return 0;
2055}
2056
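/*
 * Round a guest address width up to the nearest adjusted guest address
 * width the page tables can express: 12 bits of page offset plus a
 * multiple of 9 bits per level, capped at 64. For example, a guest
 * width of 40 bits is rounded up to an adjusted width of 48 bits.
 */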
2057static inline int guestwidth_to_adjustwidth(int gaw)
2058{
2059	int agaw;
2060	int r = (gaw - 12) % 9;
2061
2062	if (r == 0)
2063		agaw = gaw;
2064	else
2065		agaw = gaw + 9 - r;
2066	if (agaw > 64)
2067		agaw = 64;
2068	return agaw;
2069}
2070
2071static void domain_exit(struct dmar_domain *domain)
2072{
2073
2074	/* Remove associated devices and clear attached or cached domains */
2075	domain_remove_dev_info(domain);
2076
2077	/* destroy iovas */
2078	if (domain->domain.type == IOMMU_DOMAIN_DMA)
2079		put_iova_domain(&domain->iovad);
2080
2081	if (domain->pgd) {
2082		struct page *freelist;
2083
2084		freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2085		dma_free_pagelist(freelist);
2086	}
2087
2088	free_domain_mem(domain);
2089}
2090
2091/*
2092 * Get the PASID directory size for scalable mode context entry.
2093 * Value of X in the PDTS field of a scalable mode context entry
2094 * indicates PASID directory with 2^(X + 7) entries.
2095 */
2096static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2097{
2098	int pds, max_pde;
2099
2100	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2101	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2102	if (pds < 7)
2103		return 0;
2104
2105	return pds - 7;
2106}
2107
2108/*
2109 * Set the RID_PASID field of a scalable mode context entry. The
2110 * IOMMU hardware will use the PASID value set in this field for
2111 * translating DMA requests that arrive without a PASID.
2112 */
2113static inline void
2114context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2115{
2116	context->hi |= pasid & ((1 << 20) - 1);
2117}
2118
2119/*
2120 * Set the DTE(Device-TLB Enable) field of a scalable mode context
2121 * entry.
2122 */
2123static inline void context_set_sm_dte(struct context_entry *context)
2124{
2125	context->lo |= (1 << 2);
2126}
2127
2128/*
2129 * Set the PRE(Page Request Enable) field of a scalable mode context
2130 * entry.
2131 */
2132static inline void context_set_sm_pre(struct context_entry *context)
2133{
2134	context->lo |= (1 << 4);
2135}
2136
2137/* Convert value to context PASID directory size field coding. */
2138#define context_pdts(pds)	(((pds) & 0x7) << 9)
2139
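/*
 * Program the context entry for (@bus, @devfn) on @iommu so that it
 * refers to @domain: the PASID directory in scalable mode, or the
 * second-level page table (or pass-through) in legacy mode. Flush the
 * context cache and IOTLB as required by the caching mode afterwards.
 */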
2140static int domain_context_mapping_one(struct dmar_domain *domain,
2141				      struct intel_iommu *iommu,
2142				      struct pasid_table *table,
2143				      u8 bus, u8 devfn)
2144{
2145	u16 did = domain->iommu_did[iommu->seq_id];
2146	int translation = CONTEXT_TT_MULTI_LEVEL;
2147	struct device_domain_info *info = NULL;
2148	struct context_entry *context;
2149	unsigned long flags;
2150	int ret;
2151
2152	WARN_ON(did == 0);
2153
2154	if (hw_pass_through && domain_type_is_si(domain))
2155		translation = CONTEXT_TT_PASS_THROUGH;
2156
2157	pr_debug("Set context mapping for %02x:%02x.%d\n",
2158		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2159
2160	BUG_ON(!domain->pgd);
2161
2162	spin_lock_irqsave(&device_domain_lock, flags);
2163	spin_lock(&iommu->lock);
2164
2165	ret = -ENOMEM;
2166	context = iommu_context_addr(iommu, bus, devfn, 1);
2167	if (!context)
2168		goto out_unlock;
2169
2170	ret = 0;
2171	if (context_present(context))
2172		goto out_unlock;
2173
2174	/*
2175	 * For kdump cases, old valid entries may be cached due to
2176	 * in-flight DMA and the copied page tables, but they are never
2177	 * explicitly unmapped, so we need an explicit cache flush for
2178	 * the newly-mapped device. For kdump, at this point, the device
2179	 * is expected to have finished its reset during driver probe, so
2180	 * no in-flight DMA will exist and nothing further needs to be
2181	 * done here.
2182	 */
2183	if (context_copied(context)) {
2184		u16 did_old = context_domain_id(context);
2185
2186		if (did_old < cap_ndoms(iommu->cap)) {
2187			iommu->flush.flush_context(iommu, did_old,
2188						   (((u16)bus) << 8) | devfn,
2189						   DMA_CCMD_MASK_NOBIT,
2190						   DMA_CCMD_DEVICE_INVL);
2191			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2192						 DMA_TLB_DSI_FLUSH);
2193		}
2194	}
2195
2196	context_clear_entry(context);
2197
2198	if (sm_supported(iommu)) {
2199		unsigned long pds;
2200
2201		WARN_ON(!table);
2202
2203		/* Setup the PASID DIR pointer: */
2204		pds = context_get_sm_pds(table);
2205		context->lo = (u64)virt_to_phys(table->table) |
2206				context_pdts(pds);
2207
2208		/* Setup the RID_PASID field: */
2209		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2210
2211		/*
2212		 * Setup the Device-TLB enable bit and Page request
2213		 * Enable bit:
2214		 */
2215		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2216		if (info && info->ats_supported)
2217			context_set_sm_dte(context);
2218		if (info && info->pri_supported)
2219			context_set_sm_pre(context);
2220	} else {
2221		struct dma_pte *pgd = domain->pgd;
2222		int agaw;
2223
2224		context_set_domain_id(context, did);
2225
2226		if (translation != CONTEXT_TT_PASS_THROUGH) {
2227			/*
2228			 * Skip top levels of page tables for iommu which has
2229			 * less agaw than default. Unnecessary for PT mode.
2230			 */
2231			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2232				ret = -ENOMEM;
2233				pgd = phys_to_virt(dma_pte_addr(pgd));
2234				if (!dma_pte_present(pgd))
2235					goto out_unlock;
2236			}
2237
2238			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2239			if (info && info->ats_supported)
2240				translation = CONTEXT_TT_DEV_IOTLB;
2241			else
2242				translation = CONTEXT_TT_MULTI_LEVEL;
2243
2244			context_set_address_root(context, virt_to_phys(pgd));
2245			context_set_address_width(context, agaw);
2246		} else {
2247			/*
2248			 * In pass-through mode, AW must be programmed to
2249			 * indicate the largest AGAW value supported by the
2250			 * hardware, and ASR is ignored by the hardware.
2251			 */
2252			context_set_address_width(context, iommu->msagaw);
2253		}
2254
2255		context_set_translation_type(context, translation);
2256	}
2257
2258	context_set_fault_enable(context);
2259	context_set_present(context);
2260	if (!ecap_coherent(iommu->ecap))
2261		clflush_cache_range(context, sizeof(*context));
2262
2263	/*
2264	 * It's a non-present to present mapping. If the hardware doesn't
2265	 * cache non-present entries, we only need to flush the write-buffer.
2266	 * If the hardware _does_ cache non-present entries, then it does so
2267	 * in the special domain #0, which we have to flush:
2268	 */
2269	if (cap_caching_mode(iommu->cap)) {
2270		iommu->flush.flush_context(iommu, 0,
2271					   (((u16)bus) << 8) | devfn,
2272					   DMA_CCMD_MASK_NOBIT,
2273					   DMA_CCMD_DEVICE_INVL);
2274		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2275	} else {
2276		iommu_flush_write_buffer(iommu);
2277	}
2278	iommu_enable_dev_iotlb(info);
2279
2280	ret = 0;
2281
2282out_unlock:
2283	spin_unlock(&iommu->lock);
2284	spin_unlock_irqrestore(&device_domain_lock, flags);
2285
2286	return ret;
2287}
2288
2289struct domain_context_mapping_data {
2290	struct dmar_domain *domain;
2291	struct intel_iommu *iommu;
2292	struct pasid_table *table;
2293};
2294
2295static int domain_context_mapping_cb(struct pci_dev *pdev,
2296				     u16 alias, void *opaque)
2297{
2298	struct domain_context_mapping_data *data = opaque;
2299
2300	return domain_context_mapping_one(data->domain, data->iommu,
2301					  data->table, PCI_BUS_NUM(alias),
2302					  alias & 0xff);
2303}
2304
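/*
 * Set up context entries for @dev and all of its DMA aliases so that
 * requests from any alias are translated through @domain.
 */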
2305static int
2306domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2307{
2308	struct domain_context_mapping_data data;
2309	struct pasid_table *table;
2310	struct intel_iommu *iommu;
2311	u8 bus, devfn;
2312
2313	iommu = device_to_iommu(dev, &bus, &devfn);
2314	if (!iommu)
2315		return -ENODEV;
2316
2317	table = intel_pasid_get_table(dev);
2318
2319	if (!dev_is_pci(dev))
2320		return domain_context_mapping_one(domain, iommu, table,
2321						  bus, devfn);
2322
2323	data.domain = domain;
2324	data.iommu = iommu;
2325	data.table = table;
2326
2327	return pci_for_each_dma_alias(to_pci_dev(dev),
2328				      &domain_context_mapping_cb, &data);
2329}
2330
2331static int domain_context_mapped_cb(struct pci_dev *pdev,
2332				    u16 alias, void *opaque)
2333{
2334	struct intel_iommu *iommu = opaque;
2335
2336	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2337}
2338
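/*
 * Check whether @dev, including all of its DMA aliases, already has a
 * present context entry programmed.
 */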
2339static int domain_context_mapped(struct device *dev)
2340{
2341	struct intel_iommu *iommu;
2342	u8 bus, devfn;
2343
2344	iommu = device_to_iommu(dev, &bus, &devfn);
2345	if (!iommu)
2346		return -ENODEV;
2347
2348	if (!dev_is_pci(dev))
2349		return device_context_mapped(iommu, bus, devfn);
2350
2351	return !pci_for_each_dma_alias(to_pci_dev(dev),
2352				       domain_context_mapped_cb, iommu);
2353}
2354
2355/* Returns a number of VTD pages, but aligned to MM page size */
2356static inline unsigned long aligned_nrpages(unsigned long host_addr,
2357					    size_t size)
2358{
2359	host_addr &= ~PAGE_MASK;
2360	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2361}
2362
2363/* Return largest possible superpage level for a given mapping */
2364static inline int hardware_largepage_caps(struct dmar_domain *domain,
2365					  unsigned long iov_pfn,
2366					  unsigned long phy_pfn,
2367					  unsigned long pages)
2368{
2369	int support, level = 1;
2370	unsigned long pfnmerge;
2371
2372	support = domain->iommu_superpage;
2373
2374	/* To use a large page, the virtual *and* physical addresses
2375	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2376	   of them will mean we have to use smaller pages. So just
2377	   merge them and check both at once. */
2378	pfnmerge = iov_pfn | phy_pfn;
2379
2380	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2381		pages >>= VTD_STRIDE_SHIFT;
2382		if (!pages)
2383			break;
2384		pfnmerge >>= VTD_STRIDE_SHIFT;
2385		level++;
2386		support--;
2387	}
2388	return level;
2389}
2390
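/*
 * Install PTEs mapping @nr_pages of IOVA space starting at @iov_pfn to
 * either the pages described by @sg or, if @sg is NULL, the contiguous
 * range starting at @phys_pfn. Superpages are used whenever both
 * addresses and the remaining length are suitably aligned.
 */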
2391static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2392			    struct scatterlist *sg, unsigned long phys_pfn,
2393			    unsigned long nr_pages, int prot)
2394{
2395	struct dma_pte *first_pte = NULL, *pte = NULL;
2396	phys_addr_t pteval;
2397	unsigned long sg_res = 0;
2398	unsigned int largepage_lvl = 0;
2399	unsigned long lvl_pages = 0;
2400	u64 attr;
2401
2402	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2403
2404	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2405		return -EINVAL;
2406
2407	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2408	attr |= DMA_FL_PTE_PRESENT;
2409	if (domain_use_first_level(domain)) {
2410		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
2411
2412		if (domain->domain.type == IOMMU_DOMAIN_DMA) {
2413			attr |= DMA_FL_PTE_ACCESS;
2414			if (prot & DMA_PTE_WRITE)
2415				attr |= DMA_FL_PTE_DIRTY;
2416		}
2417	}
2418
2419	if (!sg) {
2420		sg_res = nr_pages;
2421		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2422	}
2423
2424	while (nr_pages > 0) {
2425		uint64_t tmp;
2426
2427		if (!sg_res) {
2428			unsigned int pgoff = sg->offset & ~PAGE_MASK;
2429
2430			sg_res = aligned_nrpages(sg->offset, sg->length);
2431			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2432			sg->dma_length = sg->length;
2433			pteval = (sg_phys(sg) - pgoff) | attr;
2434			phys_pfn = pteval >> VTD_PAGE_SHIFT;
2435		}
2436
2437		if (!pte) {
2438			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2439
2440			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2441			if (!pte)
2442				return -ENOMEM;
2443			/* It is a large page */
2444			if (largepage_lvl > 1) {
2445				unsigned long nr_superpages, end_pfn;
2446
2447				pteval |= DMA_PTE_LARGE_PAGE;
2448				lvl_pages = lvl_to_nr_pages(largepage_lvl);
2449
2450				nr_superpages = sg_res / lvl_pages;
2451				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2452
2453				/*
2454				 * Ensure that old small page tables are
2455				 * removed to make room for superpage(s).
2456				 * We're adding new large pages, so make sure
2457				 * we don't remove their parent tables.
2458				 */
2459				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2460						       largepage_lvl + 1);
2461			} else {
2462				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2463			}
2464
2465		}
2466		/*
2467		 * We don't need a lock here; nobody else touches this IOVA range.
2468		 */
2469		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2470		if (tmp) {
2471			static int dumps = 5;
2472			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2473				iov_pfn, tmp, (unsigned long long)pteval);
2474			if (dumps) {
2475				dumps--;
2476				debug_dma_dump_mappings(NULL);
2477			}
2478			WARN_ON(1);
2479		}
2480
2481		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2482
2483		BUG_ON(nr_pages < lvl_pages);
2484		BUG_ON(sg_res < lvl_pages);
2485
2486		nr_pages -= lvl_pages;
2487		iov_pfn += lvl_pages;
2488		phys_pfn += lvl_pages;
2489		pteval += lvl_pages * VTD_PAGE_SIZE;
2490		sg_res -= lvl_pages;
2491
2492		/* If the next PTE would be the first in a new page, then we
2493		   need to flush the cache on the entries we've just written.
2494		   And then we'll need to recalculate 'pte', so clear it and
2495		   let it get set again in the if (!pte) block above.
2496
2497		   If we're done (!nr_pages) we need to flush the cache too.
2498
2499		   Also if we've been setting superpages, we may need to
2500		   recalculate 'pte' and switch back to smaller pages for the
2501		   end of the mapping, if the trailing size is not enough to
2502		   use another superpage (i.e. sg_res < lvl_pages). */
2503		pte++;
2504		if (!nr_pages || first_pte_in_page(pte) ||
2505		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2506			domain_flush_cache(domain, first_pte,
2507					   (void *)pte - (void *)first_pte);
2508			pte = NULL;
2509		}
2510
2511		if (!sg_res && nr_pages)
2512			sg = sg_next(sg);
2513	}
2514	return 0;
2515}
2516
2517static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2518			  struct scatterlist *sg, unsigned long phys_pfn,
2519			  unsigned long nr_pages, int prot)
2520{
2521	int iommu_id, ret;
2522	struct intel_iommu *iommu;
2523
2524	/* Do the real mapping first */
2525	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2526	if (ret)
2527		return ret;
2528
2529	for_each_domain_iommu(iommu_id, domain) {
2530		iommu = g_iommus[iommu_id];
2531		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2532	}
2533
2534	return 0;
2535}
2536
2537static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2538				    struct scatterlist *sg, unsigned long nr_pages,
2539				    int prot)
2540{
2541	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2542}
2543
2544static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2545				     unsigned long phys_pfn, unsigned long nr_pages,
2546				     int prot)
2547{
2548	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2549}
2550
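/*
 * Clear the context entry for (@bus, @devfn) and invalidate the
 * context-cache, the PASID-cache (in scalable mode) and the IOTLB
 * entries tagged with the entry's old domain id.
 */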
2551static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2552{
2553	unsigned long flags;
2554	struct context_entry *context;
2555	u16 did_old;
2556
2557	if (!iommu)
2558		return;
2559
2560	spin_lock_irqsave(&iommu->lock, flags);
2561	context = iommu_context_addr(iommu, bus, devfn, 0);
2562	if (!context) {
2563		spin_unlock_irqrestore(&iommu->lock, flags);
2564		return;
2565	}
2566	did_old = context_domain_id(context);
2567	context_clear_entry(context);
2568	__iommu_flush_cache(iommu, context, sizeof(*context));
2569	spin_unlock_irqrestore(&iommu->lock, flags);
2570	iommu->flush.flush_context(iommu,
2571				   did_old,
2572				   (((u16)bus) << 8) | devfn,
2573				   DMA_CCMD_MASK_NOBIT,
2574				   DMA_CCMD_DEVICE_INVL);
2575
2576	if (sm_supported(iommu))
2577		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2578
2579	iommu->flush.flush_iotlb(iommu,
2580				 did_old,
2581				 0,
2582				 0,
2583				 DMA_TLB_DSI_FLUSH);
2584}
2585
2586static inline void unlink_domain_info(struct device_domain_info *info)
2587{
2588	assert_spin_locked(&device_domain_lock);
2589	list_del(&info->link);
2590	list_del(&info->global);
2591	if (info->dev)
2592		dev_iommu_priv_set(info->dev, NULL);
2593}
2594
2595static void domain_remove_dev_info(struct dmar_domain *domain)
2596{
2597	struct device_domain_info *info, *tmp;
2598	unsigned long flags;
2599
2600	spin_lock_irqsave(&device_domain_lock, flags);
2601	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2602		__dmar_remove_one_dev_info(info);
2603	spin_unlock_irqrestore(&device_domain_lock, flags);
2604}
2605
2606struct dmar_domain *find_domain(struct device *dev)
2607{
2608	struct device_domain_info *info;
2609
2610	if (unlikely(!dev || !dev->iommu))
2611		return NULL;
2612
2613	if (unlikely(attach_deferred(dev)))
2614		return NULL;
2615
2616	/* No lock here, assumes no domain exit in normal case */
2617	info = get_domain_info(dev);
2618	if (likely(info))
2619		return info->domain;
2620
2621	return NULL;
2622}
2623
2624static void do_deferred_attach(struct device *dev)
2625{
2626	struct iommu_domain *domain;
2627
2628	dev_iommu_priv_set(dev, NULL);
2629	domain = iommu_get_domain_for_dev(dev);
2630	if (domain)
2631		intel_iommu_attach_device(domain, dev);
2632}
2633
2634static inline struct device_domain_info *
2635dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2636{
2637	struct device_domain_info *info;
2638
2639	list_for_each_entry(info, &device_domain_list, global)
2640		if (info->segment == segment && info->bus == bus &&
2641		    info->devfn == devfn)
2642			return info;
2643
2644	return NULL;
2645}
2646
2647static int domain_setup_first_level(struct intel_iommu *iommu,
2648				    struct dmar_domain *domain,
2649				    struct device *dev,
2650				    u32 pasid)
2651{
2652	struct dma_pte *pgd = domain->pgd;
2653	int agaw, level;
2654	int flags = 0;
2655
2656	/*
2657	 * Skip top levels of page tables for iommu which has
2658	 * less agaw than default. Unnecessary for PT mode.
2659	 */
2660	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2661		pgd = phys_to_virt(dma_pte_addr(pgd));
2662		if (!dma_pte_present(pgd))
2663			return -ENOMEM;
2664	}
2665
2666	level = agaw_to_level(agaw);
2667	if (level != 4 && level != 5)
2668		return -EINVAL;
2669
2670	if (pasid != PASID_RID2PASID)
2671		flags |= PASID_FLAG_SUPERVISOR_MODE;
2672	if (level == 5)
2673		flags |= PASID_FLAG_FL5LP;
2674
2675	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2676		flags |= PASID_FLAG_PAGE_SNOOP;
2677
2678	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2679					     domain->iommu_did[iommu->seq_id],
2680					     flags);
2681}
2682
2683static bool dev_is_real_dma_subdevice(struct device *dev)
2684{
2685	return dev && dev_is_pci(dev) &&
2686	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2687}
2688
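/*
 * Bind @dev (addressed as @bus/@devfn behind @iommu) to @domain:
 * allocate its device_domain_info, attach the domain to the IOMMU,
 * set up the PASID table and RID2PASID entry in scalable mode, and
 * program the context entry. Returns the domain actually in use,
 * which may be an already existing one if the device was bound before.
 */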
2689static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2690						    int bus, int devfn,
2691						    struct device *dev,
2692						    struct dmar_domain *domain)
2693{
2694	struct dmar_domain *found = NULL;
2695	struct device_domain_info *info;
2696	unsigned long flags;
2697	int ret;
2698
2699	info = alloc_devinfo_mem();
2700	if (!info)
2701		return NULL;
2702
2703	if (!dev_is_real_dma_subdevice(dev)) {
2704		info->bus = bus;
2705		info->devfn = devfn;
2706		info->segment = iommu->segment;
2707	} else {
2708		struct pci_dev *pdev = to_pci_dev(dev);
2709
2710		info->bus = pdev->bus->number;
2711		info->devfn = pdev->devfn;
2712		info->segment = pci_domain_nr(pdev->bus);
2713	}
2714
2715	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2716	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2717	info->ats_qdep = 0;
2718	info->dev = dev;
2719	info->domain = domain;
2720	info->iommu = iommu;
2721	info->pasid_table = NULL;
2722	info->auxd_enabled = 0;
2723	INIT_LIST_HEAD(&info->auxiliary_domains);
2724
2725	if (dev && dev_is_pci(dev)) {
2726		struct pci_dev *pdev = to_pci_dev(info->dev);
2727
2728		if (ecap_dev_iotlb_support(iommu->ecap) &&
2729		    pci_ats_supported(pdev) &&
2730		    dmar_find_matched_atsr_unit(pdev))
2731			info->ats_supported = 1;
2732
2733		if (sm_supported(iommu)) {
2734			if (pasid_supported(iommu)) {
2735				int features = pci_pasid_features(pdev);
2736				if (features >= 0)
2737					info->pasid_supported = features | 1;
2738			}
2739
2740			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2741			    pci_pri_supported(pdev))
2742				info->pri_supported = 1;
2743		}
2744	}
2745
2746	spin_lock_irqsave(&device_domain_lock, flags);
2747	if (dev)
2748		found = find_domain(dev);
2749
2750	if (!found) {
2751		struct device_domain_info *info2;
2752		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2753						       info->devfn);
2754		if (info2) {
2755			found      = info2->domain;
2756			info2->dev = dev;
2757		}
2758	}
2759
2760	if (found) {
2761		spin_unlock_irqrestore(&device_domain_lock, flags);
2762		free_devinfo_mem(info);
2763		/* Caller must free the original domain */
2764		return found;
2765	}
2766
2767	spin_lock(&iommu->lock);
2768	ret = domain_attach_iommu(domain, iommu);
2769	spin_unlock(&iommu->lock);
2770
2771	if (ret) {
2772		spin_unlock_irqrestore(&device_domain_lock, flags);
2773		free_devinfo_mem(info);
2774		return NULL;
2775	}
2776
2777	list_add(&info->link, &domain->devices);
2778	list_add(&info->global, &device_domain_list);
2779	if (dev)
2780		dev_iommu_priv_set(dev, info);
2781	spin_unlock_irqrestore(&device_domain_lock, flags);
2782
2783	/* PASID table is mandatory for a PCI device in scalable mode. */
2784	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2785		ret = intel_pasid_alloc_table(dev);
2786		if (ret) {
2787			dev_err(dev, "PASID table allocation failed\n");
2788			dmar_remove_one_dev_info(dev);
2789			return NULL;
2790		}
2791
2792		/* Setup the PASID entry for requests without PASID: */
2793		spin_lock_irqsave(&iommu->lock, flags);
2794		if (hw_pass_through && domain_type_is_si(domain))
2795			ret = intel_pasid_setup_pass_through(iommu, domain,
2796					dev, PASID_RID2PASID);
2797		else if (domain_use_first_level(domain))
2798			ret = domain_setup_first_level(iommu, domain, dev,
2799					PASID_RID2PASID);
2800		else
2801			ret = intel_pasid_setup_second_level(iommu, domain,
2802					dev, PASID_RID2PASID);
2803		spin_unlock_irqrestore(&iommu->lock, flags);
2804		if (ret) {
2805			dev_err(dev, "Setup RID2PASID failed\n");
2806			dmar_remove_one_dev_info(dev);
2807			return NULL;
2808		}
2809	}
2810
2811	if (dev && domain_context_mapping(domain, dev)) {
2812		dev_err(dev, "Domain context map failed\n");
2813		dmar_remove_one_dev_info(dev);
2814		return NULL;
2815	}
2816
2817	return domain;
2818}
2819
2820static int iommu_domain_identity_map(struct dmar_domain *domain,
2821				     unsigned long first_vpfn,
2822				     unsigned long last_vpfn)
2823{
2824	/*
2825	 * The RMRR range might overlap a physical memory range;
2826	 * clear it first.
2827	 */
2828	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2829
2830	return __domain_mapping(domain, first_vpfn, NULL,
2831				first_vpfn, last_vpfn - first_vpfn + 1,
2832				DMA_PTE_READ|DMA_PTE_WRITE);
2833}
2834
2835static int md_domain_init(struct dmar_domain *domain, int guest_width);
2836
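/*
 * Build the static identity (si) domain. Unless hardware pass-through
 * (@hw) is used, identity-map all usable physical memory and every
 * RMRR region so that devices attached to si_domain keep working.
 */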
2837static int __init si_domain_init(int hw)
2838{
2839	struct dmar_rmrr_unit *rmrr;
2840	struct device *dev;
2841	int i, nid, ret;
2842
2843	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2844	if (!si_domain)
2845		return -EFAULT;
2846
2847	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2848		domain_exit(si_domain);
2849		si_domain = NULL;
2850		return -EFAULT;
2851	}
2852
2853	if (hw)
2854		return 0;
2855
2856	for_each_online_node(nid) {
2857		unsigned long start_pfn, end_pfn;
2858		int i;
2859
2860		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2861			ret = iommu_domain_identity_map(si_domain,
2862					mm_to_dma_pfn(start_pfn),
2863					mm_to_dma_pfn(end_pfn));
2864			if (ret)
2865				return ret;
2866		}
2867	}
2868
2869	/*
2870	 * Identity map the RMRRs so that devices with RMRRs can also use
2871	 * the si_domain.
2872	 */
2873	for_each_rmrr_units(rmrr) {
2874		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2875					  i, dev) {
2876			unsigned long long start = rmrr->base_address;
2877			unsigned long long end = rmrr->end_address;
2878
2879			if (WARN_ON(end < start ||
2880				    end >> agaw_to_width(si_domain->agaw)))
2881				continue;
2882
2883			ret = iommu_domain_identity_map(si_domain,
2884					mm_to_dma_pfn(start >> PAGE_SHIFT),
2885					mm_to_dma_pfn(end >> PAGE_SHIFT));
2886			if (ret)
2887				return ret;
2888		}
2889	}
2890
2891	return 0;
2892}
2893
2894static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2895{
2896	struct dmar_domain *ndomain;
2897	struct intel_iommu *iommu;
2898	u8 bus, devfn;
2899
2900	iommu = device_to_iommu(dev, &bus, &devfn);
2901	if (!iommu)
2902		return -ENODEV;
2903
2904	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2905	if (ndomain != domain)
2906		return -EBUSY;
2907
2908	return 0;
2909}
2910
2911static bool device_has_rmrr(struct device *dev)
2912{
2913	struct dmar_rmrr_unit *rmrr;
2914	struct device *tmp;
2915	int i;
2916
2917	rcu_read_lock();
2918	for_each_rmrr_units(rmrr) {
2919		/*
2920		 * Return TRUE if this RMRR contains the device that
2921		 * is passed in.
2922		 */
2923		for_each_active_dev_scope(rmrr->devices,
2924					  rmrr->devices_cnt, i, tmp)
2925			if (tmp == dev ||
2926			    is_downstream_to_pci_bridge(dev, tmp)) {
2927				rcu_read_unlock();
2928				return true;
2929			}
2930	}
2931	rcu_read_unlock();
2932	return false;
2933}
2934
2935/**
2936 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2937 * is relaxable (i.e. it is allowed to not be enforced under some conditions)
2938 * @dev: device handle
2939 *
2940 * We assume that PCI USB devices with RMRRs have them largely
2941 * for historical reasons and that the RMRR space is not actively used post
2942 * boot.  This exclusion may change if vendors begin to abuse it.
2943 *
2944 * The same exception is made for graphics devices, with the requirement that
2945 * any use of the RMRR regions will be torn down before assigning the device
2946 * to a guest.
2947 *
2948 * Return: true if the RMRR is relaxable, false otherwise
2949 */
2950static bool device_rmrr_is_relaxable(struct device *dev)
2951{
2952	struct pci_dev *pdev;
2953
2954	if (!dev_is_pci(dev))
2955		return false;
2956
2957	pdev = to_pci_dev(dev);
2958	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2959		return true;
2960	else
2961		return false;
2962}
2963
2964/*
2965 * There are a couple cases where we need to restrict the functionality of
2966 * devices associated with RMRRs.  The first is when evaluating a device for
2967 * identity mapping because problems exist when devices are moved in and out
2968 * of domains and their respective RMRR information is lost.  This means that
2969 * a device with associated RMRRs will never be in a "passthrough" domain.
2970 * The second is use of the device through the IOMMU API.  This interface
2971 * expects to have full control of the IOVA space for the device.  We cannot
2972 * satisfy both the requirement that RMRR access is maintained and have an
2973 * unencumbered IOVA space.  We also have no ability to quiesce the device's
2974 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2975 * We therefore prevent devices associated with an RMRR from participating in
2976 * the IOMMU API, which eliminates them from device assignment.
2977 *
2978 * In both cases, devices which have relaxable RMRRs are not concerned by this
2979 * restriction. See device_rmrr_is_relaxable comment.
2980 */
2981static bool device_is_rmrr_locked(struct device *dev)
2982{
2983	if (!device_has_rmrr(dev))
2984		return false;
2985
2986	if (device_rmrr_is_relaxable(dev))
2987		return false;
2988
2989	return true;
2990}
2991
2992/*
2993 * Return the required default domain type for a specific device.
2994 *
2995 * @dev: the device in query
2997 *
2998 * Returns:
2999 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3000 *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
3001 *  - 0: both identity and dynamic domains work for this device
3002 */
3003static int device_def_domain_type(struct device *dev)
3004{
3005	if (dev_is_pci(dev)) {
3006		struct pci_dev *pdev = to_pci_dev(dev);
3007
3008		/*
3009		 * Prevent any device marked as untrusted from getting
3010		 * placed into the statically identity mapping domain.
3011		 */
3012		if (pdev->untrusted)
3013			return IOMMU_DOMAIN_DMA;
3014
3015		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3016			return IOMMU_DOMAIN_IDENTITY;
3017
3018		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3019			return IOMMU_DOMAIN_IDENTITY;
3020	}
3021
3022	return 0;
3023}
3024
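/*
 * Select the invalidation backend for @iommu: prefer queued
 * invalidation and fall back to register-based invalidation when QI
 * cannot be enabled.
 */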
3025static void intel_iommu_init_qi(struct intel_iommu *iommu)
3026{
3027	/*
3028	 * Start from a sane IOMMU hardware state.
3029	 * If queued invalidation was already initialized by us
3030	 * (for example, while enabling interrupt remapping), then
3031	 * things are already rolling from a sane state.
3032	 */
3033	if (!iommu->qi) {
3034		/*
3035		 * Clear any previous faults.
3036		 */
3037		dmar_fault(-1, iommu);
3038		/*
3039		 * Disable queued invalidation if supported and already enabled
3040		 * before OS handover.
3041		 */
3042		dmar_disable_qi(iommu);
3043	}
3044
3045	if (dmar_enable_qi(iommu)) {
3046		/*
3047		 * Queued invalidation not enabled; use register-based invalidation.
3048		 */
3049		iommu->flush.flush_context = __iommu_flush_context;
3050		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3051		pr_info("%s: Using Register based invalidation\n",
3052			iommu->name);
3053	} else {
3054		iommu->flush.flush_context = qi_flush_context;
3055		iommu->flush.flush_iotlb = qi_flush_iotlb;
3056		pr_info("%s: Using Queued invalidation\n", iommu->name);
3057	}
3058}
3059
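/*
 * Copy one bus worth of context entries from the old (pre-kexec)
 * kernel's tables referenced by @old_re, marking each copied entry
 * and reserving any domain id it uses.
 */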
3060static int copy_context_table(struct intel_iommu *iommu,
3061			      struct root_entry *old_re,
3062			      struct context_entry **tbl,
3063			      int bus, bool ext)
3064{
3065	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3066	struct context_entry *new_ce = NULL, ce;
3067	struct context_entry *old_ce = NULL;
3068	struct root_entry re;
3069	phys_addr_t old_ce_phys;
3070
3071	tbl_idx = ext ? bus * 2 : bus;
3072	memcpy(&re, old_re, sizeof(re));
3073
3074	for (devfn = 0; devfn < 256; devfn++) {
3075		/* First calculate the correct index */
3076		idx = (ext ? devfn * 2 : devfn) % 256;
3077
3078		if (idx == 0) {
3079			/* First save what we may have and clean up */
3080			if (new_ce) {
3081				tbl[tbl_idx] = new_ce;
3082				__iommu_flush_cache(iommu, new_ce,
3083						    VTD_PAGE_SIZE);
3084				pos = 1;
3085			}
3086
3087			if (old_ce)
3088				memunmap(old_ce);
3089
3090			ret = 0;
3091			if (devfn < 0x80)
3092				old_ce_phys = root_entry_lctp(&re);
3093			else
3094				old_ce_phys = root_entry_uctp(&re);
3095
3096			if (!old_ce_phys) {
3097				if (ext && devfn == 0) {
3098					/* No LCTP, try UCTP */
3099					devfn = 0x7f;
3100					continue;
3101				} else {
3102					goto out;
3103				}
3104			}
3105
3106			ret = -ENOMEM;
3107			old_ce = memremap(old_ce_phys, PAGE_SIZE,
3108					MEMREMAP_WB);
3109			if (!old_ce)
3110				goto out;
3111
3112			new_ce = alloc_pgtable_page(iommu->node);
3113			if (!new_ce)
3114				goto out_unmap;
3115
3116			ret = 0;
3117		}
3118
3119		/* Now copy the context entry */
3120		memcpy(&ce, old_ce + idx, sizeof(ce));
3121
3122		if (!__context_present(&ce))
3123			continue;
3124
3125		did = context_domain_id(&ce);
3126		if (did >= 0 && did < cap_ndoms(iommu->cap))
3127			set_bit(did, iommu->domain_ids);
3128
3129		/*
3130		 * We need a marker for copied context entries. This
3131		 * marker needs to work for the old format as well as
3132		 * for extended context entries.
3133		 *
3134		 * Bit 67 of the context entry is used. In the old
3135		 * format this bit is available to software, in the
3136		 * extended format it is the PGE bit, but PGE is ignored
3137		 * by HW if PASIDs are disabled (and thus still
3138		 * available).
3139		 *
3140		 * So disable PASIDs first and then mark the entry
3141		 * copied. This means that we don't copy PASID
3142		 * translations from the old kernel, but this is fine as
3143		 * faults there are not fatal.
3144		 */
3145		context_clear_pasid_enable(&ce);
3146		context_set_copied(&ce);
3147
3148		new_ce[idx] = ce;
3149	}
3150
3151	tbl[tbl_idx + pos] = new_ce;
3152
3153	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3154
3155out_unmap:
3156	memunmap(old_ce);
3157
3158out:
3159	return ret;
3160}
3161
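/*
 * Copy the root and context tables that the previous kernel left
 * programmed, so that in-flight DMA keeps being translated until the
 * devices are re-initialized by their drivers.
 */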
3162static int copy_translation_tables(struct intel_iommu *iommu)
3163{
3164	struct context_entry **ctxt_tbls;
3165	struct root_entry *old_rt;
3166	phys_addr_t old_rt_phys;
3167	int ctxt_table_entries;
3168	unsigned long flags;
3169	u64 rtaddr_reg;
3170	int bus, ret;
3171	bool new_ext, ext;
3172
3173	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3174	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3175	new_ext    = !!ecap_ecs(iommu->ecap);
3176
3177	/*
3178	 * The RTT bit can only be changed when translation is disabled,
3179	 * but disabling translation would open a window for data
3180	 * corruption. So bail out and don't copy anything if we would
3181	 * have to change the bit.
3182	 */
3183	if (new_ext != ext)
3184		return -EINVAL;
3185
3186	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3187	if (!old_rt_phys)
3188		return -EINVAL;
3189
3190	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3191	if (!old_rt)
3192		return -ENOMEM;
3193
3194	/* This is too big for the stack - allocate it from slab */
3195	ctxt_table_entries = ext ? 512 : 256;
3196	ret = -ENOMEM;
3197	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3198	if (!ctxt_tbls)
3199		goto out_unmap;
3200
3201	for (bus = 0; bus < 256; bus++) {
3202		ret = copy_context_table(iommu, &old_rt[bus],
3203					 ctxt_tbls, bus, ext);
3204		if (ret) {
3205			pr_err("%s: Failed to copy context table for bus %d\n",
3206				iommu->name, bus);
3207			continue;
3208		}
3209	}
3210
3211	spin_lock_irqsave(&iommu->lock, flags);
3212
3213	/* Context tables are copied, now write them to the root_entry table */
3214	for (bus = 0; bus < 256; bus++) {
3215		int idx = ext ? bus * 2 : bus;
3216		u64 val;
3217
3218		if (ctxt_tbls[idx]) {
3219			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3220			iommu->root_entry[bus].lo = val;
3221		}
3222
3223		if (!ext || !ctxt_tbls[idx + 1])
3224			continue;
3225
3226		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3227		iommu->root_entry[bus].hi = val;
3228	}
3229
3230	spin_unlock_irqrestore(&iommu->lock, flags);
3231
3232	kfree(ctxt_tbls);
3233
3234	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3235
3236	ret = 0;
3237
3238out_unmap:
3239	memunmap(old_rt);
3240
3241	return ret;
3242}
3243
3244#ifdef CONFIG_INTEL_IOMMU_SVM
3245static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3246{
3247	struct intel_iommu *iommu = data;
3248	ioasid_t ioasid;
3249
3250	if (!iommu)
3251		return INVALID_IOASID;
3252	/*
3253	 * The VT-d virtual command interface always uses the full 20-bit
3254	 * PASID range. The host can partition the guest PASID range based
3255	 * on policies, but this is out of the guest's control.
3256	 */
3257	if (min < PASID_MIN || max > intel_pasid_max_id)
3258		return INVALID_IOASID;
3259
3260	if (vcmd_alloc_pasid(iommu, &ioasid))
3261		return INVALID_IOASID;
3262
3263	return ioasid;
3264}
3265
3266static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3267{
3268	struct intel_iommu *iommu = data;
3269
3270	if (!iommu)
3271		return;
3272	/*
3273	 * Sanity checking of the ioasid owner is done at the upper layer,
3274	 * e.g. VFIO. We can only free the PASID when all devices are unbound.
3275	 */
3276	if (ioasid_find(NULL, ioasid, NULL)) {
3277		pr_alert("Cannot free active IOASID %d\n", ioasid);
3278		return;
3279	}
3280	vcmd_free_pasid(iommu, ioasid);
3281}
3282
3283static void register_pasid_allocator(struct intel_iommu *iommu)
3284{
3285	/*
3286	 * If we are running in the host, there is no need for a custom
3287	 * allocator because PASIDs are allocated system-wide by the host.
3288	 */
3289	if (!cap_caching_mode(iommu->cap))
3290		return;
3291
3292	if (!sm_supported(iommu)) {
3293		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3294		return;
3295	}
3296
3297	/*
3298	 * Register a custom PASID allocator if we are running in a guest;
3299	 * guest PASIDs must be obtained via the virtual command interface.
3300	 * There can be multiple vIOMMUs in each guest, but only one allocator
3301	 * is active. All vIOMMU allocators eventually call the same host
3302	 * allocator.
3303	 */
3304	if (!vccap_pasid(iommu->vccap))
3305		return;
3306
3307	pr_info("Register custom PASID allocator\n");
3308	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3309	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3310	iommu->pasid_allocator.pdata = (void *)iommu;
3311	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3312		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3313		/*
3314		 * Disable scalable mode on this IOMMU if there is no
3315		 * custom allocator. Mixing SM-capable and non-SM vIOMMUs
3316		 * is not supported.
3317		 */
3318		intel_iommu_sm = 0;
3319	}
3320}
3321#endif
3322
3323static int __init init_dmars(void)
3324{
3325	struct dmar_drhd_unit *drhd;
3326	struct intel_iommu *iommu;
3327	int ret;
3328
3329	/*
3330	 * for each drhd
3331	 *    allocate root
3332	 *    initialize and program root entry to not present
3333	 * endfor
3334	 */
3335	for_each_drhd_unit(drhd) {
3336		/*
3337		 * No lock is needed: this is only incremented in the
3338		 * single-threaded kernel __init code path; all other
3339		 * accesses are read-only.
3340		 */
3341		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3342			g_num_of_iommus++;
3343			continue;
3344		}
3345		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3346	}
3347
3348	/* Preallocate enough resources for IOMMU hot-addition */
3349	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3350		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3351
3352	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3353			GFP_KERNEL);
3354	if (!g_iommus) {
3355		pr_err("Allocating global iommu array failed\n");
3356		ret = -ENOMEM;
3357		goto error;
3358	}
3359
3360	for_each_iommu(iommu, drhd) {
3361		if (drhd->ignored) {
3362			iommu_disable_translation(iommu);
3363			continue;
3364		}
3365
3366		/*
3367		 * Find the max PASID size of all IOMMUs in the system.
3368		 * We need to ensure the system PASID table is no bigger
3369		 * than the smallest supported size.
3370		 */
3371		if (pasid_supported(iommu)) {
3372			u32 temp = 2 << ecap_pss(iommu->ecap);
3373
3374			intel_pasid_max_id = min_t(u32, temp,
3375						   intel_pasid_max_id);
3376		}
3377
3378		g_iommus[iommu->seq_id] = iommu;
3379
3380		intel_iommu_init_qi(iommu);
3381
3382		ret = iommu_init_domains(iommu);
3383		if (ret)
3384			goto free_iommu;
3385
3386		init_translation_status(iommu);
3387
3388		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3389			iommu_disable_translation(iommu);
3390			clear_translation_pre_enabled(iommu);
3391			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3392				iommu->name);
3393		}
3394
3395		/*
3396		 * TBD:
3397		 * we could share the same root & context tables
3398		 * among all IOMMUs. Needs to be split out later.
3399		 */
3400		ret = iommu_alloc_root_entry(iommu);
3401		if (ret)
3402			goto free_iommu;
3403
3404		if (translation_pre_enabled(iommu)) {
3405			pr_info("Translation already enabled - trying to copy translation structures\n");
3406
3407			ret = copy_translation_tables(iommu);
3408			if (ret) {
3409				/*
3410				 * We found the IOMMU with translation
3411				 * enabled - but failed to copy over the
3412				 * old root-entry table. Try to proceed
3413				 * by disabling translation now and
3414				 * allocating a clean root-entry table.
3415				 * This might cause DMAR faults, but
3416				 * probably the dump will still succeed.
3417				 */
3418				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3419				       iommu->name);
3420				iommu_disable_translation(iommu);
3421				clear_translation_pre_enabled(iommu);
3422			} else {
3423				pr_info("Copied translation tables from previous kernel for %s\n",
3424					iommu->name);
3425			}
3426		}
3427
3428		if (!ecap_pass_through(iommu->ecap))
3429			hw_pass_through = 0;
3430
3431		if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
3432			pr_warn("Disable batched IOTLB flush due to virtualization");
3433			intel_iommu_strict = 1;
3434		}
3435		intel_svm_check(iommu);
3436	}
3437
3438	/*
3439	 * Now that qi is enabled on all iommus, set the root entry and flush
3440	 * caches. This is required on some Intel X58 chipsets, otherwise the
3441	 * flush_context function will loop forever and the boot hangs.
3442	 */
3443	for_each_active_iommu(iommu, drhd) {
3444		iommu_flush_write_buffer(iommu);
3445#ifdef CONFIG_INTEL_IOMMU_SVM
3446		register_pasid_allocator(iommu);
3447#endif
3448		iommu_set_root_entry(iommu);
3449	}
3450
3451#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3452	dmar_map_gfx = 0;
3453#endif
3454
3455	if (!dmar_map_gfx)
3456		iommu_identity_mapping |= IDENTMAP_GFX;
3457
3458	check_tylersburg_isoch();
3459
3460	ret = si_domain_init(hw_pass_through);
3461	if (ret)
3462		goto free_iommu;
3463
3464	/*
3465	 * for each drhd
3466	 *   enable fault log
3467	 *   global invalidate context cache
3468	 *   global invalidate iotlb
3469	 *   enable translation
3470	 */
3471	for_each_iommu(iommu, drhd) {
3472		if (drhd->ignored) {
3473			/*
3474			 * we always have to disable PMRs or DMA may fail on
3475			 * this device
3476			 */
3477			if (force_on)
3478				iommu_disable_protect_mem_regions(iommu);
3479			continue;
3480		}
3481
3482		iommu_flush_write_buffer(iommu);
3483
3484#ifdef CONFIG_INTEL_IOMMU_SVM
3485		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3486			/*
3487			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3488			 * could cause a lock race condition, so drop it here.
3489			 */
3490			up_write(&dmar_global_lock);
3491			ret = intel_svm_enable_prq(iommu);
3492			down_write(&dmar_global_lock);
3493			if (ret)
3494				goto free_iommu;
3495		}
3496#endif
3497		ret = dmar_set_interrupt(iommu);
3498		if (ret)
3499			goto free_iommu;
3500	}
3501
3502	return 0;
3503
3504free_iommu:
3505	for_each_active_iommu(iommu, drhd) {
3506		disable_dmar_iommu(iommu);
3507		free_dmar_iommu(iommu);
3508	}
3509	if (si_domain) {
3510		domain_exit(si_domain);
3511		si_domain = NULL;
3512	}
3513
3514	kfree(g_iommus);
3515
3516error:
3517	return ret;
3518}
3519
3520/* This takes a number of _MM_ pages, not VTD pages */
3521static unsigned long intel_alloc_iova(struct device *dev,
3522				     struct dmar_domain *domain,
3523				     unsigned long nrpages, uint64_t dma_mask)
3524{
3525	unsigned long iova_pfn;
3526
3527	/*
3528	 * Restrict dma_mask to the width that the iommu can handle.
3529	 * First-level translation restricts the input-address to a
3530	 * canonical address (i.e., address bits 63:N have the same
3531	 * value as address bit [N-1], where N is 48 with 4-level
3532	 * paging and 57 with 5-level paging). Hence, skip bit
3533	 * [N-1].
3534	 */
3535	if (domain_use_first_level(domain))
3536		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3537				 dma_mask);
3538	else
3539		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3540				 dma_mask);
3541
3542	/* Ensure we reserve the whole size-aligned region */
3543	nrpages = __roundup_pow_of_two(nrpages);
3544
3545	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3546		/*
3547		 * First try to allocate an I/O virtual address within
3548		 * DMA_BIT_MASK(32); if that fails, allocate from the
3549		 * higher range.
3550		 */
3551		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3552					   IOVA_PFN(DMA_BIT_MASK(32)), false);
3553		if (iova_pfn)
3554			return iova_pfn;
3555	}
3556	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3557				   IOVA_PFN(dma_mask), true);
3558	if (unlikely(!iova_pfn)) {
3559		dev_err_once(dev, "Allocating %ld-page iova failed\n",
3560			     nrpages);
3561		return 0;
3562	}
3563
3564	return iova_pfn;
3565}
3566
3567static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3568				     size_t size, int dir, u64 dma_mask)
3569{
3570	struct dmar_domain *domain;
3571	phys_addr_t start_paddr;
3572	unsigned long iova_pfn;
3573	int prot = 0;
3574	int ret;
3575	struct intel_iommu *iommu;
3576	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3577
3578	BUG_ON(dir == DMA_NONE);
3579
3580	if (unlikely(attach_deferred(dev)))
3581		do_deferred_attach(dev);
3582
3583	domain = find_domain(dev);
3584	if (!domain)
3585		return DMA_MAPPING_ERROR;
3586
3587	iommu = domain_get_iommu(domain);
3588	size = aligned_nrpages(paddr, size);
3589
3590	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3591	if (!iova_pfn)
3592		goto error;
3593
3594	/*
3595	 * Check if DMAR supports zero-length reads on write-only
3596	 * mappings.
3597	 */
3598	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3599			!cap_zlr(iommu->cap))
3600		prot |= DMA_PTE_READ;
3601	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3602		prot |= DMA_PTE_WRITE;
3603	/*
3604	 * The range paddr .. paddr + size might cover partial pages, so map
3605	 * whole pages.  Note: if two parts of one page are mapped separately,
3606	 * two guest addresses may map to the same host paddr, but this
3607	 * is not a big problem.
3608	 */
3609	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3610				 mm_to_dma_pfn(paddr_pfn), size, prot);
3611	if (ret)
3612		goto error;
3613
3614	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3615	start_paddr += paddr & ~PAGE_MASK;
3616
3617	trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3618
3619	return start_paddr;
3620
3621error:
3622	if (iova_pfn)
3623		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3624	dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3625		size, (unsigned long long)paddr, dir);
3626	return DMA_MAPPING_ERROR;
3627}
3628
3629static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3630				 unsigned long offset, size_t size,
3631				 enum dma_data_direction dir,
3632				 unsigned long attrs)
3633{
3634	return __intel_map_single(dev, page_to_phys(page) + offset,
3635				  size, dir, *dev->dma_mask);
3636}
3637
3638static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3639				     size_t size, enum dma_data_direction dir,
3640				     unsigned long attrs)
3641{
3642	return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3643}
3644
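/*
 * Tear down the mapping backing @dev_addr: unmap the page tables and
 * either flush the IOTLB synchronously (strict mode, untrusted devices,
 * or no flush queue) or defer the flush and IOVA release to the queue.
 */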
3645static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3646{
3647	struct dmar_domain *domain;
3648	unsigned long start_pfn, last_pfn;
3649	unsigned long nrpages;
3650	unsigned long iova_pfn;
3651	struct intel_iommu *iommu;
3652	struct page *freelist;
3653	struct pci_dev *pdev = NULL;
3654
3655	domain = find_domain(dev);
3656	BUG_ON(!domain);
3657
3658	iommu = domain_get_iommu(domain);
3659
3660	iova_pfn = IOVA_PFN(dev_addr);
3661
3662	nrpages = aligned_nrpages(dev_addr, size);
3663	start_pfn = mm_to_dma_pfn(iova_pfn);
3664	last_pfn = start_pfn + nrpages - 1;
3665
3666	if (dev_is_pci(dev))
3667		pdev = to_pci_dev(dev);
3668
3669	freelist = domain_unmap(domain, start_pfn, last_pfn);
3670	if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3671			!has_iova_flush_queue(&domain->iovad)) {
3672		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3673				      nrpages, !freelist, 0);
3674		/* free iova */
3675		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3676		dma_free_pagelist(freelist);
3677	} else {
3678		queue_iova(&domain->iovad, iova_pfn, nrpages,
3679			   (unsigned long)freelist);
3680		/*
3681		 * Queue up the release of the unmap to save roughly 1/6th of
3682		 * the CPU time used up by an immediate IOTLB flush.
3683		 */
3684	}
3685
3686	trace_unmap_single(dev, dev_addr, size);
3687}
3688
3689static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3690			     size_t size, enum dma_data_direction dir,
3691			     unsigned long attrs)
3692{
3693	intel_unmap(dev, dev_addr, size);
3694}
3695
3696static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3697		size_t size, enum dma_data_direction dir, unsigned long attrs)
3698{
3699	intel_unmap(dev, dev_addr, size);
3700}
3701
3702static void *intel_alloc_coherent(struct device *dev, size_t size,
3703				  dma_addr_t *dma_handle, gfp_t flags,
3704				  unsigned long attrs)
3705{
3706	struct page *page = NULL;
3707	int order;
3708
3709	if (unlikely(attach_deferred(dev)))
3710		do_deferred_attach(dev);
3711
3712	size = PAGE_ALIGN(size);
3713	order = get_order(size);
3714
3715	if (gfpflags_allow_blocking(flags)) {
3716		unsigned int count = size >> PAGE_SHIFT;
3717
3718		page = dma_alloc_from_contiguous(dev, count, order,
3719						 flags & __GFP_NOWARN);
3720	}
3721
3722	if (!page)
3723		page = alloc_pages(flags, order);
3724	if (!page)
3725		return NULL;
3726	memset(page_address(page), 0, size);
3727
3728	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3729					 DMA_BIDIRECTIONAL,
3730					 dev->coherent_dma_mask);
3731	if (*dma_handle != DMA_MAPPING_ERROR)
3732		return page_address(page);
3733	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3734		__free_pages(page, order);
3735
3736	return NULL;
3737}
3738
3739static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3740				dma_addr_t dma_handle, unsigned long attrs)
3741{
3742	int order;
3743	struct page *page = virt_to_page(vaddr);
3744
3745	size = PAGE_ALIGN(size);
3746	order = get_order(size);
3747
3748	intel_unmap(dev, dma_handle, size);
3749	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3750		__free_pages(page, order);
3751}
3752
3753static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3754			   int nelems, enum dma_data_direction dir,
3755			   unsigned long attrs)
3756{
3757	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3758	unsigned long nrpages = 0;
3759	struct scatterlist *sg;
3760	int i;
3761
3762	for_each_sg(sglist, sg, nelems, i) {
3763		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3764	}
3765
3766	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3767
3768	trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3769}
3770
3771static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3772			enum dma_data_direction dir, unsigned long attrs)
3773{
3774	int i;
3775	struct dmar_domain *domain;
3776	size_t size = 0;
3777	int prot = 0;
3778	unsigned long iova_pfn;
3779	int ret;
3780	struct scatterlist *sg;
3781	unsigned long start_vpfn;
3782	struct intel_iommu *iommu;
3783
3784	BUG_ON(dir == DMA_NONE);
3785
3786	if (unlikely(attach_deferred(dev)))
3787		do_deferred_attach(dev);
3788
3789	domain = find_domain(dev);
3790	if (!domain)
3791		return 0;
3792
3793	iommu = domain_get_iommu(domain);
3794
3795	for_each_sg(sglist, sg, nelems, i)
3796		size += aligned_nrpages(sg->offset, sg->length);
3797
3798	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3799				*dev->dma_mask);
3800	if (!iova_pfn) {
3801		sglist->dma_length = 0;
3802		return 0;
3803	}
3804
3805	/*
3806	 * Check if DMAR supports zero-length reads on write-only
3807	 * mappings.
3808	 */
3809	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3810			!cap_zlr(iommu->cap))
3811		prot |= DMA_PTE_READ;
3812	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3813		prot |= DMA_PTE_WRITE;
3814
3815	start_vpfn = mm_to_dma_pfn(iova_pfn);
3816
3817	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3818	if (unlikely(ret)) {
3819		dma_pte_free_pagetable(domain, start_vpfn,
3820				       start_vpfn + size - 1,
3821				       agaw_to_level(domain->agaw) + 1);
3822		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3823		return 0;
3824	}
3825
3826	for_each_sg(sglist, sg, nelems, i)
3827		trace_map_sg(dev, i + 1, nelems, sg);
3828
3829	return nelems;
3830}
3831
3832static u64 intel_get_required_mask(struct device *dev)
3833{
3834	return DMA_BIT_MASK(32);
3835}
3836
3837static const struct dma_map_ops intel_dma_ops = {
3838	.alloc = intel_alloc_coherent,
3839	.free = intel_free_coherent,
3840	.map_sg = intel_map_sg,
3841	.unmap_sg = intel_unmap_sg,
3842	.map_page = intel_map_page,
3843	.unmap_page = intel_unmap_page,
3844	.map_resource = intel_map_resource,
3845	.unmap_resource = intel_unmap_resource,
3846	.dma_supported = dma_direct_supported,
3847	.mmap = dma_common_mmap,
3848	.get_sgtable = dma_common_get_sgtable,
3849	.alloc_pages = dma_common_alloc_pages,
3850	.free_pages = dma_common_free_pages,
3851	.get_required_mask = intel_get_required_mask,
3852};
3853
3854static void
3855bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3856		   enum dma_data_direction dir, enum dma_sync_target target)
3857{
3858	struct dmar_domain *domain;
3859	phys_addr_t tlb_addr;
3860
3861	domain = find_domain(dev);
3862	if (WARN_ON(!domain))
3863		return;
3864
3865	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3866	if (is_swiotlb_buffer(tlb_addr))
3867		swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3868}
3869
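/*
 * Map @paddr for DMA, bouncing the buffer through swiotlb when the
 * start address or size is not VTD_PAGE_SIZE aligned, so that the
 * device can never reach unrelated data sharing the same IOMMU page.
 */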
3870static dma_addr_t
3871bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3872		  enum dma_data_direction dir, unsigned long attrs,
3873		  u64 dma_mask)
3874{
3875	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3876	struct dmar_domain *domain;
3877	struct intel_iommu *iommu;
3878	unsigned long iova_pfn;
3879	unsigned long nrpages;
3880	phys_addr_t tlb_addr;
3881	int prot = 0;
3882	int ret;
3883
3884	if (unlikely(attach_deferred(dev)))
3885		do_deferred_attach(dev);
3886
3887	domain = find_domain(dev);
3888
3889	if (WARN_ON(dir == DMA_NONE || !domain))
3890		return DMA_MAPPING_ERROR;
3891
3892	iommu = domain_get_iommu(domain);
3893	if (WARN_ON(!iommu))
3894		return DMA_MAPPING_ERROR;
3895
3896	nrpages = aligned_nrpages(0, size);
3897	iova_pfn = intel_alloc_iova(dev, domain,
3898				    dma_to_mm_pfn(nrpages), dma_mask);
3899	if (!iova_pfn)
3900		return DMA_MAPPING_ERROR;
3901
3902	/*
	 * Check if DMAR supports zero-length reads on write-only
	 * mappings.
3905	 */
3906	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3907			!cap_zlr(iommu->cap))
3908		prot |= DMA_PTE_READ;
3909	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3910		prot |= DMA_PTE_WRITE;
3911
3912	/*
3913	 * If both the physical buffer start address and size are
3914	 * page aligned, we don't need to use a bounce page.
3915	 */
	if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
		void *padding_start;
		size_t padding_size;

		tlb_addr = swiotlb_tbl_map_single(dev, paddr, size,
				aligned_size, dir, attrs);
		if (tlb_addr == DMA_MAPPING_ERROR)
			goto swiotlb_error;

		/* Clean up the padding area. */
		padding_start = phys_to_virt(tlb_addr);
		padding_size = aligned_size;

		if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
		    (dir == DMA_TO_DEVICE ||
		     dir == DMA_BIDIRECTIONAL)) {
			padding_start += size;
			padding_size -= size;
		}

		memset(padding_start, 0, padding_size);
	} else {
		tlb_addr = paddr;
	}
3938
3939	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3940				 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3941	if (ret)
3942		goto mapping_error;
3943
3944	trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3945
3946	return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3947
3948mapping_error:
3949	if (is_swiotlb_buffer(tlb_addr))
3950		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3951					 aligned_size, dir, attrs);
3952swiotlb_error:
3953	free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3954	dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3955		size, (unsigned long long)paddr, dir);
3956
3957	return DMA_MAPPING_ERROR;
3958}
3959
3960static void
3961bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3962		    enum dma_data_direction dir, unsigned long attrs)
3963{
3964	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3965	struct dmar_domain *domain;
3966	phys_addr_t tlb_addr;
3967
3968	domain = find_domain(dev);
3969	if (WARN_ON(!domain))
3970		return;
3971
3972	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3973	if (WARN_ON(!tlb_addr))
3974		return;
3975
3976	intel_unmap(dev, dev_addr, size);
3977	if (is_swiotlb_buffer(tlb_addr))
3978		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3979					 aligned_size, dir, attrs);
3980
3981	trace_bounce_unmap_single(dev, dev_addr, size);
3982}
3983
3984static dma_addr_t
3985bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3986		size_t size, enum dma_data_direction dir, unsigned long attrs)
3987{
3988	return bounce_map_single(dev, page_to_phys(page) + offset,
3989				 size, dir, attrs, *dev->dma_mask);
3990}
3991
3992static dma_addr_t
3993bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3994		    enum dma_data_direction dir, unsigned long attrs)
3995{
3996	return bounce_map_single(dev, phys_addr, size,
3997				 dir, attrs, *dev->dma_mask);
3998}
3999
4000static void
4001bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4002		  enum dma_data_direction dir, unsigned long attrs)
4003{
4004	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4005}
4006
4007static void
4008bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4009		      enum dma_data_direction dir, unsigned long attrs)
4010{
4011	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4012}
4013
4014static void
4015bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4016		enum dma_data_direction dir, unsigned long attrs)
4017{
4018	struct scatterlist *sg;
4019	int i;
4020
4021	for_each_sg(sglist, sg, nelems, i)
4022		bounce_unmap_page(dev, sg->dma_address,
4023				  sg_dma_len(sg), dir, attrs);
4024}
4025
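/*
 * Map a scatterlist through the bounce path. Unlike intel_map_sg(), each
 * element is mapped individually via bounce_map_page(); on failure the
 * elements mapped so far are unwound before returning 0.
 */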
4026static int
4027bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4028	      enum dma_data_direction dir, unsigned long attrs)
4029{
4030	int i;
4031	struct scatterlist *sg;
4032
4033	for_each_sg(sglist, sg, nelems, i) {
4034		sg->dma_address = bounce_map_page(dev, sg_page(sg),
4035						  sg->offset, sg->length,
4036						  dir, attrs);
4037		if (sg->dma_address == DMA_MAPPING_ERROR)
4038			goto out_unmap;
4039		sg_dma_len(sg) = sg->length;
4040	}
4041
4042	for_each_sg(sglist, sg, nelems, i)
4043		trace_bounce_map_sg(dev, i + 1, nelems, sg);
4044
4045	return nelems;
4046
4047out_unmap:
4048	bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4049	return 0;
4050}
4051
4052static void
4053bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4054			   size_t size, enum dma_data_direction dir)
4055{
4056	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4057}
4058
4059static void
4060bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4061			      size_t size, enum dma_data_direction dir)
4062{
4063	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4064}
4065
4066static void
4067bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4068		       int nelems, enum dma_data_direction dir)
4069{
4070	struct scatterlist *sg;
4071	int i;
4072
4073	for_each_sg(sglist, sg, nelems, i)
4074		bounce_sync_single(dev, sg_dma_address(sg),
4075				   sg_dma_len(sg), dir, SYNC_FOR_CPU);
4076}
4077
4078static void
4079bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4080			  int nelems, enum dma_data_direction dir)
4081{
4082	struct scatterlist *sg;
4083	int i;
4084
4085	for_each_sg(sglist, sg, nelems, i)
4086		bounce_sync_single(dev, sg_dma_address(sg),
4087				   sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4088}
4089
4090static const struct dma_map_ops bounce_dma_ops = {
4091	.alloc			= intel_alloc_coherent,
4092	.free			= intel_free_coherent,
4093	.map_sg			= bounce_map_sg,
4094	.unmap_sg		= bounce_unmap_sg,
4095	.map_page		= bounce_map_page,
4096	.unmap_page		= bounce_unmap_page,
4097	.sync_single_for_cpu	= bounce_sync_single_for_cpu,
4098	.sync_single_for_device	= bounce_sync_single_for_device,
4099	.sync_sg_for_cpu	= bounce_sync_sg_for_cpu,
4100	.sync_sg_for_device	= bounce_sync_sg_for_device,
4101	.map_resource		= bounce_map_resource,
4102	.unmap_resource		= bounce_unmap_resource,
4103	.alloc_pages		= dma_common_alloc_pages,
4104	.free_pages		= dma_common_free_pages,
4105	.dma_supported		= dma_direct_supported,
4106};
4107
4108static inline int iommu_domain_cache_init(void)
4109{
4110	int ret = 0;
4111
	iommu_domain_cache = kmem_cache_create("iommu_domain",
					 sizeof(struct dmar_domain),
					 0,
					 SLAB_HWCACHE_ALIGN,
					 NULL);
4118	if (!iommu_domain_cache) {
4119		pr_err("Couldn't create iommu_domain cache\n");
4120		ret = -ENOMEM;
4121	}
4122
4123	return ret;
4124}
4125
4126static inline int iommu_devinfo_cache_init(void)
4127{
4128	int ret = 0;
4129
4130	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4131					 sizeof(struct device_domain_info),
4132					 0,
4133					 SLAB_HWCACHE_ALIGN,
4134					 NULL);
4135	if (!iommu_devinfo_cache) {
4136		pr_err("Couldn't create devinfo cache\n");
4137		ret = -ENOMEM;
4138	}
4139
4140	return ret;
4141}
4142
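/*
 * Set up the allocation caches used at runtime: the shared iova cache plus
 * the kmem caches backing dmar_domain and device_domain_info objects.
 * iommu_exit_mempool() tears them down in the reverse order.
 */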
4143static int __init iommu_init_mempool(void)
4144{
	int ret;

	ret = iova_cache_get();
4147	if (ret)
4148		return ret;
4149
4150	ret = iommu_domain_cache_init();
4151	if (ret)
4152		goto domain_error;
4153
4154	ret = iommu_devinfo_cache_init();
4155	if (!ret)
4156		return ret;
4157
4158	kmem_cache_destroy(iommu_domain_cache);
4159domain_error:
4160	iova_cache_put();
4161
4162	return -ENOMEM;
4163}
4164
4165static void __init iommu_exit_mempool(void)
4166{
4167	kmem_cache_destroy(iommu_devinfo_cache);
4168	kmem_cache_destroy(iommu_domain_cache);
4169	iova_cache_put();
4170}
4171
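/*
 * Mark DRHD units that should not be used for DMA remapping: units whose
 * device scope matches no present device, and units that cover only
 * graphics devices when mapping of graphics devices is disabled.
 */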
4172static void __init init_no_remapping_devices(void)
4173{
4174	struct dmar_drhd_unit *drhd;
4175	struct device *dev;
4176	int i;
4177
4178	for_each_drhd_unit(drhd) {
4179		if (!drhd->include_all) {
4180			for_each_active_dev_scope(drhd->devices,
4181						  drhd->devices_cnt, i, dev)
4182				break;
4183			/* ignore DMAR unit if no devices exist */
4184			if (i == drhd->devices_cnt)
4185				drhd->ignored = 1;
4186		}
4187	}
4188
4189	for_each_active_drhd_unit(drhd) {
4190		if (drhd->include_all)
4191			continue;
4192
4193		for_each_active_dev_scope(drhd->devices,
4194					  drhd->devices_cnt, i, dev)
4195			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4196				break;
4197		if (i < drhd->devices_cnt)
4198			continue;
4199
		/*
		 * This IOMMU has *only* gfx devices. Mark it as gfx
		 * dedicated, and bypass it entirely if gfx mapping is
		 * disabled.
		 */
4202		drhd->gfx_dedicated = 1;
4203		if (!dmar_map_gfx)
4204			drhd->ignored = 1;
4205	}
4206}
4207
4208#ifdef CONFIG_SUSPEND
4209static int init_iommu_hw(void)
4210{
4211	struct dmar_drhd_unit *drhd;
4212	struct intel_iommu *iommu = NULL;
4213
4214	for_each_active_iommu(iommu, drhd)
4215		if (iommu->qi)
4216			dmar_reenable_qi(iommu);
4217
4218	for_each_iommu(iommu, drhd) {
4219		if (drhd->ignored) {
4220			/*
4221			 * we always have to disable PMRs or DMA may fail on
4222			 * this device
4223			 */
4224			if (force_on)
4225				iommu_disable_protect_mem_regions(iommu);
4226			continue;
4227		}
4228
4229		iommu_flush_write_buffer(iommu);
4230		iommu_set_root_entry(iommu);
4231		iommu_enable_translation(iommu);
4232		iommu_disable_protect_mem_regions(iommu);
4233	}
4234
4235	return 0;
4236}
4237
4238static void iommu_flush_all(void)
4239{
4240	struct dmar_drhd_unit *drhd;
4241	struct intel_iommu *iommu;
4242
4243	for_each_active_iommu(iommu, drhd) {
4244		iommu->flush.flush_context(iommu, 0, 0, 0,
4245					   DMA_CCMD_GLOBAL_INVL);
4246		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4247					 DMA_TLB_GLOBAL_FLUSH);
4248	}
4249}
4250
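/*
 * Save the fault-event registers of every active IOMMU across suspend.
 * Translation is disabled after a global context/IOTLB flush; iommu_resume()
 * re-initializes the hardware and writes the saved registers back.
 */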
4251static int iommu_suspend(void)
4252{
4253	struct dmar_drhd_unit *drhd;
4254	struct intel_iommu *iommu = NULL;
4255	unsigned long flag;
4256
4257	for_each_active_iommu(iommu, drhd) {
4258		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4259						 GFP_ATOMIC);
4260		if (!iommu->iommu_state)
4261			goto nomem;
4262	}
4263
4264	iommu_flush_all();
4265
4266	for_each_active_iommu(iommu, drhd) {
4267		iommu_disable_translation(iommu);
4268
4269		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4270
4271		iommu->iommu_state[SR_DMAR_FECTL_REG] =
4272			readl(iommu->reg + DMAR_FECTL_REG);
4273		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4274			readl(iommu->reg + DMAR_FEDATA_REG);
4275		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4276			readl(iommu->reg + DMAR_FEADDR_REG);
4277		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4278			readl(iommu->reg + DMAR_FEUADDR_REG);
4279
4280		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4281	}
4282	return 0;
4283
4284nomem:
4285	for_each_active_iommu(iommu, drhd)
4286		kfree(iommu->iommu_state);
4287
4288	return -ENOMEM;
4289}
4290
4291static void iommu_resume(void)
4292{
4293	struct dmar_drhd_unit *drhd;
4294	struct intel_iommu *iommu = NULL;
4295	unsigned long flag;
4296
4297	if (init_iommu_hw()) {
4298		if (force_on)
4299			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4300		else
4301			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4302		return;
4303	}
4304
4305	for_each_active_iommu(iommu, drhd) {
4306
4307		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4308
4309		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4310			iommu->reg + DMAR_FECTL_REG);
4311		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4312			iommu->reg + DMAR_FEDATA_REG);
4313		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4314			iommu->reg + DMAR_FEADDR_REG);
4315		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4316			iommu->reg + DMAR_FEUADDR_REG);
4317
4318		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4319	}
4320
4321	for_each_active_iommu(iommu, drhd)
4322		kfree(iommu->iommu_state);
4323}
4324
4325static struct syscore_ops iommu_syscore_ops = {
4326	.resume		= iommu_resume,
4327	.suspend	= iommu_suspend,
4328};
4329
4330static void __init init_iommu_pm_ops(void)
4331{
4332	register_syscore_ops(&iommu_syscore_ops);
4333}
4334
4335#else
4336static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
4338
4339static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4340{
4341	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4342	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4343	    rmrr->end_address <= rmrr->base_address ||
4344	    arch_rmrr_sanity_check(rmrr))
4345		return -EINVAL;
4346
4347	return 0;
4348}
4349
4350int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4351{
4352	struct acpi_dmar_reserved_memory *rmrr;
4353	struct dmar_rmrr_unit *rmrru;
4354
4355	rmrr = (struct acpi_dmar_reserved_memory *)header;
4356	if (rmrr_sanity_check(rmrr)) {
4357		pr_warn(FW_BUG
4358			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4359			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4360			   rmrr->base_address, rmrr->end_address,
4361			   dmi_get_system_info(DMI_BIOS_VENDOR),
4362			   dmi_get_system_info(DMI_BIOS_VERSION),
4363			   dmi_get_system_info(DMI_PRODUCT_VERSION));
4364		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4365	}
4366
4367	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4368	if (!rmrru)
4369		goto out;
4370
4371	rmrru->hdr = header;
4372
4373	rmrru->base_address = rmrr->base_address;
4374	rmrru->end_address = rmrr->end_address;
4375
4376	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4377				((void *)rmrr) + rmrr->header.length,
4378				&rmrru->devices_cnt);
4379	if (rmrru->devices_cnt && rmrru->devices == NULL)
4380		goto free_rmrru;
4381
4382	list_add(&rmrru->list, &dmar_rmrr_units);
4383
4384	return 0;
4385free_rmrru:
4386	kfree(rmrru);
4387out:
4388	return -ENOMEM;
4389}
4390
4391static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4392{
4393	struct dmar_atsr_unit *atsru;
4394	struct acpi_dmar_atsr *tmp;
4395
4396	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4397				dmar_rcu_check()) {
4398		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4399		if (atsr->segment != tmp->segment)
4400			continue;
4401		if (atsr->header.length != tmp->header.length)
4402			continue;
4403		if (memcmp(atsr, tmp, atsr->header.length) == 0)
4404			return atsru;
4405	}
4406
4407	return NULL;
4408}
4409
4410int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4411{
4412	struct acpi_dmar_atsr *atsr;
4413	struct dmar_atsr_unit *atsru;
4414
4415	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4416		return 0;
4417
4418	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4419	atsru = dmar_find_atsr(atsr);
4420	if (atsru)
4421		return 0;
4422
4423	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4424	if (!atsru)
4425		return -ENOMEM;
4426
4427	/*
4428	 * If memory is allocated from slab by ACPI _DSM method, we need to
4429	 * copy the memory content because the memory buffer will be freed
4430	 * on return.
4431	 */
4432	atsru->hdr = (void *)(atsru + 1);
4433	memcpy(atsru->hdr, hdr, hdr->length);
4434	atsru->include_all = atsr->flags & 0x1;
4435	if (!atsru->include_all) {
4436		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4437				(void *)atsr + atsr->header.length,
4438				&atsru->devices_cnt);
4439		if (atsru->devices_cnt && atsru->devices == NULL) {
4440			kfree(atsru);
4441			return -ENOMEM;
4442		}
4443	}
4444
4445	list_add_rcu(&atsru->list, &dmar_atsr_units);
4446
4447	return 0;
4448}
4449
4450static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4451{
4452	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4453	kfree(atsru);
4454}
4455
4456int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4457{
4458	struct acpi_dmar_atsr *atsr;
4459	struct dmar_atsr_unit *atsru;
4460
4461	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4462	atsru = dmar_find_atsr(atsr);
4463	if (atsru) {
4464		list_del_rcu(&atsru->list);
4465		synchronize_rcu();
4466		intel_iommu_free_atsr(atsru);
4467	}
4468
4469	return 0;
4470}
4471
4472int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4473{
4474	int i;
4475	struct device *dev;
4476	struct acpi_dmar_atsr *atsr;
4477	struct dmar_atsr_unit *atsru;
4478
4479	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4480	atsru = dmar_find_atsr(atsr);
4481	if (!atsru)
4482		return 0;
4483
4484	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4485		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4486					  i, dev)
4487			return -EBUSY;
4488	}
4489
4490	return 0;
4491}
4492
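/*
 * Bring up a hot-added DMAR unit: verify that it matches the capabilities
 * the running configuration depends on (pass-through, snooping, large
 * pages), allocate its domain table and root entry, enable queued
 * invalidation and the fault interrupt, and finally turn on translation.
 */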
4493static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4494{
4495	int sp, ret;
4496	struct intel_iommu *iommu = dmaru->iommu;
4497
4498	if (g_iommus[iommu->seq_id])
4499		return 0;
4500
4501	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4502		pr_warn("%s: Doesn't support hardware pass through.\n",
4503			iommu->name);
4504		return -ENXIO;
4505	}
4506	if (!ecap_sc_support(iommu->ecap) &&
4507	    domain_update_iommu_snooping(iommu)) {
4508		pr_warn("%s: Doesn't support snooping.\n",
4509			iommu->name);
4510		return -ENXIO;
4511	}
4512	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4513	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4514		pr_warn("%s: Doesn't support large page.\n",
4515			iommu->name);
4516		return -ENXIO;
4517	}
4518
4519	/*
4520	 * Disable translation if already enabled prior to OS handover.
4521	 */
4522	if (iommu->gcmd & DMA_GCMD_TE)
4523		iommu_disable_translation(iommu);
4524
4525	g_iommus[iommu->seq_id] = iommu;
4526	ret = iommu_init_domains(iommu);
4527	if (ret == 0)
4528		ret = iommu_alloc_root_entry(iommu);
4529	if (ret)
4530		goto out;
4531
4532	intel_svm_check(iommu);
4533
4534	if (dmaru->ignored) {
4535		/*
4536		 * we always have to disable PMRs or DMA may fail on this device
4537		 */
4538		if (force_on)
4539			iommu_disable_protect_mem_regions(iommu);
4540		return 0;
4541	}
4542
4543	intel_iommu_init_qi(iommu);
4544	iommu_flush_write_buffer(iommu);
4545
4546#ifdef CONFIG_INTEL_IOMMU_SVM
4547	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4548		ret = intel_svm_enable_prq(iommu);
4549		if (ret)
4550			goto disable_iommu;
4551	}
4552#endif
4553	ret = dmar_set_interrupt(iommu);
4554	if (ret)
4555		goto disable_iommu;
4556
4557	iommu_set_root_entry(iommu);
4558	iommu_enable_translation(iommu);
4559
4560	iommu_disable_protect_mem_regions(iommu);
4561	return 0;
4562
4563disable_iommu:
4564	disable_dmar_iommu(iommu);
4565out:
4566	free_dmar_iommu(iommu);
4567	return ret;
4568}
4569
4570int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4571{
4572	int ret = 0;
4573	struct intel_iommu *iommu = dmaru->iommu;
4574
4575	if (!intel_iommu_enabled)
4576		return 0;
	if (!iommu)
4578		return -EINVAL;
4579
4580	if (insert) {
4581		ret = intel_iommu_add(dmaru);
4582	} else {
4583		disable_dmar_iommu(iommu);
4584		free_dmar_iommu(iommu);
4585	}
4586
4587	return ret;
4588}
4589
4590static void intel_iommu_free_dmars(void)
4591{
4592	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4593	struct dmar_atsr_unit *atsru, *atsr_n;
4594
4595	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4596		list_del(&rmrru->list);
4597		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4598		kfree(rmrru);
4599	}
4600
4601	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4602		list_del(&atsru->list);
4603		intel_iommu_free_atsr(atsru);
4604	}
4605}
4606
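/*
 * Decide whether ATS may be used for @dev by walking up to its PCIe root
 * port and matching that port against the ATSR device scopes for the
 * segment. Root-complex integrated devices are always allowed; devices
 * behind conventional PCI bridges never are.
 */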
4607int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4608{
4609	int i, ret = 1;
4610	struct pci_bus *bus;
4611	struct pci_dev *bridge = NULL;
4612	struct device *tmp;
4613	struct acpi_dmar_atsr *atsr;
4614	struct dmar_atsr_unit *atsru;
4615
4616	dev = pci_physfn(dev);
4617	for (bus = dev->bus; bus; bus = bus->parent) {
4618		bridge = bus->self;
4619		/* If it's an integrated device, allow ATS */
4620		if (!bridge)
4621			return 1;
4622		/* Connected via non-PCIe: no ATS */
4623		if (!pci_is_pcie(bridge) ||
4624		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4625			return 0;
4626		/* If we found the root port, look it up in the ATSR */
4627		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4628			break;
4629	}
4630
4631	rcu_read_lock();
4632	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4633		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4634		if (atsr->segment != pci_domain_nr(dev->bus))
4635			continue;
4636
4637		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4638			if (tmp == &bridge->dev)
4639				goto out;
4640
4641		if (atsru->include_all)
4642			goto out;
4643	}
4644	ret = 0;
4645out:
4646	rcu_read_unlock();
4647
4648	return ret;
4649}
4650
4651int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4652{
4653	int ret;
4654	struct dmar_rmrr_unit *rmrru;
4655	struct dmar_atsr_unit *atsru;
4656	struct acpi_dmar_atsr *atsr;
4657	struct acpi_dmar_reserved_memory *rmrr;
4658
4659	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4660		return 0;
4661
4662	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4663		rmrr = container_of(rmrru->hdr,
4664				    struct acpi_dmar_reserved_memory, header);
4665		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4666			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4667				((void *)rmrr) + rmrr->header.length,
4668				rmrr->segment, rmrru->devices,
4669				rmrru->devices_cnt);
4670			if (ret < 0)
4671				return ret;
4672		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4673			dmar_remove_dev_scope(info, rmrr->segment,
4674				rmrru->devices, rmrru->devices_cnt);
4675		}
4676	}
4677
4678	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4679		if (atsru->include_all)
4680			continue;
4681
4682		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4683		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4684			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4685					(void *)atsr + atsr->header.length,
4686					atsr->segment, atsru->devices,
4687					atsru->devices_cnt);
4688			if (ret > 0)
4689				break;
4690			else if (ret < 0)
4691				return ret;
4692		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4693			if (dmar_remove_dev_scope(info, atsr->segment,
4694					atsru->devices, atsru->devices_cnt))
4695				break;
4696		}
4697	}
4698
4699	return 0;
4700}
4701
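/*
 * Keep the si_domain identity map in sync with memory hotplug: map a range
 * when it is about to go online, and unmap it (flushing the IOTLB on every
 * active IOMMU) when it goes offline or the online operation is cancelled.
 */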
4702static int intel_iommu_memory_notifier(struct notifier_block *nb,
4703				       unsigned long val, void *v)
4704{
4705	struct memory_notify *mhp = v;
4706	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4707	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4708			mhp->nr_pages - 1);
4709
4710	switch (val) {
4711	case MEM_GOING_ONLINE:
4712		if (iommu_domain_identity_map(si_domain,
4713					      start_vpfn, last_vpfn)) {
4714			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4715				start_vpfn, last_vpfn);
4716			return NOTIFY_BAD;
4717		}
4718		break;
4719
4720	case MEM_OFFLINE:
4721	case MEM_CANCEL_ONLINE:
4722		{
4723			struct dmar_drhd_unit *drhd;
4724			struct intel_iommu *iommu;
4725			struct page *freelist;
4726
4727			freelist = domain_unmap(si_domain,
4728						start_vpfn, last_vpfn);
4729
4730			rcu_read_lock();
4731			for_each_active_iommu(iommu, drhd)
4732				iommu_flush_iotlb_psi(iommu, si_domain,
4733					start_vpfn, mhp->nr_pages,
4734					!freelist, 0);
4735			rcu_read_unlock();
4736			dma_free_pagelist(freelist);
4737		}
4738		break;
4739	}
4740
4741	return NOTIFY_OK;
4742}
4743
4744static struct notifier_block intel_iommu_memory_nb = {
4745	.notifier_call = intel_iommu_memory_notifier,
4746	.priority = 0
4747};
4748
4749static void free_all_cpu_cached_iovas(unsigned int cpu)
4750{
4751	int i;
4752
4753	for (i = 0; i < g_num_of_iommus; i++) {
4754		struct intel_iommu *iommu = g_iommus[i];
4755		struct dmar_domain *domain;
4756		int did;
4757
4758		if (!iommu)
4759			continue;
4760
4761		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4762			domain = get_iommu_domain(iommu, (u16)did);
4763
4764			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4765				continue;
4766
4767			free_cpu_cached_iovas(cpu, &domain->iovad);
4768		}
4769	}
4770}
4771
4772static int intel_iommu_cpu_dead(unsigned int cpu)
4773{
4774	free_all_cpu_cached_iovas(cpu);
4775	return 0;
4776}
4777
4778static void intel_disable_iommus(void)
4779{
4780	struct intel_iommu *iommu = NULL;
4781	struct dmar_drhd_unit *drhd;
4782
4783	for_each_iommu(iommu, drhd)
4784		iommu_disable_translation(iommu);
4785}
4786
4787void intel_iommu_shutdown(void)
4788{
4789	struct dmar_drhd_unit *drhd;
4790	struct intel_iommu *iommu = NULL;
4791
4792	if (no_iommu || dmar_disabled)
4793		return;
4794
4795	down_write(&dmar_global_lock);
4796
4797	/* Disable PMRs explicitly here. */
4798	for_each_iommu(iommu, drhd)
4799		iommu_disable_protect_mem_regions(iommu);
4800
4801	/* Make sure the IOMMUs are switched off */
4802	intel_disable_iommus();
4803
4804	up_write(&dmar_global_lock);
4805}
4806
4807static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4808{
4809	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4810
4811	return container_of(iommu_dev, struct intel_iommu, iommu);
4812}
4813
4814static ssize_t intel_iommu_show_version(struct device *dev,
4815					struct device_attribute *attr,
4816					char *buf)
4817{
4818	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4819	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4820	return sprintf(buf, "%d:%d\n",
4821		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4822}
4823static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4824
4825static ssize_t intel_iommu_show_address(struct device *dev,
4826					struct device_attribute *attr,
4827					char *buf)
4828{
4829	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4830	return sprintf(buf, "%llx\n", iommu->reg_phys);
4831}
4832static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4833
4834static ssize_t intel_iommu_show_cap(struct device *dev,
4835				    struct device_attribute *attr,
4836				    char *buf)
4837{
4838	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4839	return sprintf(buf, "%llx\n", iommu->cap);
4840}
4841static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4842
4843static ssize_t intel_iommu_show_ecap(struct device *dev,
4844				    struct device_attribute *attr,
4845				    char *buf)
4846{
4847	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4848	return sprintf(buf, "%llx\n", iommu->ecap);
4849}
4850static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4851
4852static ssize_t intel_iommu_show_ndoms(struct device *dev,
4853				      struct device_attribute *attr,
4854				      char *buf)
4855{
4856	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4857	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4858}
4859static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4860
4861static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4862					   struct device_attribute *attr,
4863					   char *buf)
4864{
4865	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4866	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4867						  cap_ndoms(iommu->cap)));
4868}
4869static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4870
4871static struct attribute *intel_iommu_attrs[] = {
4872	&dev_attr_version.attr,
4873	&dev_attr_address.attr,
4874	&dev_attr_cap.attr,
4875	&dev_attr_ecap.attr,
4876	&dev_attr_domains_supported.attr,
4877	&dev_attr_domains_used.attr,
4878	NULL,
4879};
4880
4881static struct attribute_group intel_iommu_group = {
4882	.name = "intel-iommu",
4883	.attrs = intel_iommu_attrs,
4884};
4885
4886const struct attribute_group *intel_iommu_groups[] = {
4887	&intel_iommu_group,
4888	NULL,
4889};
4890
4891static inline bool has_external_pci(void)
4892{
4893	struct pci_dev *pdev = NULL;
4894
4895	for_each_pci_dev(pdev)
4896		if (pdev->external_facing) {
4897			pci_dev_put(pdev);
4898			return true;
4899		}
4900
4901	return false;
4902}
4903
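/*
 * Honour the platform's DMA protection opt-in: if the firmware opted in and
 * an external-facing PCI device is present, force the IOMMU on even when it
 * was disabled on the command line, defaulting previously disabled setups
 * to pass-through domains.
 */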
4904static int __init platform_optin_force_iommu(void)
4905{
4906	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4907		return 0;
4908
4909	if (no_iommu || dmar_disabled)
4910		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4911
4912	/*
4913	 * If Intel-IOMMU is disabled by default, we will apply identity
4914	 * map for all devices except those marked as being untrusted.
4915	 */
4916	if (dmar_disabled)
4917		iommu_set_default_passthrough(false);
4918
4919	dmar_disabled = 0;
4920	no_iommu = 0;
4921
4922	return 1;
4923}
4924
4925static int __init probe_acpi_namespace_devices(void)
4926{
4927	struct dmar_drhd_unit *drhd;
4928	/* To avoid a -Wunused-but-set-variable warning. */
4929	struct intel_iommu *iommu __maybe_unused;
4930	struct device *dev;
4931	int i, ret = 0;
4932
4933	for_each_active_iommu(iommu, drhd) {
4934		for_each_active_dev_scope(drhd->devices,
4935					  drhd->devices_cnt, i, dev) {
4936			struct acpi_device_physical_node *pn;
4937			struct iommu_group *group;
4938			struct acpi_device *adev;
4939
4940			if (dev->bus != &acpi_bus_type)
4941				continue;
4942
4943			adev = to_acpi_device(dev);
4944			mutex_lock(&adev->physical_node_lock);
4945			list_for_each_entry(pn,
4946					    &adev->physical_node_list, node) {
4947				group = iommu_group_get(pn->dev);
4948				if (group) {
4949					iommu_group_put(group);
4950					continue;
4951				}
4952
4953				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4954				ret = iommu_probe_device(pn->dev);
4955				if (ret)
4956					break;
4957			}
4958			mutex_unlock(&adev->physical_node_lock);
4959
4960			if (ret)
4961				return ret;
4962		}
4963	}
4964
4965	return 0;
4966}
4967
4968int __init intel_iommu_init(void)
4969{
4970	int ret = -ENODEV;
4971	struct dmar_drhd_unit *drhd;
4972	struct intel_iommu *iommu;
4973
4974	/*
4975	 * Intel IOMMU is required for a TXT/tboot launch or platform
4976	 * opt in, so enforce that.
4977	 */
4978	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4979		    platform_optin_force_iommu();
4980
4981	if (iommu_init_mempool()) {
4982		if (force_on)
4983			panic("tboot: Failed to initialize iommu memory\n");
4984		return -ENOMEM;
4985	}
4986
4987	down_write(&dmar_global_lock);
4988	if (dmar_table_init()) {
4989		if (force_on)
4990			panic("tboot: Failed to initialize DMAR table\n");
4991		goto out_free_dmar;
4992	}
4993
4994	if (dmar_dev_scope_init() < 0) {
4995		if (force_on)
4996			panic("tboot: Failed to initialize DMAR device scope\n");
4997		goto out_free_dmar;
4998	}
4999
5000	up_write(&dmar_global_lock);
5001
5002	/*
5003	 * The bus notifier takes the dmar_global_lock, so lockdep will
5004	 * complain later when we register it under the lock.
5005	 */
5006	dmar_register_bus_notifier();
5007
5008	down_write(&dmar_global_lock);
5009
5010	if (!no_iommu)
5011		intel_iommu_debugfs_init();
5012
5013	if (no_iommu || dmar_disabled) {
5014		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't set up, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * them explicitly here. The PMRs were set up by tboot prior to
5019		 * calling SENTER, but the kernel is expected to reset/tear
5020		 * down the PMRs.
5021		 */
5022		if (intel_iommu_tboot_noforce) {
5023			for_each_iommu(iommu, drhd)
5024				iommu_disable_protect_mem_regions(iommu);
5025		}
5026
5027		/*
5028		 * Make sure the IOMMUs are switched off, even when we
5029		 * boot into a kexec kernel and the previous kernel left
5030		 * them enabled
5031		 */
5032		intel_disable_iommus();
5033		goto out_free_dmar;
5034	}
5035
5036	if (list_empty(&dmar_rmrr_units))
5037		pr_info("No RMRR found\n");
5038
5039	if (list_empty(&dmar_atsr_units))
5040		pr_info("No ATSR found\n");
5041
5042	if (dmar_init_reserved_ranges()) {
5043		if (force_on)
5044			panic("tboot: Failed to reserve iommu ranges\n");
5045		goto out_free_reserved_range;
5046	}
5047
5048	if (dmar_map_gfx)
5049		intel_iommu_gfx_mapped = 1;
5050
5051	init_no_remapping_devices();
5052
5053	ret = init_dmars();
5054	if (ret) {
5055		if (force_on)
5056			panic("tboot: Failed to initialize DMARs\n");
5057		pr_err("Initialization failed\n");
5058		goto out_free_reserved_range;
5059	}
5060	up_write(&dmar_global_lock);
5061
5062	init_iommu_pm_ops();
5063
5064	down_read(&dmar_global_lock);
5065	for_each_active_iommu(iommu, drhd) {
5066		iommu_device_sysfs_add(&iommu->iommu, NULL,
5067				       intel_iommu_groups,
5068				       "%s", iommu->name);
5069		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5070		iommu_device_register(&iommu->iommu);
5071	}
5072	up_read(&dmar_global_lock);
5073
5074	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5075	if (si_domain && !hw_pass_through)
5076		register_memory_notifier(&intel_iommu_memory_nb);
5077	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5078			  intel_iommu_cpu_dead);
5079
5080	down_read(&dmar_global_lock);
5081	if (probe_acpi_namespace_devices())
5082		pr_warn("ACPI name space devices didn't probe correctly\n");
5083
5084	/* Finally, we enable the DMA remapping hardware. */
5085	for_each_iommu(iommu, drhd) {
5086		if (!drhd->ignored && !translation_pre_enabled(iommu))
5087			iommu_enable_translation(iommu);
5088
5089		iommu_disable_protect_mem_regions(iommu);
5090	}
5091	up_read(&dmar_global_lock);
5092
5093	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5094
5095	intel_iommu_enabled = 1;
5096
5097	return 0;
5098
5099out_free_reserved_range:
5100	put_iova_domain(&reserved_iova_list);
5101out_free_dmar:
5102	intel_iommu_free_dmars();
5103	up_write(&dmar_global_lock);
5104	iommu_exit_mempool();
5105	return ret;
5106}
5107
5108static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5109{
5110	struct intel_iommu *iommu = opaque;
5111
5112	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5113	return 0;
5114}
5115
5116/*
5117 * NB - intel-iommu lacks any sort of reference counting for the users of
5118 * dependent devices.  If multiple endpoints have intersecting dependent
5119 * devices, unbinding the driver from any one of them will possibly leave
5120 * the others unable to operate.
5121 */
5122static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5123{
5124	if (!iommu || !dev || !dev_is_pci(dev))
5125		return;
5126
5127	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5128}
5129
5130static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5131{
5132	struct dmar_domain *domain;
5133	struct intel_iommu *iommu;
5134	unsigned long flags;
5135
5136	assert_spin_locked(&device_domain_lock);
5137
5138	if (WARN_ON(!info))
5139		return;
5140
5141	iommu = info->iommu;
5142	domain = info->domain;
5143
5144	if (info->dev) {
5145		if (dev_is_pci(info->dev) && sm_supported(iommu))
5146			intel_pasid_tear_down_entry(iommu, info->dev,
5147					PASID_RID2PASID, false);
5148
5149		iommu_disable_dev_iotlb(info);
5150		if (!dev_is_real_dma_subdevice(info->dev))
5151			domain_context_clear(iommu, info->dev);
5152		intel_pasid_free_table(info->dev);
5153	}
5154
5155	unlink_domain_info(info);
5156
5157	spin_lock_irqsave(&iommu->lock, flags);
5158	domain_detach_iommu(domain, iommu);
5159	spin_unlock_irqrestore(&iommu->lock, flags);
5160
5161	free_devinfo_mem(info);
5162}
5163
5164static void dmar_remove_one_dev_info(struct device *dev)
5165{
5166	struct device_domain_info *info;
5167	unsigned long flags;
5168
5169	spin_lock_irqsave(&device_domain_lock, flags);
5170	info = get_domain_info(dev);
5171	if (info)
5172		__dmar_remove_one_dev_info(info);
5173	spin_unlock_irqrestore(&device_domain_lock, flags);
5174}
5175
5176static int md_domain_init(struct dmar_domain *domain, int guest_width)
5177{
5178	int adjust_width;
5179
5180	/* calculate AGAW */
5181	domain->gaw = guest_width;
5182	adjust_width = guestwidth_to_adjustwidth(guest_width);
5183	domain->agaw = width_to_agaw(adjust_width);
5184
5185	domain->iommu_coherency = 0;
5186	domain->iommu_snooping = 0;
5187	domain->iommu_superpage = 0;
5188	domain->max_addr = 0;
5189
5190	/* always allocate the top pgd */
5191	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5192	if (!domain->pgd)
5193		return -ENOMEM;
5194	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5195	return 0;
5196}
5197
5198static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5199{
5200	init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5201	copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5202
5203	if (!intel_iommu_strict &&
5204	    init_iova_flush_queue(&dmar_domain->iovad,
5205				  iommu_flush_iova, iova_entry_free))
5206		pr_info("iova flush queue initialization failed\n");
5207}
5208
5209static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5210{
5211	struct dmar_domain *dmar_domain;
5212	struct iommu_domain *domain;
5213
5214	switch (type) {
5215	case IOMMU_DOMAIN_DMA:
5216	case IOMMU_DOMAIN_UNMANAGED:
5217		dmar_domain = alloc_domain(0);
5218		if (!dmar_domain) {
5219			pr_err("Can't allocate dmar_domain\n");
5220			return NULL;
5221		}
5222		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5223			pr_err("Domain initialization failed\n");
5224			domain_exit(dmar_domain);
5225			return NULL;
5226		}
5227
5228		if (type == IOMMU_DOMAIN_DMA)
5229			intel_init_iova_domain(dmar_domain);
5230
5231		domain = &dmar_domain->domain;
5232		domain->geometry.aperture_start = 0;
5233		domain->geometry.aperture_end   =
5234				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
5235		domain->geometry.force_aperture = true;
5236
5237		return domain;
5238	case IOMMU_DOMAIN_IDENTITY:
5239		return &si_domain->domain;
5240	default:
5241		return NULL;
5242	}
5243
5244	return NULL;
5245}
5246
5247static void intel_iommu_domain_free(struct iommu_domain *domain)
5248{
5249	if (domain != &si_domain->domain)
5250		domain_exit(to_dmar_domain(domain));
5251}
5252
5253/*
5254 * Check whether a @domain could be attached to the @dev through the
5255 * aux-domain attach/detach APIs.
5256 */
5257static inline bool
5258is_aux_domain(struct device *dev, struct iommu_domain *domain)
5259{
5260	struct device_domain_info *info = get_domain_info(dev);
5261
5262	return info && info->auxd_enabled &&
5263			domain->type == IOMMU_DOMAIN_UNMANAGED;
5264}
5265
5266static void auxiliary_link_device(struct dmar_domain *domain,
5267				  struct device *dev)
5268{
5269	struct device_domain_info *info = get_domain_info(dev);
5270
5271	assert_spin_locked(&device_domain_lock);
5272	if (WARN_ON(!info))
5273		return;
5274
5275	domain->auxd_refcnt++;
5276	list_add(&domain->auxd, &info->auxiliary_domains);
5277}
5278
5279static void auxiliary_unlink_device(struct dmar_domain *domain,
5280				    struct device *dev)
5281{
5282	struct device_domain_info *info = get_domain_info(dev);
5283
5284	assert_spin_locked(&device_domain_lock);
5285	if (WARN_ON(!info))
5286		return;
5287
5288	list_del(&domain->auxd);
5289	domain->auxd_refcnt--;
5290
5291	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5292		ioasid_free(domain->default_pasid);
5293}
5294
5295static int aux_domain_add_dev(struct dmar_domain *domain,
5296			      struct device *dev)
5297{
5298	int ret;
5299	unsigned long flags;
5300	struct intel_iommu *iommu;
5301
5302	iommu = device_to_iommu(dev, NULL, NULL);
5303	if (!iommu)
5304		return -ENODEV;
5305
5306	if (domain->default_pasid <= 0) {
5307		u32 pasid;
5308
5309		/* No private data needed for the default pasid */
5310		pasid = ioasid_alloc(NULL, PASID_MIN,
5311				     pci_max_pasids(to_pci_dev(dev)) - 1,
5312				     NULL);
5313		if (pasid == INVALID_IOASID) {
5314			pr_err("Can't allocate default pasid\n");
5315			return -ENODEV;
5316		}
5317		domain->default_pasid = pasid;
5318	}
5319
5320	spin_lock_irqsave(&device_domain_lock, flags);
5321	/*
5322	 * iommu->lock must be held to attach domain to iommu and setup the
5323	 * pasid entry for second level translation.
5324	 */
5325	spin_lock(&iommu->lock);
5326	ret = domain_attach_iommu(domain, iommu);
5327	if (ret)
5328		goto attach_failed;
5329
	/* Set up the PASID entry for mediated devices. */
5331	if (domain_use_first_level(domain))
5332		ret = domain_setup_first_level(iommu, domain, dev,
5333					       domain->default_pasid);
5334	else
5335		ret = intel_pasid_setup_second_level(iommu, domain, dev,
5336						     domain->default_pasid);
5337	if (ret)
5338		goto table_failed;
5339	spin_unlock(&iommu->lock);
5340
5341	auxiliary_link_device(domain, dev);
5342
5343	spin_unlock_irqrestore(&device_domain_lock, flags);
5344
5345	return 0;
5346
5347table_failed:
5348	domain_detach_iommu(domain, iommu);
5349attach_failed:
5350	spin_unlock(&iommu->lock);
5351	spin_unlock_irqrestore(&device_domain_lock, flags);
5352	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5353		ioasid_free(domain->default_pasid);
5354
5355	return ret;
5356}
5357
5358static void aux_domain_remove_dev(struct dmar_domain *domain,
5359				  struct device *dev)
5360{
5361	struct device_domain_info *info;
5362	struct intel_iommu *iommu;
5363	unsigned long flags;
5364
5365	if (!is_aux_domain(dev, &domain->domain))
5366		return;
5367
5368	spin_lock_irqsave(&device_domain_lock, flags);
5369	info = get_domain_info(dev);
5370	iommu = info->iommu;
5371
5372	auxiliary_unlink_device(domain, dev);
5373
5374	spin_lock(&iommu->lock);
5375	intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5376	domain_detach_iommu(domain, iommu);
5377	spin_unlock(&iommu->lock);
5378
5379	spin_unlock_irqrestore(&device_domain_lock, flags);
5380}
5381
5382static int prepare_domain_attach_device(struct iommu_domain *domain,
5383					struct device *dev)
5384{
5385	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5386	struct intel_iommu *iommu;
5387	int addr_width;
5388
5389	iommu = device_to_iommu(dev, NULL, NULL);
5390	if (!iommu)
5391		return -ENODEV;
5392
5393	/* check if this iommu agaw is sufficient for max mapped address */
5394	addr_width = agaw_to_width(iommu->agaw);
5395	if (addr_width > cap_mgaw(iommu->cap))
5396		addr_width = cap_mgaw(iommu->cap);
5397
5398	if (dmar_domain->max_addr > (1LL << addr_width)) {
		dev_err(dev, "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
			__func__, addr_width, dmar_domain->max_addr);
5402		return -EFAULT;
5403	}
5404	dmar_domain->gaw = addr_width;
5405
5406	/*
5407	 * Knock out extra levels of page tables if necessary
5408	 */
5409	while (iommu->agaw < dmar_domain->agaw) {
5410		struct dma_pte *pte;
5411
5412		pte = dmar_domain->pgd;
5413		if (dma_pte_present(pte)) {
5414			dmar_domain->pgd = (struct dma_pte *)
5415				phys_to_virt(dma_pte_addr(pte));
5416			free_pgtable_page(pte);
5417		}
5418		dmar_domain->agaw--;
5419	}
5420
5421	return 0;
5422}
5423
5424static int intel_iommu_attach_device(struct iommu_domain *domain,
5425				     struct device *dev)
5426{
5427	int ret;
5428
5429	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5430	    device_is_rmrr_locked(dev)) {
5431		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5432		return -EPERM;
5433	}
5434
5435	if (is_aux_domain(dev, domain))
5436		return -EPERM;
5437
5438	/* normally dev is not mapped */
5439	if (unlikely(domain_context_mapped(dev))) {
5440		struct dmar_domain *old_domain;
5441
5442		old_domain = find_domain(dev);
5443		if (old_domain)
5444			dmar_remove_one_dev_info(dev);
5445	}
5446
5447	ret = prepare_domain_attach_device(domain, dev);
5448	if (ret)
5449		return ret;
5450
5451	return domain_add_dev_info(to_dmar_domain(domain), dev);
5452}
5453
5454static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5455					 struct device *dev)
5456{
5457	int ret;
5458
5459	if (!is_aux_domain(dev, domain))
5460		return -EPERM;
5461
5462	ret = prepare_domain_attach_device(domain, dev);
5463	if (ret)
5464		return ret;
5465
5466	return aux_domain_add_dev(to_dmar_domain(domain), dev);
5467}
5468
5469static void intel_iommu_detach_device(struct iommu_domain *domain,
5470				      struct device *dev)
5471{
5472	dmar_remove_one_dev_info(dev);
5473}
5474
5475static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5476					  struct device *dev)
5477{
5478	aux_domain_remove_dev(to_dmar_domain(domain), dev);
5479}
5480
5481#ifdef CONFIG_INTEL_IOMMU_SVM
5482/*
5483 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5484 * VT-d granularity. Invalidation is typically included in the unmap operation
5485 * as a result of DMA or VFIO unmap. However, for assigned devices guest
5486 * owns the first level page tables. Invalidations of translation caches in the
5487 * guest are trapped and passed down to the host.
5488 *
5489 * vIOMMU in the guest will only expose first level page tables, therefore
 * we do not support IOTLB granularity for requests without PASID (second level).
5491 *
5492 * For example, to find the VT-d granularity encoding for IOTLB
5493 * type and page selective granularity within PASID:
5494 * X: indexed by iommu cache type
5495 * Y: indexed by enum iommu_inv_granularity
5496 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5497 */
5498
5499static const int
5500inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5501	/*
5502	 * PASID based IOTLB invalidation: PASID selective (per PASID),
5503	 * page selective (address granularity)
5504	 */
5505	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5506	/* PASID based dev TLBs */
5507	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5508	/* PASID cache */
5509	{-EINVAL, -EINVAL, -EINVAL}
5510};
5511
5512static inline int to_vtd_granularity(int type, int granu)
5513{
5514	return inv_type_granu_table[type][granu];
5515}
5516
5517static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5518{
5519	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5520
	/*
	 * VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
	 * The IOMMU cache invalidate API passes granu_size in bytes and the
	 * number of granules of contiguous memory.
	 */
5525	return order_base_2(nr_pages);
5526}
5527
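/*
 * Handle a cache invalidation request passed down for nested translation
 * (e.g. from a vIOMMU): translate the generic cache type and granularity
 * into VT-d encodings, then issue the matching PASID-based IOTLB and/or
 * device-IOTLB invalidations on behalf of the guest.
 */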
5528static int
5529intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5530			   struct iommu_cache_invalidate_info *inv_info)
5531{
5532	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5533	struct device_domain_info *info;
5534	struct intel_iommu *iommu;
5535	unsigned long flags;
5536	int cache_type;
5537	u8 bus, devfn;
5538	u16 did, sid;
5539	int ret = 0;
5540	u64 size = 0;
5541
5542	if (!inv_info || !dmar_domain)
5543		return -EINVAL;
5544
5545	if (!dev || !dev_is_pci(dev))
5546		return -ENODEV;
5547
5548	iommu = device_to_iommu(dev, &bus, &devfn);
5549	if (!iommu)
5550		return -ENODEV;
5551
5552	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5553		return -EINVAL;
5554
5555	spin_lock_irqsave(&device_domain_lock, flags);
5556	spin_lock(&iommu->lock);
5557	info = get_domain_info(dev);
5558	if (!info) {
5559		ret = -EINVAL;
5560		goto out_unlock;
5561	}
5562	did = dmar_domain->iommu_did[iommu->seq_id];
5563	sid = PCI_DEVID(bus, devfn);
5564
5565	/* Size is only valid in address selective invalidation */
5566	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5567		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5568				   inv_info->granu.addr_info.nb_granules);
5569
5570	for_each_set_bit(cache_type,
5571			 (unsigned long *)&inv_info->cache,
5572			 IOMMU_CACHE_INV_TYPE_NR) {
5573		int granu = 0;
5574		u64 pasid = 0;
5575		u64 addr = 0;
5576
5577		granu = to_vtd_granularity(cache_type, inv_info->granularity);
5578		if (granu == -EINVAL) {
5579			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5580					   cache_type, inv_info->granularity);
5581			break;
5582		}
5583
5584		/*
5585		 * PASID is stored in different locations based on the
5586		 * granularity.
5587		 */
5588		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5589		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5590			pasid = inv_info->granu.pasid_info.pasid;
5591		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5592			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5593			pasid = inv_info->granu.addr_info.pasid;
5594
5595		switch (BIT(cache_type)) {
5596		case IOMMU_CACHE_INV_TYPE_IOTLB:
5597			/* HW will ignore LSB bits based on address mask */
5598			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5599			    size &&
5600			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5601				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5602						   inv_info->granu.addr_info.addr, size);
5603			}
5604
5605			/*
5606			 * If granu is PASID-selective, address is ignored.
5607			 * We use npages = -1 to indicate that.
5608			 */
5609			qi_flush_piotlb(iommu, did, pasid,
5610					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5611					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5612					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5613
5614			if (!info->ats_enabled)
5615				break;
5616			/*
5617			 * Always flush device IOTLB if ATS is enabled. vIOMMU
5618			 * in the guest may assume IOTLB flush is inclusive,
5619			 * which is more efficient.
5620			 */
5621			fallthrough;
5622		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5623			/*
5624			 * PASID based device TLB invalidation does not support
5625			 * IOMMU_INV_GRANU_PASID granularity but only supports
5626			 * IOMMU_INV_GRANU_ADDR.
			 * The equivalent here is to set the size to cover the
			 * entire 64-bit address range. The user only provides
			 * PASID info without address info, so set addr to 0.
5630			 */
5631			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5632				size = 64 - VTD_PAGE_SHIFT;
5633				addr = 0;
5634			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5635				addr = inv_info->granu.addr_info.addr;
5636			}
5637
5638			if (info->ats_enabled)
5639				qi_flush_dev_iotlb_pasid(iommu, sid,
5640						info->pfsid, pasid,
5641						info->ats_qdep, addr,
5642						size);
5643			else
5644				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5645			break;
5646		default:
5647			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5648					    cache_type);
5649			ret = -EINVAL;
5650		}
5651	}
5652out_unlock:
5653	spin_unlock(&iommu->lock);
5654	spin_unlock_irqrestore(&device_domain_lock, flags);
5655
5656	return ret;
5657}
5658#endif
5659
5660static int intel_iommu_map(struct iommu_domain *domain,
5661			   unsigned long iova, phys_addr_t hpa,
5662			   size_t size, int iommu_prot, gfp_t gfp)
5663{
5664	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5665	u64 max_addr;
5666	int prot = 0;
5667	int ret;
5668
5669	if (iommu_prot & IOMMU_READ)
5670		prot |= DMA_PTE_READ;
5671	if (iommu_prot & IOMMU_WRITE)
5672		prot |= DMA_PTE_WRITE;
5673	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5674		prot |= DMA_PTE_SNP;
5675
5676	max_addr = iova + size;
5677	if (dmar_domain->max_addr < max_addr) {
5678		u64 end;
5679
5680		/* check if minimum agaw is sufficient for mapped address */
5681		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5682		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
5686			return -EFAULT;
5687		}
5688		dmar_domain->max_addr = max_addr;
5689	}
	/*
	 * Round up size to the next multiple of PAGE_SIZE if it and
	 * the low bits of hpa would take us onto the next page.
	 */
5692	size = aligned_nrpages(hpa, size);
5693	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5694				 hpa >> VTD_PAGE_SHIFT, size, prot);
5695	return ret;
5696}
5697
5698static size_t intel_iommu_unmap(struct iommu_domain *domain,
5699				unsigned long iova, size_t size,
5700				struct iommu_iotlb_gather *gather)
5701{
5702	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5703	struct page *freelist = NULL;
5704	unsigned long start_pfn, last_pfn;
5705	unsigned int npages;
5706	int iommu_id, level = 0;
5707
	/*
	 * Cope with horrid API which requires us to unmap more than the
	 * size argument if it happens to be a large-page mapping.
	 */
5710	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5711
5712	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5713		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5714
5715	start_pfn = iova >> VTD_PAGE_SHIFT;
5716	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5717
5718	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5719
5720	npages = last_pfn - start_pfn + 1;
5721
5722	for_each_domain_iommu(iommu_id, dmar_domain)
5723		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5724				      start_pfn, npages, !freelist, 0);
5725
5726	dma_free_pagelist(freelist);
5727
5728	if (dmar_domain->max_addr == iova + size)
5729		dmar_domain->max_addr = iova;
5730
5731	return size;
5732}
5733
5734static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5735					    dma_addr_t iova)
5736{
5737	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5738	struct dma_pte *pte;
5739	int level = 0;
5740	u64 phys = 0;
5741
5742	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5743	if (pte && dma_pte_present(pte))
5744		phys = dma_pte_addr(pte) +
5745			(iova & (BIT_MASK(level_to_offset_bits(level) +
5746						VTD_PAGE_SHIFT) - 1));
5747
5748	return phys;
5749}
5750
5751static inline bool scalable_mode_support(void)
5752{
5753	struct dmar_drhd_unit *drhd;
5754	struct intel_iommu *iommu;
5755	bool ret = true;
5756
5757	rcu_read_lock();
5758	for_each_active_iommu(iommu, drhd) {
5759		if (!sm_supported(iommu)) {
5760			ret = false;
5761			break;
5762		}
5763	}
5764	rcu_read_unlock();
5765
5766	return ret;
5767}
5768
5769static inline bool iommu_pasid_support(void)
5770{
5771	struct dmar_drhd_unit *drhd;
5772	struct intel_iommu *iommu;
5773	bool ret = true;
5774
5775	rcu_read_lock();
5776	for_each_active_iommu(iommu, drhd) {
5777		if (!pasid_supported(iommu)) {
5778			ret = false;
5779			break;
5780		}
5781	}
5782	rcu_read_unlock();
5783
5784	return ret;
5785}
5786
5787static inline bool nested_mode_support(void)
5788{
5789	struct dmar_drhd_unit *drhd;
5790	struct intel_iommu *iommu;
5791	bool ret = true;
5792
5793	rcu_read_lock();
5794	for_each_active_iommu(iommu, drhd) {
5795		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5796			ret = false;
5797			break;
5798		}
5799	}
5800	rcu_read_unlock();
5801
5802	return ret;
5803}
5804
5805static bool intel_iommu_capable(enum iommu_cap cap)
5806{
5807	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5808		return domain_update_iommu_snooping(NULL) == 1;
5809	if (cap == IOMMU_CAP_INTR_REMAP)
5810		return irq_remapping_enabled == 1;
5811
5812	return false;
5813}
5814
5815static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5816{
5817	struct intel_iommu *iommu;
5818
5819	iommu = device_to_iommu(dev, NULL, NULL);
5820	if (!iommu)
5821		return ERR_PTR(-ENODEV);
5822
5823	if (translation_pre_enabled(iommu))
5824		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5825
5826	return &iommu->iommu;
5827}
5828
5829static void intel_iommu_release_device(struct device *dev)
5830{
5831	struct intel_iommu *iommu;
5832
5833	iommu = device_to_iommu(dev, NULL, NULL);
5834	if (!iommu)
5835		return;
5836
5837	dmar_remove_one_dev_info(dev);
5838
5839	set_dma_ops(dev, NULL);
5840}
5841
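/*
 * Pick the DMA API ops for a device once its default domain is known:
 * bounce ops for devices that need bounce buffering, the IOMMU-backed ops
 * for DMA domains, and the architecture default (direct) otherwise.
 */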
static void intel_iommu_probe_finalize(struct device *dev)
{
	struct iommu_domain *domain;

	domain = iommu_get_domain_for_dev(dev);
	if (device_needs_bounce(dev))
		set_dma_ops(dev, &bounce_dma_ops);
	else if (domain && domain->type == IOMMU_DOMAIN_DMA)
		set_dma_ops(dev, &intel_dma_ops);
	else
		set_dma_ops(dev, NULL);
}

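/*
 * Report the reserved regions for @device: RMRR ranges as direct mappings
 * (relaxable where the device allows it), the first 16MiB for legacy ISA
 * bridges when the floppy workaround is enabled, and the IOAPIC range as an
 * MSI region.
 */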
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	down_read(&dmar_global_lock);
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			struct iommu_resv_region *resv;
			enum iommu_resv_type type;
			size_t length;

			if (i_dev != device &&
			    !is_downstream_to_pci_bridge(device, i_dev))
				continue;

			length = rmrr->end_address - rmrr->base_address + 1;

			type = device_rmrr_is_relaxable(device) ?
				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;

			resv = iommu_alloc_resv_region(rmrr->base_address,
						       length, prot, type);
			if (!resv)
				break;

			list_add_tail(&resv->list, head);
		}
	}
	up_read(&dmar_global_lock);

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
	if (dev_is_pci(device)) {
		struct pci_dev *pdev = to_pci_dev(device);

		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
						   IOMMU_RESV_DIRECT_RELAXABLE);
			if (reg)
				list_add_tail(&reg->list, head);
		}
	}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}

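/*
 * Enable PASID handling for @dev: set the PASID-enable bit in its context
 * entry, invalidate the context cache for the device, and enable the PASID
 * feature on the device itself if it is not already enabled.
 */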
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = find_domain(dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = get_domain_info(dev);
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		ctx_lo |= CONTEXT_PASIDE;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu,
					   domain->iommu_did[iommu->seq_id],
					   PCI_DEVID(info->bus, info->devfn),
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	ret = 0;

 out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}

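/* Reserve the IOVA range of a reserved region so the allocator never uses it. */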
static void intel_iommu_apply_resv_region(struct device *dev,
					  struct iommu_domain *domain,
					  struct iommu_resv_region *region)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long start, end;

	start = IOVA_PFN(region->start);
	end   = IOVA_PFN(region->start + region->length - 1);

	WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
}

static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	if (dev_is_pci(dev))
		return pci_device_group(dev);
	return generic_device_group(dev);
}

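/*
 * Enable auxiliary domain support for @dev. This requires scalable mode and
 * PASID support on the translating IOMMU, and PASID to be successfully
 * enabled on the device.
 */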
static int intel_iommu_enable_auxd(struct device *dev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	int ret;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!sm_supported(iommu) || !pasid_supported(iommu))
		return -EINVAL;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret)
		return -ENODEV;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	info->auxd_enabled = 1;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}

static int intel_iommu_disable_auxd(struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	if (!WARN_ON(!info))
		info->auxd_enabled = 0;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}

/*
 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
 * specification so that system software and tools can detect endpoint
 * devices supporting Intel Scalable I/O Virtualization without depending on
 * the host driver.
 *
 * Returns the offset of the matching extended capability structure within
 * the device's PCI configuration space, or 0 if the device does not support
 * it.
 */
static int siov_find_pci_dvsec(struct pci_dev *pdev)
{
	int pos;
	u16 vendor, id;

	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
	while (pos) {
		pci_read_config_word(pdev, pos + 4, &vendor);
		pci_read_config_word(pdev, pos + 8, &id);
		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
			return pos;

		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
	}

	return 0;
}

static bool
intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX) {
		int ret;

		if (!dev_is_pci(dev) || dmar_disabled ||
		    !scalable_mode_support() || !iommu_pasid_support())
			return false;

		ret = pci_pasid_features(to_pci_dev(dev));
		if (ret < 0)
			return false;

		return !!siov_find_pci_dvsec(to_pci_dev(dev));
	}

	if (feat == IOMMU_DEV_FEAT_SVA) {
		struct device_domain_info *info = get_domain_info(dev);

		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
			info->pasid_supported && info->pri_supported &&
			info->ats_supported;
	}

	return false;
}

static int
intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX)
		return intel_iommu_enable_auxd(dev);

	if (feat == IOMMU_DEV_FEAT_SVA) {
		struct device_domain_info *info = get_domain_info(dev);

		if (!info)
			return -EINVAL;

		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
			return 0;
	}

	return -ENODEV;
}

static int
intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX)
		return intel_iommu_disable_auxd(dev);

	return -ENODEV;
}

static bool
intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
{
	struct device_domain_info *info = get_domain_info(dev);

	if (feat == IOMMU_DEV_FEAT_AUX)
		return scalable_mode_support() && info && info->auxd_enabled;

	return false;
}

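/* Return the default PASID assigned to the aux domain, or -EINVAL if none. */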
static int
intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	return dmar_domain->default_pasid > 0 ?
			dmar_domain->default_pasid : -EINVAL;
}

static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
					   struct device *dev)
{
	return attach_deferred(dev);
}

static int
intel_iommu_domain_set_attr(struct iommu_domain *domain,
			    enum iommu_attr attr, void *data)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long flags;
	int ret = 0;

	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
		return -EINVAL;

	switch (attr) {
	case DOMAIN_ATTR_NESTING:
		spin_lock_irqsave(&device_domain_lock, flags);
		if (nested_mode_support() &&
		    list_empty(&dmar_domain->devices)) {
			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
		} else {
			ret = -ENODEV;
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Check whether the device lives on an external-facing PCI port that is
 * marked as untrusted. Quirks are not applied to such devices, so they
 * cannot use a quirk to bypass the IOMMU restrictions.
 */
static bool risky_device(struct pci_dev *pdev)
{
	if (pdev->untrusted) {
		pci_info(pdev,
			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
			 pdev->vendor, pdev->device);
		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
		return true;
	}
	return false;
}

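/* IOMMU core callbacks for the Intel VT-d driver. */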
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.domain_set_attr	= intel_iommu_domain_set_attr,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.aux_attach_dev		= intel_iommu_aux_attach_device,
	.aux_detach_dev		= intel_iommu_aux_detach_device,
	.aux_get_pasid		= intel_iommu_aux_get_pasid,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= generic_iommu_put_resv_regions,
	.apply_resv_region	= intel_iommu_apply_resv_region,
	.device_group		= intel_iommu_device_group,
	.dev_has_feat		= intel_iommu_dev_has_feat,
	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.cache_invalidate	= intel_iommu_sva_invalidate,
	.sva_bind_gpasid	= intel_svm_bind_gpasid,
	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
	.sva_bind		= intel_svm_bind,
	.sva_unbind		= intel_svm_unbind,
	.sva_get_pasid		= intel_svm_get_pasid,
	.page_response		= intel_svm_page_response,
#endif
};

static void quirk_iommu_igfx(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

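/*
 * GGC is the graphics control register in the host bridge configuration
 * space on these Ironlake/Calpella parts; the fields below describe how
 * much memory the BIOS set aside for the IGD's VT (shadow GTT) use.
 */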
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (risky_device(dev))
		return;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

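/*
 * For the integrated graphics generations matched below, set
 * iommu_skip_te_disable so that the translation-enable bit is left set when
 * translation would otherwise be disabled.
 */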
static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
	unsigned short ver;

	if (!IS_GFX_DEVICE(dev))
		return;

	ver = (dev->device >> 8) & 0xff;
	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
		return;

	if (risky_device(dev))
		return;

	pci_info(dev, "Skip IOMMU disabling for graphics\n");
	iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that.  We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
	       vtisochctrl);
}
