xref: /kernel/linux/linux-6.6/drivers/iommu/amd/iommu.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
4 * Author: Joerg Roedel <jroedel@suse.de>
5 *         Leo Duran <leo.duran@amd.com>
6 */
7
8#define pr_fmt(fmt)     "AMD-Vi: " fmt
9#define dev_fmt(fmt)    pr_fmt(fmt)
10
11#include <linux/ratelimit.h>
12#include <linux/pci.h>
13#include <linux/acpi.h>
14#include <linux/pci-ats.h>
15#include <linux/bitmap.h>
16#include <linux/slab.h>
17#include <linux/debugfs.h>
18#include <linux/scatterlist.h>
19#include <linux/dma-map-ops.h>
20#include <linux/dma-direct.h>
21#include <linux/iommu-helper.h>
22#include <linux/delay.h>
23#include <linux/amd-iommu.h>
24#include <linux/notifier.h>
25#include <linux/export.h>
26#include <linux/irq.h>
27#include <linux/msi.h>
28#include <linux/irqdomain.h>
29#include <linux/percpu.h>
30#include <linux/io-pgtable.h>
31#include <linux/cc_platform.h>
32#include <asm/irq_remapping.h>
33#include <asm/io_apic.h>
34#include <asm/apic.h>
35#include <asm/hw_irq.h>
36#include <asm/proto.h>
37#include <asm/iommu.h>
38#include <asm/gart.h>
39#include <asm/dma.h>
40
41#include "amd_iommu.h"
42#include "../dma-iommu.h"
43#include "../irq_remapping.h"
44
45#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
46
47#define LOOP_TIMEOUT	100000
48
49/* IO virtual address start page frame number */
50#define IOVA_START_PFN		(1)
51#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
52
53/* Reserved IOVA ranges */
54#define MSI_RANGE_START		(0xfee00000)
55#define MSI_RANGE_END		(0xfeefffff)
56#define HT_RANGE_START		(0xfd00000000ULL)
57#define HT_RANGE_END		(0xffffffffffULL)
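/*
 * The MSI window (0xfee00000-0xfeefffff) is the x86 interrupt/MSI address
 * range and the HT range is the HyperTransport reserved address region;
 * both are carved out so they are never handed out as IOVA space.
 */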
58
59#define DEFAULT_PGTABLE_LEVEL	PAGE_MODE_3_LEVEL
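/*
 * With 512-entry (9-bit) levels above a 4k page, a 3-level v1 page table
 * covers a 39-bit IOVA space; the v1 io-pgtable code grows the table to
 * deeper modes (up to PAGE_MODE_6_LEVEL) on demand when larger addresses
 * are mapped.
 */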
60
61static DEFINE_SPINLOCK(pd_bitmap_lock);
62
63LIST_HEAD(ioapic_map);
64LIST_HEAD(hpet_map);
65LIST_HEAD(acpihid_map);
66
67const struct iommu_ops amd_iommu_ops;
68
69static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
70int amd_iommu_max_glx_val = -1;
71
72/*
73 * general struct to manage commands sent to an IOMMU
74 */
75struct iommu_cmd {
76	u32 data[4];
77};
78
79struct kmem_cache *amd_iommu_irq_cache;
80
81static void detach_device(struct device *dev);
82static int domain_enable_v2(struct protection_domain *domain, int pasids);
83
84/****************************************************************************
85 *
86 * Helper functions
87 *
88 ****************************************************************************/
89
90static inline int get_acpihid_device_id(struct device *dev,
91					struct acpihid_map_entry **entry)
92{
93	struct acpi_device *adev = ACPI_COMPANION(dev);
94	struct acpihid_map_entry *p;
95
96	if (!adev)
97		return -ENODEV;
98
99	list_for_each_entry(p, &acpihid_map, list) {
100		if (acpi_dev_hid_uid_match(adev, p->hid,
101					   p->uid[0] ? p->uid : NULL)) {
102			if (entry)
103				*entry = p;
104			return p->devid;
105		}
106	}
107	return -EINVAL;
108}
109
110static inline int get_device_sbdf_id(struct device *dev)
111{
112	int sbdf;
113
114	if (dev_is_pci(dev))
115		sbdf = get_pci_sbdf_id(to_pci_dev(dev));
116	else
117		sbdf = get_acpihid_device_id(dev, NULL);
118
119	return sbdf;
120}
121
122struct dev_table_entry *get_dev_table(struct amd_iommu *iommu)
123{
124	struct dev_table_entry *dev_table;
125	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
126
127	BUG_ON(pci_seg == NULL);
128	dev_table = pci_seg->dev_table;
129	BUG_ON(dev_table == NULL);
130
131	return dev_table;
132}
133
134static inline u16 get_device_segment(struct device *dev)
135{
136	u16 seg;
137
138	if (dev_is_pci(dev)) {
139		struct pci_dev *pdev = to_pci_dev(dev);
140
141		seg = pci_domain_nr(pdev->bus);
142	} else {
143		u32 devid = get_acpihid_device_id(dev, NULL);
144
145		seg = PCI_SBDF_TO_SEGID(devid);
146	}
147
148	return seg;
149}
150
151/* Writes the specific IOMMU for a device into the PCI segment rlookup table */
152void amd_iommu_set_rlookup_table(struct amd_iommu *iommu, u16 devid)
153{
154	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
155
156	pci_seg->rlookup_table[devid] = iommu;
157}
158
159static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid)
160{
161	struct amd_iommu_pci_seg *pci_seg;
162
163	for_each_pci_segment(pci_seg) {
164		if (pci_seg->id == seg)
165			return pci_seg->rlookup_table[devid];
166	}
167	return NULL;
168}
169
170static struct amd_iommu *rlookup_amd_iommu(struct device *dev)
171{
172	u16 seg = get_device_segment(dev);
173	int devid = get_device_sbdf_id(dev);
174
175	if (devid < 0)
176		return NULL;
177	return __rlookup_amd_iommu(seg, PCI_SBDF_TO_DEVID(devid));
178}
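
/*
 * Worked example, assuming sbdf packs the PCI segment into bits 31:16 and
 * the 16-bit bus/device/function ID into bits 15:0: a device at 0000:01:02.3
 * yields sbdf = 0x00000113, so PCI_SBDF_TO_SEGID() returns 0 and
 * PCI_SBDF_TO_DEVID() returns 0x0113, which indexes the per-segment
 * rlookup_table above.
 */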
179
180static struct protection_domain *to_pdomain(struct iommu_domain *dom)
181{
182	return container_of(dom, struct protection_domain, domain);
183}
184
185static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid)
186{
187	struct iommu_dev_data *dev_data;
188	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
189
190	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
191	if (!dev_data)
192		return NULL;
193
194	spin_lock_init(&dev_data->lock);
195	dev_data->devid = devid;
196	ratelimit_default_init(&dev_data->rs);
197
198	llist_add(&dev_data->dev_data_list, &pci_seg->dev_data_list);
199	return dev_data;
200}
201
202static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid)
203{
204	struct iommu_dev_data *dev_data;
205	struct llist_node *node;
206	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
207
208	if (llist_empty(&pci_seg->dev_data_list))
209		return NULL;
210
211	node = pci_seg->dev_data_list.first;
212	llist_for_each_entry(dev_data, node, dev_data_list) {
213		if (dev_data->devid == devid)
214			return dev_data;
215	}
216
217	return NULL;
218}
219
220static int clone_alias(struct pci_dev *pdev, u16 alias, void *data)
221{
222	struct amd_iommu *iommu;
223	struct dev_table_entry *dev_table;
224	u16 devid = pci_dev_id(pdev);
225
226	if (devid == alias)
227		return 0;
228
229	iommu = rlookup_amd_iommu(&pdev->dev);
230	if (!iommu)
231		return 0;
232
233	amd_iommu_set_rlookup_table(iommu, alias);
234	dev_table = get_dev_table(iommu);
235	memcpy(dev_table[alias].data,
236	       dev_table[devid].data,
237	       sizeof(dev_table[alias].data));
238
239	return 0;
240}
241
242static void clone_aliases(struct amd_iommu *iommu, struct device *dev)
243{
244	struct pci_dev *pdev;
245
246	if (!dev_is_pci(dev))
247		return;
248	pdev = to_pci_dev(dev);
249
250	/*
251	 * The IVRS alias stored in the alias table may not be
252	 * part of the PCI DMA aliases if its bus differs
253	 * from that of the original device.
254	 */
255	clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], NULL);
256
257	pci_for_each_dma_alias(pdev, clone_alias, NULL);
258}
259
260static void setup_aliases(struct amd_iommu *iommu, struct device *dev)
261{
262	struct pci_dev *pdev = to_pci_dev(dev);
263	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
264	u16 ivrs_alias;
265
266	/* For ACPI HID devices, there are no aliases */
267	if (!dev_is_pci(dev))
268		return;
269
270	/*
271	 * Add the IVRS alias to the pci aliases if it is on the same
272	 * bus. The IVRS table may know about a quirk that we don't.
273	 */
274	ivrs_alias = pci_seg->alias_table[pci_dev_id(pdev)];
275	if (ivrs_alias != pci_dev_id(pdev) &&
276	    PCI_BUS_NUM(ivrs_alias) == pdev->bus->number)
277		pci_add_dma_alias(pdev, ivrs_alias & 0xff, 1);
278
279	clone_aliases(iommu, dev);
280}
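
/*
 * Worked example with hypothetical IDs: if the IVRS table reports alias
 * 0x00a4 (00:14.4) for devid 0x00a0 (00:14.0), both live on bus 0x00, so
 * pci_add_dma_alias() is called with devfn 0xa4 and clone_aliases() copies
 * the DTE of 00:14.0 over the alias entry as well.
 */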
281
282static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid)
283{
284	struct iommu_dev_data *dev_data;
285
286	dev_data = search_dev_data(iommu, devid);
287
288	if (dev_data == NULL) {
289		dev_data = alloc_dev_data(iommu, devid);
290		if (!dev_data)
291			return NULL;
292
293		if (translation_pre_enabled(iommu))
294			dev_data->defer_attach = true;
295	}
296
297	return dev_data;
298}
299
300/*
301 * Find or create an IOMMU group for an acpihid device.
302 */
303static struct iommu_group *acpihid_device_group(struct device *dev)
304{
305	struct acpihid_map_entry *p, *entry = NULL;
306	int devid;
307
308	devid = get_acpihid_device_id(dev, &entry);
309	if (devid < 0)
310		return ERR_PTR(devid);
311
312	list_for_each_entry(p, &acpihid_map, list) {
313		if ((devid == p->devid) && p->group)
314			entry->group = p->group;
315	}
316
317	if (!entry->group)
318		entry->group = generic_device_group(dev);
319	else
320		iommu_group_ref_get(entry->group);
321
322	return entry->group;
323}
324
325static bool pci_iommuv2_capable(struct pci_dev *pdev)
326{
327	static const int caps[] = {
328		PCI_EXT_CAP_ID_PRI,
329		PCI_EXT_CAP_ID_PASID,
330	};
331	int i, pos;
332
333	if (!pci_ats_supported(pdev))
334		return false;
335
336	for (i = 0; i < ARRAY_SIZE(caps); ++i) {
337		pos = pci_find_ext_capability(pdev, caps[i]);
338		if (pos == 0)
339			return false;
340	}
341
342	return true;
343}
344
345/*
346 * This function checks if the driver got a valid device from the caller to
347 * avoid dereferencing invalid pointers.
348 */
349static bool check_device(struct device *dev)
350{
351	struct amd_iommu_pci_seg *pci_seg;
352	struct amd_iommu *iommu;
353	int devid, sbdf;
354
355	if (!dev)
356		return false;
357
358	sbdf = get_device_sbdf_id(dev);
359	if (sbdf < 0)
360		return false;
361	devid = PCI_SBDF_TO_DEVID(sbdf);
362
363	iommu = rlookup_amd_iommu(dev);
364	if (!iommu)
365		return false;
366
367	/* Out of our scope? */
368	pci_seg = iommu->pci_seg;
369	if (devid > pci_seg->last_bdf)
370		return false;
371
372	return true;
373}
374
375static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
376{
377	struct iommu_dev_data *dev_data;
378	int devid, sbdf;
379
380	if (dev_iommu_priv_get(dev))
381		return 0;
382
383	sbdf = get_device_sbdf_id(dev);
384	if (sbdf < 0)
385		return sbdf;
386
387	devid = PCI_SBDF_TO_DEVID(sbdf);
388	dev_data = find_dev_data(iommu, devid);
389	if (!dev_data)
390		return -ENOMEM;
391
392	dev_data->dev = dev;
393	setup_aliases(iommu, dev);
394
395	/*
396	 * By default we use passthrough mode for IOMMUv2-capable devices.
397	 * But if amd_iommu=force_isolation is set (e.g. to debug DMA to an
398	 * invalid address), we ignore the capability for the device so
399	 * it'll be forced to go into translation mode.
400	 */
401	if ((iommu_default_passthrough() || !amd_iommu_force_isolation) &&
402	    dev_is_pci(dev) && pci_iommuv2_capable(to_pci_dev(dev))) {
403		dev_data->iommu_v2 = iommu->is_iommu_v2;
404	}
405
406	dev_iommu_priv_set(dev, dev_data);
407
408	return 0;
409}
410
411static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev)
412{
413	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
414	struct dev_table_entry *dev_table = get_dev_table(iommu);
415	int devid, sbdf;
416
417	sbdf = get_device_sbdf_id(dev);
418	if (sbdf < 0)
419		return;
420
421	devid = PCI_SBDF_TO_DEVID(sbdf);
422	pci_seg->rlookup_table[devid] = NULL;
423	memset(&dev_table[devid], 0, sizeof(struct dev_table_entry));
424
425	setup_aliases(iommu, dev);
426}
427
428static void amd_iommu_uninit_device(struct device *dev)
429{
430	struct iommu_dev_data *dev_data;
431
432	dev_data = dev_iommu_priv_get(dev);
433	if (!dev_data)
434		return;
435
436	if (dev_data->domain)
437		detach_device(dev);
438
439	dev_iommu_priv_set(dev, NULL);
440
441	/*
442	 * We keep dev_data around for unplugged devices and reuse it when the
443	 * device is re-plugged - not doing so would introduce a ton of races.
444	 */
445}
446
447/****************************************************************************
448 *
449 * Interrupt handling functions
450 *
451 ****************************************************************************/
452
453static void dump_dte_entry(struct amd_iommu *iommu, u16 devid)
454{
455	int i;
456	struct dev_table_entry *dev_table = get_dev_table(iommu);
457
458	for (i = 0; i < 4; ++i)
459		pr_err("DTE[%d]: %016llx\n", i, dev_table[devid].data[i]);
460}
461
462static void dump_command(unsigned long phys_addr)
463{
464	struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
465	int i;
466
467	for (i = 0; i < 4; ++i)
468		pr_err("CMD[%d]: %08x\n", i, cmd->data[i]);
469}
470
471static void amd_iommu_report_rmp_hw_error(struct amd_iommu *iommu, volatile u32 *event)
472{
473	struct iommu_dev_data *dev_data = NULL;
474	int devid, vmg_tag, flags;
475	struct pci_dev *pdev;
476	u64 spa;
477
478	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
479	vmg_tag = (event[1]) & 0xFFFF;
480	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
481	spa     = ((u64)event[3] << 32) | (event[2] & 0xFFFFFFF8);
482
483	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
484					   devid & 0xff);
485	if (pdev)
486		dev_data = dev_iommu_priv_get(&pdev->dev);
487
488	if (dev_data) {
489		if (__ratelimit(&dev_data->rs)) {
490			pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
491				vmg_tag, spa, flags);
492		}
493	} else {
494		pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
495			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
496			vmg_tag, spa, flags);
497	}
498
499	if (pdev)
500		pci_dev_put(pdev);
501}
502
503static void amd_iommu_report_rmp_fault(struct amd_iommu *iommu, volatile u32 *event)
504{
505	struct iommu_dev_data *dev_data = NULL;
506	int devid, flags_rmp, vmg_tag, flags;
507	struct pci_dev *pdev;
508	u64 gpa;
509
510	devid     = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
511	flags_rmp = (event[0] >> EVENT_FLAGS_SHIFT) & 0xFF;
512	vmg_tag   = (event[1]) & 0xFFFF;
513	flags     = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
514	gpa       = ((u64)event[3] << 32) | event[2];
515
516	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
517					   devid & 0xff);
518	if (pdev)
519		dev_data = dev_iommu_priv_get(&pdev->dev);
520
521	if (dev_data) {
522		if (__ratelimit(&dev_data->rs)) {
523			pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
524				vmg_tag, gpa, flags_rmp, flags);
525		}
526	} else {
527		pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
528			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
529			vmg_tag, gpa, flags_rmp, flags);
530	}
531
532	if (pdev)
533		pci_dev_put(pdev);
534}
535
536#define IS_IOMMU_MEM_TRANSACTION(flags)		\
537	(((flags) & EVENT_FLAG_I) == 0)
538
539#define IS_WRITE_REQUEST(flags)			\
540	((flags) & EVENT_FLAG_RW)
541
542static void amd_iommu_report_page_fault(struct amd_iommu *iommu,
543					u16 devid, u16 domain_id,
544					u64 address, int flags)
545{
546	struct iommu_dev_data *dev_data = NULL;
547	struct pci_dev *pdev;
548
549	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
550					   devid & 0xff);
551	if (pdev)
552		dev_data = dev_iommu_priv_get(&pdev->dev);
553
554	if (dev_data) {
555		/*
556		 * If this is a DMA fault (for which the I(nterrupt)
557		 * bit will be unset), allow report_iommu_fault() to
558		 * prevent logging it.
559		 */
560		if (IS_IOMMU_MEM_TRANSACTION(flags)) {
561			/* Device not attached to domain properly */
562			if (dev_data->domain == NULL) {
563				pr_err_ratelimited("Event logged [Device not attached to domain properly]\n");
564				pr_err_ratelimited("  device=%04x:%02x:%02x.%x domain=0x%04x\n",
565						   iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
566						   PCI_FUNC(devid), domain_id);
567				goto out;
568			}
569
570			if (!report_iommu_fault(&dev_data->domain->domain,
571						&pdev->dev, address,
572						IS_WRITE_REQUEST(flags) ?
573							IOMMU_FAULT_WRITE :
574							IOMMU_FAULT_READ))
575				goto out;
576		}
577
578		if (__ratelimit(&dev_data->rs)) {
579			pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n",
580				domain_id, address, flags);
581		}
582	} else {
583		pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%04x:%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n",
584			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
585			domain_id, address, flags);
586	}
587
588out:
589	if (pdev)
590		pci_dev_put(pdev);
591}
592
593static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
594{
595	struct device *dev = iommu->iommu.dev;
596	int type, devid, flags, tag;
597	volatile u32 *event = __evt;
598	int count = 0;
599	u64 address;
600	u32 pasid;
601
602retry:
603	type    = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
604	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
605	pasid   = (event[0] & EVENT_DOMID_MASK_HI) |
606		  (event[1] & EVENT_DOMID_MASK_LO);
607	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
608	address = (u64)(((u64)event[3]) << 32) | event[2];
609
610	if (type == 0) {
611		/* Did we hit the erratum? */
612		if (++count == LOOP_TIMEOUT) {
613			pr_err("No event written to event log\n");
614			return;
615		}
616		udelay(1);
617		goto retry;
618	}
619
620	if (type == EVENT_TYPE_IO_FAULT) {
621		amd_iommu_report_page_fault(iommu, devid, pasid, address, flags);
622		return;
623	}
624
625	switch (type) {
626	case EVENT_TYPE_ILL_DEV:
627		dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
628			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
629			pasid, address, flags);
630		dump_dte_entry(iommu, devid);
631		break;
632	case EVENT_TYPE_DEV_TAB_ERR:
633		dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x "
634			"address=0x%llx flags=0x%04x]\n",
635			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
636			address, flags);
637		break;
638	case EVENT_TYPE_PAGE_TAB_ERR:
639		dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x pasid=0x%04x address=0x%llx flags=0x%04x]\n",
640			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
641			pasid, address, flags);
642		break;
643	case EVENT_TYPE_ILL_CMD:
644		dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address);
645		dump_command(address);
646		break;
647	case EVENT_TYPE_CMD_HARD_ERR:
648		dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n",
649			address, flags);
650		break;
651	case EVENT_TYPE_IOTLB_INV_TO:
652		dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%04x:%02x:%02x.%x address=0x%llx]\n",
653			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
654			address);
655		break;
656	case EVENT_TYPE_INV_DEV_REQ:
657		dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
658			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
659			pasid, address, flags);
660		break;
661	case EVENT_TYPE_RMP_FAULT:
662		amd_iommu_report_rmp_fault(iommu, event);
663		break;
664	case EVENT_TYPE_RMP_HW_ERR:
665		amd_iommu_report_rmp_hw_error(iommu, event);
666		break;
667	case EVENT_TYPE_INV_PPR_REQ:
668		pasid = PPR_PASID(*((u64 *)__evt));
669		tag = event[1] & 0x03FF;
670		dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n",
671			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
672			pasid, address, flags, tag);
673		break;
674	default:
675		dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n",
676			event[0], event[1], event[2], event[3]);
677	}
678
679	/*
680	 * To detect hardware erratum 732 we need to clear the entry
681	 * back to zero. This issue does not exist on SNP-enabled
682	 * systems; the event buffer is also not writable on
683	 * SNP-enabled systems.
684	 */
685	if (!amd_iommu_snp_en)
686		memset(__evt, 0, 4 * sizeof(u32));
687}
688
689static void iommu_poll_events(struct amd_iommu *iommu)
690{
691	u32 head, tail;
692
693	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
694	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
695
696	while (head != tail) {
697		iommu_print_event(iommu, iommu->evt_buf + head);
698		head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
699	}
700
701	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
702}
703
704static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
705{
706	struct amd_iommu_fault fault;
707
708	if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) {
709		pr_err_ratelimited("Unknown PPR request received\n");
710		return;
711	}
712
713	fault.address   = raw[1];
714	fault.pasid     = PPR_PASID(raw[0]);
715	fault.sbdf      = PCI_SEG_DEVID_TO_SBDF(iommu->pci_seg->id, PPR_DEVID(raw[0]));
716	fault.tag       = PPR_TAG(raw[0]);
717	fault.flags     = PPR_FLAGS(raw[0]);
718
719	atomic_notifier_call_chain(&ppr_notifier, 0, &fault);
720}
721
722static void iommu_poll_ppr_log(struct amd_iommu *iommu)
723{
724	u32 head, tail;
725
726	if (iommu->ppr_log == NULL)
727		return;
728
729	head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
730	tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
731
732	while (head != tail) {
733		volatile u64 *raw;
734		u64 entry[2];
735		int i;
736
737		raw = (u64 *)(iommu->ppr_log + head);
738
739		/*
740		 * Hardware bug: Interrupt may arrive before the entry is
741		 * written to memory. If this happens, we need to wait for the
742		 * entry to arrive.
743		 */
744		for (i = 0; i < LOOP_TIMEOUT; ++i) {
745			if (PPR_REQ_TYPE(raw[0]) != 0)
746				break;
747			udelay(1);
748		}
749
750		/* Avoid memcpy function-call overhead */
751		entry[0] = raw[0];
752		entry[1] = raw[1];
753
754		/*
755		 * To detect hardware erratum 733 we need to clear the entry
756		 * back to zero. This issue does not exist on SNP-enabled
757		 * systems; the PPR log is also not writable on SNP-enabled
758		 * systems.
759		 */
760		if (!amd_iommu_snp_en)
761			raw[0] = raw[1] = 0UL;
762
763		/* Update head pointer of hardware ring-buffer */
764		head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
765		writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
766
767		/* Handle PPR entry */
768		iommu_handle_ppr_entry(iommu, entry);
769
770		/* Refresh ring-buffer information */
771		head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
772		tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
773	}
774}
775
776#ifdef CONFIG_IRQ_REMAP
777static int (*iommu_ga_log_notifier)(u32);
778
779int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
780{
781	iommu_ga_log_notifier = notifier;
782
783	return 0;
784}
785EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
786
787static void iommu_poll_ga_log(struct amd_iommu *iommu)
788{
789	u32 head, tail;
790
791	if (iommu->ga_log == NULL)
792		return;
793
794	head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
795	tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
796
797	while (head != tail) {
798		volatile u64 *raw;
799		u64 log_entry;
800
801		raw = (u64 *)(iommu->ga_log + head);
802
803		/* Avoid memcpy function-call overhead */
804		log_entry = *raw;
805
806		/* Update head pointer of hardware ring-buffer */
807		head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
808		writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
809
810		/* Handle GA entry */
811		switch (GA_REQ_TYPE(log_entry)) {
812		case GA_GUEST_NR:
813			if (!iommu_ga_log_notifier)
814				break;
815
816			pr_debug("%s: devid=%#x, ga_tag=%#x\n",
817				 __func__, GA_DEVID(log_entry),
818				 GA_TAG(log_entry));
819
820			if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
821				pr_err("GA log notifier failed.\n");
822			break;
823		default:
824			break;
825		}
826	}
827}
828
829static void
830amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu)
831{
832	if (!irq_remapping_enabled || !dev_is_pci(dev) ||
833	    !pci_dev_has_default_msi_parent_domain(to_pci_dev(dev)))
834		return;
835
836	dev_set_msi_domain(dev, iommu->ir_domain);
837}
838
839#else /* CONFIG_IRQ_REMAP */
840static inline void
841amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { }
842#endif /* !CONFIG_IRQ_REMAP */
843
844static void amd_iommu_handle_irq(void *data, const char *evt_type,
845				 u32 int_mask, u32 overflow_mask,
846				 void (*int_handler)(struct amd_iommu *),
847				 void (*overflow_handler)(struct amd_iommu *))
848{
849	struct amd_iommu *iommu = (struct amd_iommu *) data;
850	u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
851	u32 mask = int_mask | overflow_mask;
852
853	while (status & mask) {
854		/* Enable interrupt sources again */
855		writel(mask, iommu->mmio_base + MMIO_STATUS_OFFSET);
856
857		if (int_handler) {
858			pr_devel("Processing IOMMU (ivhd%d) %s Log\n",
859				 iommu->index, evt_type);
860			int_handler(iommu);
861		}
862
863		if ((status & overflow_mask) && overflow_handler)
864			overflow_handler(iommu);
865
866		/*
867		 * Hardware bug: ERBT1312
868		 * When re-enabling the interrupt (by writing 1
869		 * to clear the bit), the hardware might also try to set
870		 * the interrupt bit in the event status register.
871		 * In this scenario, the bit stays set and disables
872		 * subsequent interrupts.
873		 *
874		 * Workaround: The IOMMU driver should read back the
875		 * status register and check if the interrupt bits are cleared.
876		 * If not, the driver needs to go through the interrupt handler
877		 * again and re-clear the bits.
878		 */
879		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
880	}
881}
882
883irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data)
884{
885	amd_iommu_handle_irq(data, "Evt", MMIO_STATUS_EVT_INT_MASK,
886			     MMIO_STATUS_EVT_OVERFLOW_MASK,
887			     iommu_poll_events, amd_iommu_restart_event_logging);
888
889	return IRQ_HANDLED;
890}
891
892irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data)
893{
894	amd_iommu_handle_irq(data, "PPR", MMIO_STATUS_PPR_INT_MASK,
895			     MMIO_STATUS_PPR_OVERFLOW_MASK,
896			     iommu_poll_ppr_log, amd_iommu_restart_ppr_log);
897
898	return IRQ_HANDLED;
899}
900
901irqreturn_t amd_iommu_int_thread_galog(int irq, void *data)
902{
903#ifdef CONFIG_IRQ_REMAP
904	amd_iommu_handle_irq(data, "GA", MMIO_STATUS_GALOG_INT_MASK,
905			     MMIO_STATUS_GALOG_OVERFLOW_MASK,
906			     iommu_poll_ga_log, amd_iommu_restart_ga_log);
907#endif
908
909	return IRQ_HANDLED;
910}
911
912irqreturn_t amd_iommu_int_thread(int irq, void *data)
913{
914	amd_iommu_int_thread_evtlog(irq, data);
915	amd_iommu_int_thread_pprlog(irq, data);
916	amd_iommu_int_thread_galog(irq, data);
917
918	return IRQ_HANDLED;
919}
920
921irqreturn_t amd_iommu_int_handler(int irq, void *data)
922{
923	return IRQ_WAKE_THREAD;
924}
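
/*
 * Sketch of how these handlers are wired up (the actual registration lives
 * in the IOMMU init code, not in this file): the hard handler merely wakes
 * the thread, and the threaded handler drains whichever logs have their
 * status bits set, e.g.:
 *
 *	request_threaded_irq(irq, amd_iommu_int_handler,
 *			     amd_iommu_int_thread, 0, "AMD-Vi", iommu);
 */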
925
926/****************************************************************************
927 *
928 * IOMMU command queuing functions
929 *
930 ****************************************************************************/
931
932static int wait_on_sem(struct amd_iommu *iommu, u64 data)
933{
934	int i = 0;
935
936	while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) {
937		udelay(1);
938		i += 1;
939	}
940
941	if (i == LOOP_TIMEOUT) {
942		pr_alert("Completion-Wait loop timed out\n");
943		return -EIO;
944	}
945
946	return 0;
947}
948
949static void copy_cmd_to_buffer(struct amd_iommu *iommu,
950			       struct iommu_cmd *cmd)
951{
952	u8 *target;
953	u32 tail;
954
955	/* Copy command to buffer */
956	tail = iommu->cmd_buf_tail;
957	target = iommu->cmd_buf + tail;
958	memcpy(target, cmd, sizeof(*cmd));
959
960	tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
961	iommu->cmd_buf_tail = tail;
962
963	/* Tell the IOMMU about it */
964	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
965}
966
967static void build_completion_wait(struct iommu_cmd *cmd,
968				  struct amd_iommu *iommu,
969				  u64 data)
970{
971	u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem);
972
973	memset(cmd, 0, sizeof(*cmd));
974	cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
975	cmd->data[1] = upper_32_bits(paddr);
976	cmd->data[2] = lower_32_bits(data);
977	cmd->data[3] = upper_32_bits(data);
978	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
979}
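
/*
 * The completion-wait command built above carries the physical address of
 * iommu->cmd_sem, the 64-bit value 'data' and the store flag, so the IOMMU
 * writes 'data' to the semaphore once all preceding commands have finished;
 * wait_on_sem() then polls *iommu->cmd_sem until that value shows up or the
 * loop times out.
 */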
980
981static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
982{
983	memset(cmd, 0, sizeof(*cmd));
984	cmd->data[0] = devid;
985	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
986}
987
988/*
989 * Builds an invalidation address which is suitable for one page or multiple
990 * pages. Sets the size bit (S) as needed if more than one page is flushed.
991 */
992static inline u64 build_inv_address(u64 address, size_t size)
993{
994	u64 pages, end, msb_diff;
995
996	pages = iommu_num_pages(address, size, PAGE_SIZE);
997
998	if (pages == 1)
999		return address & PAGE_MASK;
1000
1001	end = address + size - 1;
1002
1003	/*
1004	 * msb_diff holds the index of the most significant bit that
1005	 * differs between the start and the end address.
1006	 */
1007	msb_diff = fls64(end ^ address) - 1;
1008
1009	/*
1010	 * Bits 63:52 are sign extended. If for some reason bit 51 is different
1011	 * between the start and the end, invalidate everything.
1012	 */
1013	if (unlikely(msb_diff > 51)) {
1014		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
1015	} else {
1016		/*
1017		 * The msb-bit must be clear on the address. Just set all the
1018		 * lower bits.
1019		 */
1020		address |= (1ull << msb_diff) - 1;
1021	}
1022
1023	/* Clear bits 11:0 */
1024	address &= PAGE_MASK;
1025
1026	/* Set the size bit - we flush more than one 4kb page */
1027	return address | CMD_INV_IOMMU_PAGES_SIZE_MASK;
1028}
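
/*
 * Worked example of the encoding above: flushing three 4k pages starting at
 * 0x10000 gives end = 0x12fff, so the highest differing bit is bit 13.
 * Setting all bits below it yields 0x11fff, masking the page offset gives
 * 0x11000, and OR-ing in the size bit asks the IOMMU to invalidate the
 * naturally aligned 16k region 0x10000-0x13fff, a superset of the
 * requested range.
 */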
1029
1030static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
1031				  size_t size, u16 domid, int pde)
1032{
1033	u64 inv_address = build_inv_address(address, size);
1034
1035	memset(cmd, 0, sizeof(*cmd));
1036	cmd->data[1] |= domid;
1037	cmd->data[2]  = lower_32_bits(inv_address);
1038	cmd->data[3]  = upper_32_bits(inv_address);
1039	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
1040	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
1041		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
1042}
1043
1044static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
1045				  u64 address, size_t size)
1046{
1047	u64 inv_address = build_inv_address(address, size);
1048
1049	memset(cmd, 0, sizeof(*cmd));
1050	cmd->data[0]  = devid;
1051	cmd->data[0] |= (qdep & 0xff) << 24;
1052	cmd->data[1]  = devid;
1053	cmd->data[2]  = lower_32_bits(inv_address);
1054	cmd->data[3]  = upper_32_bits(inv_address);
1055	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
1056}
1057
1058static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, u32 pasid,
1059				  u64 address, bool size)
1060{
1061	memset(cmd, 0, sizeof(*cmd));
1062
1063	address &= ~(0xfffULL);
1064
1065	cmd->data[0]  = pasid;
1066	cmd->data[1]  = domid;
1067	cmd->data[2]  = lower_32_bits(address);
1068	cmd->data[3]  = upper_32_bits(address);
1069	cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
1070	cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1071	if (size)
1072		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
1073	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
1074}
1075
1076static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid,
1077				  int qdep, u64 address, bool size)
1078{
1079	memset(cmd, 0, sizeof(*cmd));
1080
1081	address &= ~(0xfffULL);
1082
1083	cmd->data[0]  = devid;
1084	cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
1085	cmd->data[0] |= (qdep  & 0xff) << 24;
1086	cmd->data[1]  = devid;
1087	cmd->data[1] |= (pasid & 0xff) << 16;
1088	cmd->data[2]  = lower_32_bits(address);
1089	cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1090	cmd->data[3]  = upper_32_bits(address);
1091	if (size)
1092		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
1093	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
1094}
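
/*
 * Worked example of the PASID split above: for pasid 0x1234 the high byte
 * 0x12 lands in bits 23:16 of data[0] and the low byte 0x34 in bits 23:16
 * of data[1], next to the device ID and queue depth already encoded there.
 */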
1095
1096static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid,
1097			       int status, int tag, bool gn)
1098{
1099	memset(cmd, 0, sizeof(*cmd));
1100
1101	cmd->data[0]  = devid;
1102	if (gn) {
1103		cmd->data[1]  = pasid;
1104		cmd->data[2]  = CMD_INV_IOMMU_PAGES_GN_MASK;
1105	}
1106	cmd->data[3]  = tag & 0x1ff;
1107	cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
1108
1109	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
1110}
1111
1112static void build_inv_all(struct iommu_cmd *cmd)
1113{
1114	memset(cmd, 0, sizeof(*cmd));
1115	CMD_SET_TYPE(cmd, CMD_INV_ALL);
1116}
1117
1118static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
1119{
1120	memset(cmd, 0, sizeof(*cmd));
1121	cmd->data[0] = devid;
1122	CMD_SET_TYPE(cmd, CMD_INV_IRT);
1123}
1124
1125/*
1126 * Writes the command to the IOMMU's command buffer and informs the
1127 * hardware about the new command.
1128 */
1129static int __iommu_queue_command_sync(struct amd_iommu *iommu,
1130				      struct iommu_cmd *cmd,
1131				      bool sync)
1132{
1133	unsigned int count = 0;
1134	u32 left, next_tail;
1135
1136	next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1137again:
1138	left      = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
1139
1140	if (left <= 0x20) {
1141		/* Skip udelay() the first time around */
1142		if (count++) {
1143			if (count == LOOP_TIMEOUT) {
1144				pr_err("Command buffer timeout\n");
1145				return -EIO;
1146			}
1147
1148			udelay(1);
1149		}
1150
1151		/* Update head and recheck remaining space */
1152		iommu->cmd_buf_head = readl(iommu->mmio_base +
1153					    MMIO_CMD_HEAD_OFFSET);
1154
1155		goto again;
1156	}
1157
1158	copy_cmd_to_buffer(iommu, cmd);
1159
1160	/* Do we need to make sure all commands are processed? */
1161	iommu->need_sync = sync;
1162
1163	return 0;
1164}
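
/*
 * Worked example of the free-space check above, assuming the usual 8k
 * command buffer and 16-byte commands: with cmd_buf_head == 0x0 and
 * cmd_buf_tail == 0x10, next_tail is 0x20 and left is (0x0 - 0x20) %
 * 0x2000 = 0x1fe0, so the command is copied straight away.  Only when
 * left <= 0x20 (two or fewer free slots before the tail would overrun
 * the head) is the hardware head pointer re-read and the check retried.
 */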
1165
1166static int iommu_queue_command_sync(struct amd_iommu *iommu,
1167				    struct iommu_cmd *cmd,
1168				    bool sync)
1169{
1170	unsigned long flags;
1171	int ret;
1172
1173	raw_spin_lock_irqsave(&iommu->lock, flags);
1174	ret = __iommu_queue_command_sync(iommu, cmd, sync);
1175	raw_spin_unlock_irqrestore(&iommu->lock, flags);
1176
1177	return ret;
1178}
1179
1180static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
1181{
1182	return iommu_queue_command_sync(iommu, cmd, true);
1183}
1184
1185/*
1186 * This function queues a completion wait command into the command
1187 * buffer of an IOMMU
1188 */
1189static int iommu_completion_wait(struct amd_iommu *iommu)
1190{
1191	struct iommu_cmd cmd;
1192	unsigned long flags;
1193	int ret;
1194	u64 data;
1195
1196	if (!iommu->need_sync)
1197		return 0;
1198
1199	data = atomic64_add_return(1, &iommu->cmd_sem_val);
1200	build_completion_wait(&cmd, iommu, data);
1201
1202	raw_spin_lock_irqsave(&iommu->lock, flags);
1203
1204	ret = __iommu_queue_command_sync(iommu, &cmd, false);
1205	if (ret)
1206		goto out_unlock;
1207
1208	ret = wait_on_sem(iommu, data);
1209
1210out_unlock:
1211	raw_spin_unlock_irqrestore(&iommu->lock, flags);
1212
1213	return ret;
1214}
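
/*
 * Note on need_sync: commands queued through iommu_queue_command() mark the
 * IOMMU as needing a sync, and the first iommu_completion_wait() afterwards
 * issues a single COMPLETION_WAIT for the whole batch (queueing it with
 * sync == false clears the flag); further calls return immediately until
 * another command is queued.
 */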
1215
1216static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
1217{
1218	struct iommu_cmd cmd;
1219
1220	build_inv_dte(&cmd, devid);
1221
1222	return iommu_queue_command(iommu, &cmd);
1223}
1224
1225static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
1226{
1227	u32 devid;
1228	u16 last_bdf = iommu->pci_seg->last_bdf;
1229
1230	for (devid = 0; devid <= last_bdf; ++devid)
1231		iommu_flush_dte(iommu, devid);
1232
1233	iommu_completion_wait(iommu);
1234}
1235
1236/*
1237 * This function uses heavy locking and may disable irqs for some time. But
1238 * this is no issue because it is only called during resume.
1239 */
1240static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu)
1241{
1242	u32 dom_id;
1243	u16 last_bdf = iommu->pci_seg->last_bdf;
1244
1245	for (dom_id = 0; dom_id <= last_bdf; ++dom_id) {
1246		struct iommu_cmd cmd;
1247		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1248				      dom_id, 1);
1249		iommu_queue_command(iommu, &cmd);
1250	}
1251
1252	iommu_completion_wait(iommu);
1253}
1254
1255static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
1256{
1257	struct iommu_cmd cmd;
1258
1259	build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1260			      dom_id, 1);
1261	iommu_queue_command(iommu, &cmd);
1262
1263	iommu_completion_wait(iommu);
1264}
1265
1266static void amd_iommu_flush_all(struct amd_iommu *iommu)
1267{
1268	struct iommu_cmd cmd;
1269
1270	build_inv_all(&cmd);
1271
1272	iommu_queue_command(iommu, &cmd);
1273	iommu_completion_wait(iommu);
1274}
1275
1276static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
1277{
1278	struct iommu_cmd cmd;
1279
1280	build_inv_irt(&cmd, devid);
1281
1282	iommu_queue_command(iommu, &cmd);
1283}
1284
1285static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
1286{
1287	u32 devid;
1288	u16 last_bdf = iommu->pci_seg->last_bdf;
1289
1290	if (iommu->irtcachedis_enabled)
1291		return;
1292
1293	for (devid = 0; devid <= last_bdf; devid++)
1294		iommu_flush_irt(iommu, devid);
1295
1296	iommu_completion_wait(iommu);
1297}
1298
1299void iommu_flush_all_caches(struct amd_iommu *iommu)
1300{
1301	if (iommu_feature(iommu, FEATURE_IA)) {
1302		amd_iommu_flush_all(iommu);
1303	} else {
1304		amd_iommu_flush_dte_all(iommu);
1305		amd_iommu_flush_irt_all(iommu);
1306		amd_iommu_flush_tlb_all(iommu);
1307	}
1308}
1309
1310/*
1311 * Command send function for flushing on-device TLB
1312 */
1313static int device_flush_iotlb(struct iommu_dev_data *dev_data,
1314			      u64 address, size_t size)
1315{
1316	struct amd_iommu *iommu;
1317	struct iommu_cmd cmd;
1318	int qdep;
1319
1320	qdep     = dev_data->ats.qdep;
1321	iommu    = rlookup_amd_iommu(dev_data->dev);
1322	if (!iommu)
1323		return -EINVAL;
1324
1325	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size);
1326
1327	return iommu_queue_command(iommu, &cmd);
1328}
1329
1330static int device_flush_dte_alias(struct pci_dev *pdev, u16 alias, void *data)
1331{
1332	struct amd_iommu *iommu = data;
1333
1334	return iommu_flush_dte(iommu, alias);
1335}
1336
1337/*
1338 * Command send function for invalidating a device table entry
1339 */
1340static int device_flush_dte(struct iommu_dev_data *dev_data)
1341{
1342	struct amd_iommu *iommu;
1343	struct pci_dev *pdev = NULL;
1344	struct amd_iommu_pci_seg *pci_seg;
1345	u16 alias;
1346	int ret;
1347
1348	iommu = rlookup_amd_iommu(dev_data->dev);
1349	if (!iommu)
1350		return -EINVAL;
1351
1352	if (dev_is_pci(dev_data->dev))
1353		pdev = to_pci_dev(dev_data->dev);
1354
1355	if (pdev)
1356		ret = pci_for_each_dma_alias(pdev,
1357					     device_flush_dte_alias, iommu);
1358	else
1359		ret = iommu_flush_dte(iommu, dev_data->devid);
1360	if (ret)
1361		return ret;
1362
1363	pci_seg = iommu->pci_seg;
1364	alias = pci_seg->alias_table[dev_data->devid];
1365	if (alias != dev_data->devid) {
1366		ret = iommu_flush_dte(iommu, alias);
1367		if (ret)
1368			return ret;
1369	}
1370
1371	if (dev_data->ats.enabled)
1372		ret = device_flush_iotlb(dev_data, 0, ~0UL);
1373
1374	return ret;
1375}
1376
1377/*
1378 * TLB invalidation function which is called from the mapping functions.
1379 * It invalidates a single PTE if the range to flush is within a single
1380 * page. Otherwise it flushes the whole TLB of the IOMMU.
1381 */
1382static void __domain_flush_pages(struct protection_domain *domain,
1383				 u64 address, size_t size, int pde)
1384{
1385	struct iommu_dev_data *dev_data;
1386	struct iommu_cmd cmd;
1387	int ret = 0, i;
1388
1389	build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
1390
1391	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
1392		if (!domain->dev_iommu[i])
1393			continue;
1394
1395		/*
1396		 * Devices of this domain are behind this IOMMU;
1397		 * we need a TLB flush.
1398		 */
1399		ret |= iommu_queue_command(amd_iommus[i], &cmd);
1400	}
1401
1402	list_for_each_entry(dev_data, &domain->dev_list, list) {
1403
1404		if (!dev_data->ats.enabled)
1405			continue;
1406
1407		ret |= device_flush_iotlb(dev_data, address, size);
1408	}
1409
1410	WARN_ON(ret);
1411}
1412
1413static void domain_flush_pages(struct protection_domain *domain,
1414			       u64 address, size_t size, int pde)
1415{
1416	if (likely(!amd_iommu_np_cache)) {
1417		__domain_flush_pages(domain, address, size, pde);
1418		return;
1419	}
1420
1421	/*
1422	 * When NpCache is on, we infer that we run in a VM and use a vIOMMU.
1423	 * In such setups it is best to avoid flushes of ranges which are not
1424	 * naturally aligned, since it would lead to flushes of unmodified
1425	 * PTEs. Such flushes would require the hypervisor to do more work than
1426	 * necessary. Therefore, perform repeated flushes of aligned ranges
1427	 * until the whole range is covered. Each iteration flushes the smaller
1428	 * of the natural alignment of the address being flushed and the
1429	 * greatest naturally aligned region that fits in the remaining range.
1430	 */
1431	while (size != 0) {
1432		int addr_alignment = __ffs(address);
1433		int size_alignment = __fls(size);
1434		int min_alignment;
1435		size_t flush_size;
1436
1437		/*
1438		 * size is always non-zero, but address might be zero, causing
1439		 * addr_alignment to be negative. As the casting of the
1440		 * argument in __ffs(address) to long might trim the high bits
1441		 * of the address on x86-32, cast to long when doing the check.
1442		 */
1443		if (likely((unsigned long)address != 0))
1444			min_alignment = min(addr_alignment, size_alignment);
1445		else
1446			min_alignment = size_alignment;
1447
1448		flush_size = 1ul << min_alignment;
1449
1450		__domain_flush_pages(domain, address, flush_size, pde);
1451		address += flush_size;
1452		size -= flush_size;
1453	}
1454}
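
/*
 * Worked example of the chunking above: flushing 0x3000 bytes at IOVA
 * 0x10000 on an NpCache IOMMU is split into two naturally aligned flushes.
 * First pass: __ffs(0x10000) = 16, __fls(0x3000) = 13, so 0x2000 bytes are
 * flushed at 0x10000.  Second pass: __ffs(0x12000) = 13, __fls(0x1000) = 12,
 * so the remaining 0x1000 bytes are flushed at 0x12000.
 */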
1455
1456/* Flush the whole IO/TLB for a given protection domain - including PDE */
1457void amd_iommu_domain_flush_tlb_pde(struct protection_domain *domain)
1458{
1459	domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
1460}
1461
1462void amd_iommu_domain_flush_complete(struct protection_domain *domain)
1463{
1464	int i;
1465
1466	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
1467		if (domain && !domain->dev_iommu[i])
1468			continue;
1469
1470		/*
1471		 * Devices of this domain are behind this IOMMU;
1472		 * we need to wait for completion of all commands.
1473		 */
1474		iommu_completion_wait(amd_iommus[i]);
1475	}
1476}
1477
1478/* Flush the not present cache if it exists */
1479static void domain_flush_np_cache(struct protection_domain *domain,
1480		dma_addr_t iova, size_t size)
1481{
1482	if (unlikely(amd_iommu_np_cache)) {
1483		unsigned long flags;
1484
1485		spin_lock_irqsave(&domain->lock, flags);
1486		domain_flush_pages(domain, iova, size, 1);
1487		amd_iommu_domain_flush_complete(domain);
1488		spin_unlock_irqrestore(&domain->lock, flags);
1489	}
1490}
1491
1492
1493/*
1494 * This function flushes the DTEs for all devices in the domain.
1495 */
1496static void domain_flush_devices(struct protection_domain *domain)
1497{
1498	struct iommu_dev_data *dev_data;
1499
1500	list_for_each_entry(dev_data, &domain->dev_list, list)
1501		device_flush_dte(dev_data);
1502}
1503
1504/****************************************************************************
1505 *
1506 * The next functions belong to the domain allocation. A domain is
1507 * allocated for every IOMMU as the default domain. If device isolation
1508 * is enabled, every device gets its own domain. The most important thing
1509 * about domains is the page table mapping the DMA address space they
1510 * contain.
1511 *
1512 ****************************************************************************/
1513
1514static u16 domain_id_alloc(void)
1515{
1516	int id;
1517
1518	spin_lock(&pd_bitmap_lock);
1519	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
1520	BUG_ON(id == 0);
1521	if (id > 0 && id < MAX_DOMAIN_ID)
1522		__set_bit(id, amd_iommu_pd_alloc_bitmap);
1523	else
1524		id = 0;
1525	spin_unlock(&pd_bitmap_lock);
1526
1527	return id;
1528}
1529
1530static void domain_id_free(int id)
1531{
1532	spin_lock(&pd_bitmap_lock);
1533	if (id > 0 && id < MAX_DOMAIN_ID)
1534		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
1535	spin_unlock(&pd_bitmap_lock);
1536}
1537
1538static void free_gcr3_tbl_level1(u64 *tbl)
1539{
1540	u64 *ptr;
1541	int i;
1542
1543	for (i = 0; i < 512; ++i) {
1544		if (!(tbl[i] & GCR3_VALID))
1545			continue;
1546
1547		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1548
1549		free_page((unsigned long)ptr);
1550	}
1551}
1552
1553static void free_gcr3_tbl_level2(u64 *tbl)
1554{
1555	u64 *ptr;
1556	int i;
1557
1558	for (i = 0; i < 512; ++i) {
1559		if (!(tbl[i] & GCR3_VALID))
1560			continue;
1561
1562		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1563
1564		free_gcr3_tbl_level1(ptr);
1565	}
1566}
1567
1568static void free_gcr3_table(struct protection_domain *domain)
1569{
1570	if (domain->glx == 2)
1571		free_gcr3_tbl_level2(domain->gcr3_tbl);
1572	else if (domain->glx == 1)
1573		free_gcr3_tbl_level1(domain->gcr3_tbl);
1574	else
1575		BUG_ON(domain->glx != 0);
1576
1577	free_page((unsigned long)domain->gcr3_tbl);
1578}
1579
1580static void set_dte_entry(struct amd_iommu *iommu, u16 devid,
1581			  struct protection_domain *domain, bool ats, bool ppr)
1582{
1583	u64 pte_root = 0;
1584	u64 flags = 0;
1585	u32 old_domid;
1586	struct dev_table_entry *dev_table = get_dev_table(iommu);
1587
1588	if (domain->iop.mode != PAGE_MODE_NONE)
1589		pte_root = iommu_virt_to_phys(domain->iop.root);
1590
1591	pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
1592		    << DEV_ENTRY_MODE_SHIFT;
1593
1594	pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
1595
1596	/*
1597	 * When SNP is enabled, only set the TV bit when IOMMU
1598	 * page translation is in use.
1599	 */
1600	if (!amd_iommu_snp_en || (domain->id != 0))
1601		pte_root |= DTE_FLAG_TV;
1602
1603	flags = dev_table[devid].data[1];
1604
1605	if (ats)
1606		flags |= DTE_FLAG_IOTLB;
1607
1608	if (ppr) {
1609		if (iommu_feature(iommu, FEATURE_EPHSUP))
1610			pte_root |= 1ULL << DEV_ENTRY_PPR;
1611	}
1612
1613	if (domain->flags & PD_IOMMUV2_MASK) {
1614		u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
1615		u64 glx  = domain->glx;
1616		u64 tmp;
1617
1618		pte_root |= DTE_FLAG_GV;
1619		pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
1620
1621		/* First mask out possible old values for GCR3 table */
1622		tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
1623		flags    &= ~tmp;
1624
1625		tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
1626		flags    &= ~tmp;
1627
1628		/* Encode GCR3 table into DTE */
1629		tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
1630		pte_root |= tmp;
1631
1632		tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
1633		flags    |= tmp;
1634
1635		tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
1636		flags    |= tmp;
1637
1638		if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
1639			dev_table[devid].data[2] |=
1640				((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
1641		}
1642
1643		if (domain->flags & PD_GIOV_MASK)
1644			pte_root |= DTE_FLAG_GIOV;
1645	}
1646
1647	flags &= ~DEV_DOMID_MASK;
1648	flags |= domain->id;
1649
1650	old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
1651	dev_table[devid].data[1]  = flags;
1652	dev_table[devid].data[0]  = pte_root;
1653
1654	/*
1655	 * A kdump kernel might be replacing a domain ID that was copied from
1656	 * the previous kernel--if so, it needs to flush the translation cache
1657	 * entries for the old domain ID that is being overwritten
1658	 */
1659	if (old_domid) {
1660		amd_iommu_flush_tlb_domid(iommu, old_domid);
1661	}
1662}
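
/*
 * Summary of the DTE layout written above: data[0] holds the V/TV/IR/IW
 * bits, the paging mode and host page table root and, for IOMMUv2 domains,
 * the GV/GLX bits and the low GCR3 bits; data[1] holds the IOTLB-enable bit
 * for ATS-capable devices, the remaining GCR3 bits and the 16-bit domain ID;
 * data[2] only gains the guest page table level when 5-level guest paging
 * is enabled.
 */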
1663
1664static void clear_dte_entry(struct amd_iommu *iommu, u16 devid)
1665{
1666	struct dev_table_entry *dev_table = get_dev_table(iommu);
1667
1668	/* remove entry from the device table seen by the hardware */
1669	dev_table[devid].data[0]  = DTE_FLAG_V;
1670
1671	if (!amd_iommu_snp_en)
1672		dev_table[devid].data[0] |= DTE_FLAG_TV;
1673
1674	dev_table[devid].data[1] &= DTE_FLAG_MASK;
1675
1676	amd_iommu_apply_erratum_63(iommu, devid);
1677}
1678
1679static void do_attach(struct iommu_dev_data *dev_data,
1680		      struct protection_domain *domain)
1681{
1682	struct amd_iommu *iommu;
1683	bool ats;
1684
1685	iommu = rlookup_amd_iommu(dev_data->dev);
1686	if (!iommu)
1687		return;
1688	ats   = dev_data->ats.enabled;
1689
1690	/* Update data structures */
1691	dev_data->domain = domain;
1692	list_add(&dev_data->list, &domain->dev_list);
1693
1694	/* Update NUMA Node ID */
1695	if (domain->nid == NUMA_NO_NODE)
1696		domain->nid = dev_to_node(dev_data->dev);
1697
1698	/* Do reference counting */
1699	domain->dev_iommu[iommu->index] += 1;
1700	domain->dev_cnt                 += 1;
1701
1702	/* Update device table */
1703	set_dte_entry(iommu, dev_data->devid, domain,
1704		      ats, dev_data->iommu_v2);
1705	clone_aliases(iommu, dev_data->dev);
1706
1707	device_flush_dte(dev_data);
1708}
1709
1710static void do_detach(struct iommu_dev_data *dev_data)
1711{
1712	struct protection_domain *domain = dev_data->domain;
1713	struct amd_iommu *iommu;
1714
1715	iommu = rlookup_amd_iommu(dev_data->dev);
1716	if (!iommu)
1717		return;
1718
1719	/* Update data structures */
1720	dev_data->domain = NULL;
1721	list_del(&dev_data->list);
1722	clear_dte_entry(iommu, dev_data->devid);
1723	clone_aliases(iommu, dev_data->dev);
1724
1725	/* Flush the DTE entry */
1726	device_flush_dte(dev_data);
1727
1728	/* Flush IOTLB */
1729	amd_iommu_domain_flush_tlb_pde(domain);
1730
1731	/* Wait for the flushes to finish */
1732	amd_iommu_domain_flush_complete(domain);
1733
1734	/* decrease reference counters - needs to happen after the flushes */
1735	domain->dev_iommu[iommu->index] -= 1;
1736	domain->dev_cnt                 -= 1;
1737}
1738
1739static void pdev_iommuv2_disable(struct pci_dev *pdev)
1740{
1741	pci_disable_ats(pdev);
1742	pci_disable_pri(pdev);
1743	pci_disable_pasid(pdev);
1744}
1745
1746static int pdev_pri_ats_enable(struct pci_dev *pdev)
1747{
1748	int ret;
1749
1750	/* Only allow access to user-accessible pages */
1751	ret = pci_enable_pasid(pdev, 0);
1752	if (ret)
1753		return ret;
1754
1755	/* First reset the PRI state of the device */
1756	ret = pci_reset_pri(pdev);
1757	if (ret)
1758		goto out_err_pasid;
1759
1760	/* Enable PRI */
1761	/* FIXME: Hardcode number of outstanding requests for now */
1762	ret = pci_enable_pri(pdev, 32);
1763	if (ret)
1764		goto out_err_pasid;
1765
1766	ret = pci_enable_ats(pdev, PAGE_SHIFT);
1767	if (ret)
1768		goto out_err_pri;
1769
1770	return 0;
1771
1772out_err_pri:
1773	pci_disable_pri(pdev);
1774
1775out_err_pasid:
1776	pci_disable_pasid(pdev);
1777
1778	return ret;
1779}
1780
1781/*
1782 * If a device is not yet associated with a domain, this function makes the
1783 * device visible in the domain
1784 */
1785static int attach_device(struct device *dev,
1786			 struct protection_domain *domain)
1787{
1788	struct iommu_dev_data *dev_data;
1789	struct pci_dev *pdev;
1790	unsigned long flags;
1791	int ret;
1792
1793	spin_lock_irqsave(&domain->lock, flags);
1794
1795	dev_data = dev_iommu_priv_get(dev);
1796
1797	spin_lock(&dev_data->lock);
1798
1799	ret = -EBUSY;
1800	if (dev_data->domain != NULL)
1801		goto out;
1802
1803	if (!dev_is_pci(dev))
1804		goto skip_ats_check;
1805
1806	pdev = to_pci_dev(dev);
1807	if (domain->flags & PD_IOMMUV2_MASK) {
1808		struct iommu_domain *def_domain = iommu_get_dma_domain(dev);
1809
1810		ret = -EINVAL;
1811
1812		/*
1813		 * When using AMD_IOMMU_V1 page table mode and the device
1814		 * is being enabled for PPR/ATS support (using the v2 table),
1815		 * we need to make sure that the domain type is identity map.
1816		 */
1817		if ((amd_iommu_pgtable == AMD_IOMMU_V1) &&
1818		    def_domain->type != IOMMU_DOMAIN_IDENTITY) {
1819			goto out;
1820		}
1821
1822		if (dev_data->iommu_v2) {
1823			if (pdev_pri_ats_enable(pdev) != 0)
1824				goto out;
1825
1826			dev_data->ats.enabled = true;
1827			dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
1828			dev_data->pri_tlp     = pci_prg_resp_pasid_required(pdev);
1829		}
1830	} else if (amd_iommu_iotlb_sup &&
1831		   pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
1832		dev_data->ats.enabled = true;
1833		dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
1834	}
1835
1836skip_ats_check:
1837	ret = 0;
1838
1839	do_attach(dev_data, domain);
1840
1841	/*
1842	 * We might boot into a crash-kernel here. The crashed kernel
1843	 * left the IOMMU caches dirty, so we have to flush
1844	 * here to evict all stale entries.
1845	 */
1846	amd_iommu_domain_flush_tlb_pde(domain);
1847
1848	amd_iommu_domain_flush_complete(domain);
1849
1850out:
1851	spin_unlock(&dev_data->lock);
1852
1853	spin_unlock_irqrestore(&domain->lock, flags);
1854
1855	return ret;
1856}
1857
1858/*
1859 * Removes a device from a protection domain (with devtable_lock held)
1860 */
1861static void detach_device(struct device *dev)
1862{
1863	struct protection_domain *domain;
1864	struct iommu_dev_data *dev_data;
1865	unsigned long flags;
1866
1867	dev_data = dev_iommu_priv_get(dev);
1868	domain   = dev_data->domain;
1869
1870	spin_lock_irqsave(&domain->lock, flags);
1871
1872	spin_lock(&dev_data->lock);
1873
1874	/*
1875	 * First check if the device is still attached. It might already
1876	 * be detached from its domain because the generic
1877	 * iommu_detach_group code detached it and we try again here in
1878	 * our alias handling.
1879	 */
1880	if (WARN_ON(!dev_data->domain))
1881		goto out;
1882
1883	do_detach(dev_data);
1884
1885	if (!dev_is_pci(dev))
1886		goto out;
1887
1888	if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2)
1889		pdev_iommuv2_disable(to_pci_dev(dev));
1890	else if (dev_data->ats.enabled)
1891		pci_disable_ats(to_pci_dev(dev));
1892
1893	dev_data->ats.enabled = false;
1894
1895out:
1896	spin_unlock(&dev_data->lock);
1897
1898	spin_unlock_irqrestore(&domain->lock, flags);
1899}
1900
1901static struct iommu_device *amd_iommu_probe_device(struct device *dev)
1902{
1903	struct iommu_device *iommu_dev;
1904	struct amd_iommu *iommu;
1905	int ret;
1906
1907	if (!check_device(dev))
1908		return ERR_PTR(-ENODEV);
1909
1910	iommu = rlookup_amd_iommu(dev);
1911	if (!iommu)
1912		return ERR_PTR(-ENODEV);
1913
1914	/* Not registered yet? */
1915	if (!iommu->iommu.ops)
1916		return ERR_PTR(-ENODEV);
1917
1918	if (dev_iommu_priv_get(dev))
1919		return &iommu->iommu;
1920
1921	ret = iommu_init_device(iommu, dev);
1922	if (ret) {
1923		if (ret != -ENOTSUPP)
1924			dev_err(dev, "Failed to initialize - trying to proceed anyway\n");
1925		iommu_dev = ERR_PTR(ret);
1926		iommu_ignore_device(iommu, dev);
1927	} else {
1928		amd_iommu_set_pci_msi_domain(dev, iommu);
1929		iommu_dev = &iommu->iommu;
1930	}
1931
1932	iommu_completion_wait(iommu);
1933
1934	return iommu_dev;
1935}
1936
1937static void amd_iommu_probe_finalize(struct device *dev)
1938{
1939	/* Domains are initialized for this device - have a look at what we ended up with */
1940	set_dma_ops(dev, NULL);
1941	iommu_setup_dma_ops(dev, 0, U64_MAX);
1942}
1943
1944static void amd_iommu_release_device(struct device *dev)
1945{
1946	struct amd_iommu *iommu;
1947
1948	if (!check_device(dev))
1949		return;
1950
1951	iommu = rlookup_amd_iommu(dev);
1952	if (!iommu)
1953		return;
1954
1955	amd_iommu_uninit_device(dev);
1956	iommu_completion_wait(iommu);
1957}
1958
1959static struct iommu_group *amd_iommu_device_group(struct device *dev)
1960{
1961	if (dev_is_pci(dev))
1962		return pci_device_group(dev);
1963
1964	return acpihid_device_group(dev);
1965}
1966
1967/*****************************************************************************
1968 *
1969 * The next functions belong to the dma_ops mapping/unmapping code.
1970 *
1971 *****************************************************************************/
1972
1973static void update_device_table(struct protection_domain *domain)
1974{
1975	struct iommu_dev_data *dev_data;
1976
1977	list_for_each_entry(dev_data, &domain->dev_list, list) {
1978		struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
1979
1980		if (!iommu)
1981			continue;
1982		set_dte_entry(iommu, dev_data->devid, domain,
1983			      dev_data->ats.enabled, dev_data->iommu_v2);
1984		clone_aliases(iommu, dev_data->dev);
1985	}
1986}
1987
1988void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
1989{
1990	update_device_table(domain);
1991	domain_flush_devices(domain);
1992}
1993
1994void amd_iommu_domain_update(struct protection_domain *domain)
1995{
1996	/* Update device table */
1997	amd_iommu_update_and_flush_device_table(domain);
1998
1999	/* Flush domain TLB(s) and wait for completion */
2000	amd_iommu_domain_flush_tlb_pde(domain);
2001	amd_iommu_domain_flush_complete(domain);
2002}
2003
2004/*****************************************************************************
2005 *
2006 * The following functions belong to the exported interface of AMD IOMMU
2007 *
2008 * This interface allows access to lower level functions of the IOMMU
2009 * like protection domain handling and assignment of devices to domains
2010 * which is not possible with the dma_ops interface.
2011 *
2012 *****************************************************************************/
2013
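/*
 * Detach every device still attached to @domain; used when a domain is
 * freed while devices are still bound to it.
 */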
2014static void cleanup_domain(struct protection_domain *domain)
2015{
2016	struct iommu_dev_data *entry;
2017	unsigned long flags;
2018
2019	spin_lock_irqsave(&domain->lock, flags);
2020
2021	while (!list_empty(&domain->dev_list)) {
2022		entry = list_first_entry(&domain->dev_list,
2023					 struct iommu_dev_data, list);
2024		BUG_ON(!entry->domain);
2025		do_detach(entry);
2026	}
2027
2028	spin_unlock_irqrestore(&domain->lock, flags);
2029}
2030
2031static void protection_domain_free(struct protection_domain *domain)
2032{
2033	if (!domain)
2034		return;
2035
2036	if (domain->iop.pgtbl_cfg.tlb)
2037		free_io_pgtable_ops(&domain->iop.iop.ops);
2038
2039	if (domain->id)
2040		domain_id_free(domain->id);
2041
2042	kfree(domain);
2043}
2044
2045static int protection_domain_init_v1(struct protection_domain *domain, int mode)
2046{
2047	u64 *pt_root = NULL;
2048
2049	BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL);
2050
2051	spin_lock_init(&domain->lock);
2052	domain->id = domain_id_alloc();
2053	if (!domain->id)
2054		return -ENOMEM;
2055	INIT_LIST_HEAD(&domain->dev_list);
2056
2057	if (mode != PAGE_MODE_NONE) {
2058		pt_root = (void *)get_zeroed_page(GFP_KERNEL);
2059		if (!pt_root) {
2060			domain_id_free(domain->id);
2061			return -ENOMEM;
2062		}
2063	}
2064
2065	amd_iommu_domain_set_pgtable(domain, pt_root, mode);
2066
2067	return 0;
2068}
2069
2070static int protection_domain_init_v2(struct protection_domain *domain)
2071{
2072	spin_lock_init(&domain->lock);
2073	domain->id = domain_id_alloc();
2074	if (!domain->id)
2075		return -ENOMEM;
2076	INIT_LIST_HEAD(&domain->dev_list);
2077
2078	domain->flags |= PD_GIOV_MASK;
2079
2080	domain->domain.pgsize_bitmap = AMD_IOMMU_PGSIZES_V2;
2081
2082	if (domain_enable_v2(domain, 1)) {
2083		domain_id_free(domain->id);
2084		return -ENOMEM;
2085	}
2086
2087	return 0;
2088}
2089
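/*
 * Allocate a protection domain and its page table. Identity domains use a
 * v1 table with PAGE_MODE_NONE, unmanaged domains always use v1, and DMA
 * domains follow the global amd_iommu_pgtable selection (v1 or v2).
 */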
2090static struct protection_domain *protection_domain_alloc(unsigned int type)
2091{
2092	struct io_pgtable_ops *pgtbl_ops;
2093	struct protection_domain *domain;
2094	int pgtable;
2095	int mode = DEFAULT_PGTABLE_LEVEL;
2096	int ret;
2097
2098	/*
2099	 * Force the IOMMU v1 page table when iommu=pt and
2100	 * when allocating a domain for pass-through devices.
2101	 */
2102	if (type == IOMMU_DOMAIN_IDENTITY) {
2103		pgtable = AMD_IOMMU_V1;
2104		mode = PAGE_MODE_NONE;
2105	} else if (type == IOMMU_DOMAIN_UNMANAGED) {
2106		pgtable = AMD_IOMMU_V1;
2107	} else if (type == IOMMU_DOMAIN_DMA || type == IOMMU_DOMAIN_DMA_FQ) {
2108		pgtable = amd_iommu_pgtable;
2109	} else {
2110		return NULL;
2111	}
2112
2113	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2114	if (!domain)
2115		return NULL;
2116
2117	switch (pgtable) {
2118	case AMD_IOMMU_V1:
2119		ret = protection_domain_init_v1(domain, mode);
2120		break;
2121	case AMD_IOMMU_V2:
2122		ret = protection_domain_init_v2(domain);
2123		break;
2124	default:
2125		ret = -EINVAL;
2126	}
2127
2128	if (ret)
2129		goto out_err;
2130
2131	/* No need to allocate io pgtable ops in passthrough mode */
2132	if (type == IOMMU_DOMAIN_IDENTITY)
2133		return domain;
2134
2135	domain->nid = NUMA_NO_NODE;
2136
2137	pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl_cfg, domain);
2138	if (!pgtbl_ops) {
2139		domain_id_free(domain->id);
2140		goto out_err;
2141	}
2142
2143	return domain;
2144out_err:
2145	kfree(domain);
2146	return NULL;
2147}
2148
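/* Highest IOVA the selected page table format can translate. */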
2149static inline u64 dma_max_address(void)
2150{
2151	if (amd_iommu_pgtable == AMD_IOMMU_V1)
2152		return ~0ULL;
2153
2154	/* V2 with 4/5 level page table */
2155	return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
2156}
2157
2158static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
2159{
2160	struct protection_domain *domain;
2161
2162	/*
2163	 * Since DTE[Mode]=0 is prohibited on SNP-enabled systems,
2164	 * default to IOMMU_DOMAIN_DMA[_FQ].
2165	 */
2166	if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
2167		return NULL;
2168
2169	domain = protection_domain_alloc(type);
2170	if (!domain)
2171		return NULL;
2172
2173	domain->domain.geometry.aperture_start = 0;
2174	domain->domain.geometry.aperture_end   = dma_max_address();
2175	domain->domain.geometry.force_aperture = true;
2176
2177	return &domain->domain;
2178}
2179
2180static void amd_iommu_domain_free(struct iommu_domain *dom)
2181{
2182	struct protection_domain *domain;
2183
2184	domain = to_pdomain(dom);
2185
2186	if (domain->dev_cnt > 0)
2187		cleanup_domain(domain);
2188
2189	BUG_ON(domain->dev_cnt != 0);
2190
2191	if (!dom)
2192		return;
2193
2194	if (domain->flags & PD_IOMMUV2_MASK)
2195		free_gcr3_table(domain);
2196
2197	protection_domain_free(domain);
2198}
2199
2200static int amd_iommu_attach_device(struct iommu_domain *dom,
2201				   struct device *dev)
2202{
2203	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2204	struct protection_domain *domain = to_pdomain(dom);
2205	struct amd_iommu *iommu = rlookup_amd_iommu(dev);
2206	int ret;
2207
2208	/*
2209	 * Skip attaching the device to the domain if the new domain is
2210	 * the same as the device's current domain.
2211	 */
2212	if (dev_data->domain == domain)
2213		return 0;
2214
2215	dev_data->defer_attach = false;
2216
2217	if (dev_data->domain)
2218		detach_device(dev);
2219
2220	ret = attach_device(dev, domain);
2221
2222#ifdef CONFIG_IRQ_REMAP
2223	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
2224		if (dom->type == IOMMU_DOMAIN_UNMANAGED)
2225			dev_data->use_vapic = 1;
2226		else
2227			dev_data->use_vapic = 0;
2228	}
2229#endif
2230
2231	iommu_completion_wait(iommu);
2232
2233	return ret;
2234}
2235
2236static void amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
2237				     unsigned long iova, size_t size)
2238{
2239	struct protection_domain *domain = to_pdomain(dom);
2240	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2241
2242	if (ops->map_pages)
2243		domain_flush_np_cache(domain, iova, size);
2244}
2245
2246static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
2247			       phys_addr_t paddr, size_t pgsize, size_t pgcount,
2248			       int iommu_prot, gfp_t gfp, size_t *mapped)
2249{
2250	struct protection_domain *domain = to_pdomain(dom);
2251	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2252	int prot = 0;
2253	int ret = -EINVAL;
2254
2255	if ((amd_iommu_pgtable == AMD_IOMMU_V1) &&
2256	    (domain->iop.mode == PAGE_MODE_NONE))
2257		return -EINVAL;
2258
2259	if (iommu_prot & IOMMU_READ)
2260		prot |= IOMMU_PROT_IR;
2261	if (iommu_prot & IOMMU_WRITE)
2262		prot |= IOMMU_PROT_IW;
2263
2264	if (ops->map_pages) {
2265		ret = ops->map_pages(ops, iova, paddr, pgsize,
2266				     pgcount, prot, gfp, mapped);
2267	}
2268
2269	return ret;
2270}
2271
2272static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
2273					    struct iommu_iotlb_gather *gather,
2274					    unsigned long iova, size_t size)
2275{
2276	/*
2277	 * AMD's IOMMU can flush as many pages as necessary in a single flush.
2278	 * Unless we run in a virtual machine, which can be inferred from
2279	 * whether the "non-present cache" is on, it is probably best to prefer
2280	 * (potentially) too extensive TLB flushing (i.e., more misses) over
2281	 * multiple TLB flushes (i.e., more flushes). For virtual machines the
2282	 * hypervisor needs to synchronize the host IOMMU PTEs with those of
2283	 * the guest, and the trade-off is different: unnecessary TLB flushes
2284	 * should be avoided.
2285	 */
2286	if (amd_iommu_np_cache &&
2287	    iommu_iotlb_gather_is_disjoint(gather, iova, size))
2288		iommu_iotlb_sync(domain, gather);
2289
2290	iommu_iotlb_gather_add_range(gather, iova, size);
2291}
2292
2293static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova,
2294				    size_t pgsize, size_t pgcount,
2295				    struct iommu_iotlb_gather *gather)
2296{
2297	struct protection_domain *domain = to_pdomain(dom);
2298	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2299	size_t r;
2300
2301	if ((amd_iommu_pgtable == AMD_IOMMU_V1) &&
2302	    (domain->iop.mode == PAGE_MODE_NONE))
2303		return 0;
2304
2305	r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0;
2306
2307	if (r)
2308		amd_iommu_iotlb_gather_add_page(dom, gather, iova, r);
2309
2310	return r;
2311}
2312
2313static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2314					  dma_addr_t iova)
2315{
2316	struct protection_domain *domain = to_pdomain(dom);
2317	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2318
2319	return ops->iova_to_phys(ops, iova);
2320}
2321
2322static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
2323{
2324	switch (cap) {
2325	case IOMMU_CAP_CACHE_COHERENCY:
2326		return true;
2327	case IOMMU_CAP_NOEXEC:
2328		return false;
2329	case IOMMU_CAP_PRE_BOOT_PROTECTION:
2330		return amdr_ivrs_remap_support;
2331	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
2332		return true;
2333	case IOMMU_CAP_DEFERRED_FLUSH:
2334		return true;
2335	default:
2336		break;
2337	}
2338
2339	return false;
2340}
2341
2342static void amd_iommu_get_resv_regions(struct device *dev,
2343				       struct list_head *head)
2344{
2345	struct iommu_resv_region *region;
2346	struct unity_map_entry *entry;
2347	struct amd_iommu *iommu;
2348	struct amd_iommu_pci_seg *pci_seg;
2349	int devid, sbdf;
2350
2351	sbdf = get_device_sbdf_id(dev);
2352	if (sbdf < 0)
2353		return;
2354
2355	devid = PCI_SBDF_TO_DEVID(sbdf);
2356	iommu = rlookup_amd_iommu(dev);
2357	if (!iommu)
2358		return;
2359	pci_seg = iommu->pci_seg;
2360
2361	list_for_each_entry(entry, &pci_seg->unity_map, list) {
2362		int type, prot = 0;
2363		size_t length;
2364
2365		if (devid < entry->devid_start || devid > entry->devid_end)
2366			continue;
2367
2368		type   = IOMMU_RESV_DIRECT;
2369		length = entry->address_end - entry->address_start;
2370		if (entry->prot & IOMMU_PROT_IR)
2371			prot |= IOMMU_READ;
2372		if (entry->prot & IOMMU_PROT_IW)
2373			prot |= IOMMU_WRITE;
2374		if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE)
2375			/* Exclusion range */
2376			type = IOMMU_RESV_RESERVED;
2377
2378		region = iommu_alloc_resv_region(entry->address_start,
2379						 length, prot, type,
2380						 GFP_KERNEL);
2381		if (!region) {
2382			dev_err(dev, "Out of memory allocating dm-regions\n");
2383			return;
2384		}
2385		list_add_tail(&region->list, head);
2386	}
2387
2388	region = iommu_alloc_resv_region(MSI_RANGE_START,
2389					 MSI_RANGE_END - MSI_RANGE_START + 1,
2390					 0, IOMMU_RESV_MSI, GFP_KERNEL);
2391	if (!region)
2392		return;
2393	list_add_tail(&region->list, head);
2394
2395	region = iommu_alloc_resv_region(HT_RANGE_START,
2396					 HT_RANGE_END - HT_RANGE_START + 1,
2397					 0, IOMMU_RESV_RESERVED, GFP_KERNEL);
2398	if (!region)
2399		return;
2400	list_add_tail(&region->list, head);
2401}
2402
2403bool amd_iommu_is_attach_deferred(struct device *dev)
2404{
2405	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2406
2407	return dev_data->defer_attach;
2408}
2409EXPORT_SYMBOL_GPL(amd_iommu_is_attach_deferred);
2410
2411static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
2412{
2413	struct protection_domain *dom = to_pdomain(domain);
2414	unsigned long flags;
2415
2416	spin_lock_irqsave(&dom->lock, flags);
2417	amd_iommu_domain_flush_tlb_pde(dom);
2418	amd_iommu_domain_flush_complete(dom);
2419	spin_unlock_irqrestore(&dom->lock, flags);
2420}
2421
2422static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
2423				 struct iommu_iotlb_gather *gather)
2424{
2425	struct protection_domain *dom = to_pdomain(domain);
2426	unsigned long flags;
2427
2428	spin_lock_irqsave(&dom->lock, flags);
2429	domain_flush_pages(dom, gather->start, gather->end - gather->start + 1, 1);
2430	amd_iommu_domain_flush_complete(dom);
2431	spin_unlock_irqrestore(&dom->lock, flags);
2432}
2433
2434static int amd_iommu_def_domain_type(struct device *dev)
2435{
2436	struct iommu_dev_data *dev_data;
2437
2438	dev_data = dev_iommu_priv_get(dev);
2439	if (!dev_data)
2440		return 0;
2441
2442	/*
2443	 * Do not identity map IOMMUv2 capable devices when:
2444	 *  - memory encryption is active, because some of those devices
2445	 *    (AMD GPUs) don't have the encryption bit in their DMA-mask
2446	 *    and require remapping.
2447	 *  - SNP is enabled, because it prohibits DTE[Mode]=0.
2448	 */
2449	if (dev_data->iommu_v2 &&
2450	    !cc_platform_has(CC_ATTR_MEM_ENCRYPT) &&
2451	    !amd_iommu_snp_en) {
2452		return IOMMU_DOMAIN_IDENTITY;
2453	}
2454
2455	return 0;
2456}
2457
2458static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
2459{
2460	/* IOMMU_PTE_FC is always set */
2461	return true;
2462}
2463
2464const struct iommu_ops amd_iommu_ops = {
2465	.capable = amd_iommu_capable,
2466	.domain_alloc = amd_iommu_domain_alloc,
2467	.probe_device = amd_iommu_probe_device,
2468	.release_device = amd_iommu_release_device,
2469	.probe_finalize = amd_iommu_probe_finalize,
2470	.device_group = amd_iommu_device_group,
2471	.get_resv_regions = amd_iommu_get_resv_regions,
2472	.is_attach_deferred = amd_iommu_is_attach_deferred,
2473	.pgsize_bitmap	= AMD_IOMMU_PGSIZES,
2474	.def_domain_type = amd_iommu_def_domain_type,
2475	.default_domain_ops = &(const struct iommu_domain_ops) {
2476		.attach_dev	= amd_iommu_attach_device,
2477		.map_pages	= amd_iommu_map_pages,
2478		.unmap_pages	= amd_iommu_unmap_pages,
2479		.iotlb_sync_map	= amd_iommu_iotlb_sync_map,
2480		.iova_to_phys	= amd_iommu_iova_to_phys,
2481		.flush_iotlb_all = amd_iommu_flush_iotlb_all,
2482		.iotlb_sync	= amd_iommu_iotlb_sync,
2483		.free		= amd_iommu_domain_free,
2484		.enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
2485	}
2486};
2487
2488/*****************************************************************************
2489 *
2490 * The next functions do a basic initialization of the IOMMU for passthrough
2491 * mode.
2492 *
2493 * In passthrough mode the IOMMU is initialized and enabled but not used for
2494 * DMA-API translation.
2495 *
2496 *****************************************************************************/
2497
2498/* IOMMUv2 specific functions */
2499int amd_iommu_register_ppr_notifier(struct notifier_block *nb)
2500{
2501	return atomic_notifier_chain_register(&ppr_notifier, nb);
2502}
2503EXPORT_SYMBOL(amd_iommu_register_ppr_notifier);
2504
2505int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb)
2506{
2507	return atomic_notifier_chain_unregister(&ppr_notifier, nb);
2508}
2509EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier);
2510
2511void amd_iommu_domain_direct_map(struct iommu_domain *dom)
2512{
2513	struct protection_domain *domain = to_pdomain(dom);
2514	unsigned long flags;
2515
2516	spin_lock_irqsave(&domain->lock, flags);
2517
2518	if (domain->iop.pgtbl_cfg.tlb)
2519		free_io_pgtable_ops(&domain->iop.iop.ops);
2520
2521	spin_unlock_irqrestore(&domain->lock, flags);
2522}
2523EXPORT_SYMBOL(amd_iommu_domain_direct_map);
2524
2525	/* Note: This function expects protection_domain->lock to be held prior to calling it. */
2526static int domain_enable_v2(struct protection_domain *domain, int pasids)
2527{
2528	int levels;
2529
2530	/* Number of GCR3 table levels required */
2531	for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9)
2532		levels += 1;
2533
2534	if (levels > amd_iommu_max_glx_val)
2535		return -EINVAL;
2536
2537	domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC);
2538	if (domain->gcr3_tbl == NULL)
2539		return -ENOMEM;
2540
2541	domain->glx      = levels;
2542	domain->flags   |= PD_IOMMUV2_MASK;
2543
2544	amd_iommu_domain_update(domain);
2545
2546	return 0;
2547}
2548
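/*
 * Switch an (empty) protection domain into IOMMUv2 mode so that per-PASID
 * GCR3 tables can be installed with amd_iommu_domain_set_gcr3().
 */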
2549int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
2550{
2551	struct protection_domain *pdom = to_pdomain(dom);
2552	unsigned long flags;
2553	int ret;
2554
2555	spin_lock_irqsave(&pdom->lock, flags);
2556
2557	/*
2558	 * Save us all the sanity checks of whether the devices already in
2559	 * the domain support IOMMUv2. Just require that the domain has no
2560	 * devices attached when it is switched into IOMMUv2 mode.
2561	 */
2562	ret = -EBUSY;
2563	if (pdom->dev_cnt > 0 || pdom->flags & PD_IOMMUV2_MASK)
2564		goto out;
2565
2566	if (!pdom->gcr3_tbl)
2567		ret = domain_enable_v2(pdom, pasids);
2568
2569out:
2570	spin_unlock_irqrestore(&pdom->lock, flags);
2571	return ret;
2572}
2573EXPORT_SYMBOL(amd_iommu_domain_enable_v2);
2574
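/*
 * Flush the IOMMU TLB(s) and the device TLBs for a single PASID of
 * @domain. Callers hold domain->lock.
 */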
2575static int __flush_pasid(struct protection_domain *domain, u32 pasid,
2576			 u64 address, bool size)
2577{
2578	struct iommu_dev_data *dev_data;
2579	struct iommu_cmd cmd;
2580	int i, ret;
2581
2582	if (!(domain->flags & PD_IOMMUV2_MASK))
2583		return -EINVAL;
2584
2585	build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size);
2586
2587	/*
2588	 * The IOMMU TLB needs to be flushed before the device TLB to
2589	 * prevent device TLB refills from the IOMMU TLB.
2590	 */
2591	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
2592		if (domain->dev_iommu[i] == 0)
2593			continue;
2594
2595		ret = iommu_queue_command(amd_iommus[i], &cmd);
2596		if (ret != 0)
2597			goto out;
2598	}
2599
2600	/* Wait until IOMMU TLB flushes are complete */
2601	amd_iommu_domain_flush_complete(domain);
2602
2603	/* Now flush device TLBs */
2604	list_for_each_entry(dev_data, &domain->dev_list, list) {
2605		struct amd_iommu *iommu;
2606		int qdep;
2607
2608		/*
2609		 * There might be non-IOMMUv2 capable devices in an IOMMUv2
2610		 * domain.
2611		 */
2612		if (!dev_data->ats.enabled)
2613			continue;
2614
2615		qdep  = dev_data->ats.qdep;
2616		iommu = rlookup_amd_iommu(dev_data->dev);
2617		if (!iommu)
2618			continue;
2619		build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid,
2620				      qdep, address, size);
2621
2622		ret = iommu_queue_command(iommu, &cmd);
2623		if (ret != 0)
2624			goto out;
2625	}
2626
2627	/* Wait until all device TLBs are flushed */
2628	amd_iommu_domain_flush_complete(domain);
2629
2630	ret = 0;
2631
2632out:
2633
2634	return ret;
2635}
2636
2637static int __amd_iommu_flush_page(struct protection_domain *domain, u32 pasid,
2638				  u64 address)
2639{
2640	return __flush_pasid(domain, pasid, address, false);
2641}
2642
2643int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid,
2644			 u64 address)
2645{
2646	struct protection_domain *domain = to_pdomain(dom);
2647	unsigned long flags;
2648	int ret;
2649
2650	spin_lock_irqsave(&domain->lock, flags);
2651	ret = __amd_iommu_flush_page(domain, pasid, address);
2652	spin_unlock_irqrestore(&domain->lock, flags);
2653
2654	return ret;
2655}
2656EXPORT_SYMBOL(amd_iommu_flush_page);
2657
2658static int __amd_iommu_flush_tlb(struct protection_domain *domain, u32 pasid)
2659{
2660	return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
2661			     true);
2662}
2663
2664int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid)
2665{
2666	struct protection_domain *domain = to_pdomain(dom);
2667	unsigned long flags;
2668	int ret;
2669
2670	spin_lock_irqsave(&domain->lock, flags);
2671	ret = __amd_iommu_flush_tlb(domain, pasid);
2672	spin_unlock_irqrestore(&domain->lock, flags);
2673
2674	return ret;
2675}
2676EXPORT_SYMBOL(amd_iommu_flush_tlb);
2677
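/*
 * Walk the GCR3 table for @pasid, starting at @root with @level levels
 * above the leaf. If @alloc is true, missing intermediate tables are
 * allocated on the way down; otherwise NULL is returned on a hole.
 */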
2678static u64 *__get_gcr3_pte(u64 *root, int level, u32 pasid, bool alloc)
2679{
2680	int index;
2681	u64 *pte;
2682
2683	while (true) {
2684
2685		index = (pasid >> (9 * level)) & 0x1ff;
2686		pte   = &root[index];
2687
2688		if (level == 0)
2689			break;
2690
2691		if (!(*pte & GCR3_VALID)) {
2692			if (!alloc)
2693				return NULL;
2694
2695			root = (void *)get_zeroed_page(GFP_ATOMIC);
2696			if (root == NULL)
2697				return NULL;
2698
2699			*pte = iommu_virt_to_phys(root) | GCR3_VALID;
2700		}
2701
2702		root = iommu_phys_to_virt(*pte & PAGE_MASK);
2703
2704		level -= 1;
2705	}
2706
2707	return pte;
2708}
2709
2710static int __set_gcr3(struct protection_domain *domain, u32 pasid,
2711		      unsigned long cr3)
2712{
2713	u64 *pte;
2714
2715	if (domain->iop.mode != PAGE_MODE_NONE)
2716		return -EINVAL;
2717
2718	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true);
2719	if (pte == NULL)
2720		return -ENOMEM;
2721
2722	*pte = (cr3 & PAGE_MASK) | GCR3_VALID;
2723
2724	return __amd_iommu_flush_tlb(domain, pasid);
2725}
2726
2727static int __clear_gcr3(struct protection_domain *domain, u32 pasid)
2728{
2729	u64 *pte;
2730
2731	if (domain->iop.mode != PAGE_MODE_NONE)
2732		return -EINVAL;
2733
2734	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);
2735	if (pte == NULL)
2736		return 0;
2737
2738	*pte = 0;
2739
2740	return __amd_iommu_flush_tlb(domain, pasid);
2741}
2742
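/*
 * Illustrative caller sequence (hypothetical, not taken from this file):
 * a PASID-capable driver would first switch the (empty) domain into
 * IOMMUv2 mode and then install a GCR3 value per PASID, e.g.:
 *
 *	ret = amd_iommu_domain_enable_v2(dom, num_pasids);
 *	if (!ret)
 *		ret = amd_iommu_domain_set_gcr3(dom, pasid, gcr3_pa);
 *
 * where num_pasids, pasid and gcr3_pa are placeholders supplied by the
 * caller.
 */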
2743int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid,
2744			      unsigned long cr3)
2745{
2746	struct protection_domain *domain = to_pdomain(dom);
2747	unsigned long flags;
2748	int ret;
2749
2750	spin_lock_irqsave(&domain->lock, flags);
2751	ret = __set_gcr3(domain, pasid, cr3);
2752	spin_unlock_irqrestore(&domain->lock, flags);
2753
2754	return ret;
2755}
2756EXPORT_SYMBOL(amd_iommu_domain_set_gcr3);
2757
2758int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid)
2759{
2760	struct protection_domain *domain = to_pdomain(dom);
2761	unsigned long flags;
2762	int ret;
2763
2764	spin_lock_irqsave(&domain->lock, flags);
2765	ret = __clear_gcr3(domain, pasid);
2766	spin_unlock_irqrestore(&domain->lock, flags);
2767
2768	return ret;
2769}
2770EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3);
2771
2772int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid,
2773			   int status, int tag)
2774{
2775	struct iommu_dev_data *dev_data;
2776	struct amd_iommu *iommu;
2777	struct iommu_cmd cmd;
2778
2779	dev_data = dev_iommu_priv_get(&pdev->dev);
2780	iommu    = rlookup_amd_iommu(&pdev->dev);
2781	if (!iommu)
2782		return -ENODEV;
2783
2784	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
2785			   tag, dev_data->pri_tlp);
2786
2787	return iommu_queue_command(iommu, &cmd);
2788}
2789EXPORT_SYMBOL(amd_iommu_complete_ppr);
2790
2791int amd_iommu_device_info(struct pci_dev *pdev,
2792                          struct amd_iommu_device_info *info)
2793{
2794	int max_pasids;
2795	int pos;
2796
2797	if (pdev == NULL || info == NULL)
2798		return -EINVAL;
2799
2800	if (!amd_iommu_v2_supported())
2801		return -EINVAL;
2802
2803	memset(info, 0, sizeof(*info));
2804
2805	if (pci_ats_supported(pdev))
2806		info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
2807
2808	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
2809	if (pos)
2810		info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
2811
2812	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
2813	if (pos) {
2814		int features;
2815
2816		max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1));
2817		max_pasids = min(max_pasids, (1 << 20));
2818
2819		info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
2820		info->max_pasids = min(pci_max_pasids(pdev), max_pasids);
2821
2822		features = pci_pasid_features(pdev);
2823		if (features & PCI_PASID_CAP_EXEC)
2824			info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
2825		if (features & PCI_PASID_CAP_PRIV)
2826			info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
2827	}
2828
2829	return 0;
2830}
2831EXPORT_SYMBOL(amd_iommu_device_info);
2832
2833#ifdef CONFIG_IRQ_REMAP
2834
2835/*****************************************************************************
2836 *
2837 * Interrupt Remapping Implementation
2838 *
2839 *****************************************************************************/
2840
2841static struct irq_chip amd_ir_chip;
2842static DEFINE_SPINLOCK(iommu_table_lock);
2843
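/*
 * Invalidate the interrupt remapping table cache for @devid and wait for
 * completion, unless IRT caching is disabled on this IOMMU (irtcachedis),
 * in which case no flush is needed.
 */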
2844static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid)
2845{
2846	int ret;
2847	u64 data;
2848	unsigned long flags;
2849	struct iommu_cmd cmd, cmd2;
2850
2851	if (iommu->irtcachedis_enabled)
2852		return;
2853
2854	build_inv_irt(&cmd, devid);
2855	data = atomic64_add_return(1, &iommu->cmd_sem_val);
2856	build_completion_wait(&cmd2, iommu, data);
2857
2858	raw_spin_lock_irqsave(&iommu->lock, flags);
2859	ret = __iommu_queue_command_sync(iommu, &cmd, true);
2860	if (ret)
2861		goto out;
2862	ret = __iommu_queue_command_sync(iommu, &cmd2, false);
2863	if (ret)
2864		goto out;
2865	wait_on_sem(iommu, data);
2866out:
2867	raw_spin_unlock_irqrestore(&iommu->lock, flags);
2868}
2869
2870static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid,
2871			      struct irq_remap_table *table)
2872{
2873	u64 dte;
2874	struct dev_table_entry *dev_table = get_dev_table(iommu);
2875
2876	dte	= dev_table[devid].data[2];
2877	dte	&= ~DTE_IRQ_PHYS_ADDR_MASK;
2878	dte	|= iommu_virt_to_phys(table->table);
2879	dte	|= DTE_IRQ_REMAP_INTCTL;
2880	dte	|= DTE_INTTABLEN;
2881	dte	|= DTE_IRQ_REMAP_ENABLE;
2882
2883	dev_table[devid].data[2] = dte;
2884}
2885
2886static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid)
2887{
2888	struct irq_remap_table *table;
2889	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2890
2891	if (WARN_ONCE(!pci_seg->rlookup_table[devid],
2892		      "%s: no iommu for devid %x:%x\n",
2893		      __func__, pci_seg->id, devid))
2894		return NULL;
2895
2896	table = pci_seg->irq_lookup_table[devid];
2897	if (WARN_ONCE(!table, "%s: no table for devid %x:%x\n",
2898		      __func__, pci_seg->id, devid))
2899		return NULL;
2900
2901	return table;
2902}
2903
2904static struct irq_remap_table *__alloc_irq_table(void)
2905{
2906	struct irq_remap_table *table;
2907
2908	table = kzalloc(sizeof(*table), GFP_KERNEL);
2909	if (!table)
2910		return NULL;
2911
2912	table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
2913	if (!table->table) {
2914		kfree(table);
2915		return NULL;
2916	}
2917	raw_spin_lock_init(&table->lock);
2918
2919	if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
2920		memset(table->table, 0,
2921		       MAX_IRQS_PER_TABLE * sizeof(u32));
2922	else
2923		memset(table->table, 0,
2924		       (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
2925	return table;
2926}
2927
2928static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
2929				  struct irq_remap_table *table)
2930{
2931	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2932
2933	pci_seg->irq_lookup_table[devid] = table;
2934	set_dte_irq_entry(iommu, devid, table);
2935	iommu_flush_dte(iommu, devid);
2936}
2937
2938static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias,
2939				       void *data)
2940{
2941	struct irq_remap_table *table = data;
2942	struct amd_iommu_pci_seg *pci_seg;
2943	struct amd_iommu *iommu = rlookup_amd_iommu(&pdev->dev);
2944
2945	if (!iommu)
2946		return -EINVAL;
2947
2948	pci_seg = iommu->pci_seg;
2949	pci_seg->irq_lookup_table[alias] = table;
2950	set_dte_irq_entry(iommu, alias, table);
2951	iommu_flush_dte(pci_seg->rlookup_table[alias], alias);
2952
2953	return 0;
2954}
2955
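/*
 * Find or create the interrupt remapping table for @devid. The table is
 * shared with the device's PCI alias and, if @pdev is given, with all of
 * its DMA aliases. The table lock is dropped while a new table is
 * allocated, so the lookup is repeated afterwards and a racing allocation
 * wins.
 */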
2956static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu,
2957					       u16 devid, struct pci_dev *pdev)
2958{
2959	struct irq_remap_table *table = NULL;
2960	struct irq_remap_table *new_table = NULL;
2961	struct amd_iommu_pci_seg *pci_seg;
2962	unsigned long flags;
2963	u16 alias;
2964
2965	spin_lock_irqsave(&iommu_table_lock, flags);
2966
2967	pci_seg = iommu->pci_seg;
2968	table = pci_seg->irq_lookup_table[devid];
2969	if (table)
2970		goto out_unlock;
2971
2972	alias = pci_seg->alias_table[devid];
2973	table = pci_seg->irq_lookup_table[alias];
2974	if (table) {
2975		set_remap_table_entry(iommu, devid, table);
2976		goto out_wait;
2977	}
2978	spin_unlock_irqrestore(&iommu_table_lock, flags);
2979
2980	/* Nothing there yet, allocate new irq remapping table */
2981	new_table = __alloc_irq_table();
2982	if (!new_table)
2983		return NULL;
2984
2985	spin_lock_irqsave(&iommu_table_lock, flags);
2986
2987	table = pci_seg->irq_lookup_table[devid];
2988	if (table)
2989		goto out_unlock;
2990
2991	table = pci_seg->irq_lookup_table[alias];
2992	if (table) {
2993		set_remap_table_entry(iommu, devid, table);
2994		goto out_wait;
2995	}
2996
2997	table = new_table;
2998	new_table = NULL;
2999
3000	if (pdev)
3001		pci_for_each_dma_alias(pdev, set_remap_table_entry_alias,
3002				       table);
3003	else
3004		set_remap_table_entry(iommu, devid, table);
3005
3006	if (devid != alias)
3007		set_remap_table_entry(iommu, alias, table);
3008
3009out_wait:
3010	iommu_completion_wait(iommu);
3011
3012out_unlock:
3013	spin_unlock_irqrestore(&iommu_table_lock, flags);
3014
3015	if (new_table) {
3016		kmem_cache_free(amd_iommu_irq_cache, new_table->table);
3017		kfree(new_table);
3018	}
3019	return table;
3020}
3021
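/*
 * Reserve @count consecutive entries in the interrupt remapping table of
 * @devid, optionally aligned to the next power of two of @count (as
 * multi-MSI requires). Returns the first index, -ENOSPC if no large
 * enough run of free entries exists, or -ENODEV without a table.
 */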
3022static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count,
3023			   bool align, struct pci_dev *pdev)
3024{
3025	struct irq_remap_table *table;
3026	int index, c, alignment = 1;
3027	unsigned long flags;
3028
3029	table = alloc_irq_table(iommu, devid, pdev);
3030	if (!table)
3031		return -ENODEV;
3032
3033	if (align)
3034		alignment = roundup_pow_of_two(count);
3035
3036	raw_spin_lock_irqsave(&table->lock, flags);
3037
3038	/* Scan table for free entries */
3039	for (index = ALIGN(table->min_index, alignment), c = 0;
3040	     index < MAX_IRQS_PER_TABLE;) {
3041		if (!iommu->irte_ops->is_allocated(table, index)) {
3042			c += 1;
3043		} else {
3044			c     = 0;
3045			index = ALIGN(index + 1, alignment);
3046			continue;
3047		}
3048
3049		if (c == count)	{
3050			for (; c != 0; --c)
3051				iommu->irte_ops->set_allocated(table, index - c + 1);
3052
3053			index -= count - 1;
3054			goto out;
3055		}
3056
3057		index++;
3058	}
3059
3060	index = -ENOSPC;
3061
3062out:
3063	raw_spin_unlock_irqrestore(&table->lock, flags);
3064
3065	return index;
3066}
3067
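/*
 * Update a 128-bit (guest virtual APIC capable) IRTE in place and flush
 * the IOMMU's interrupt table cache for @devid.
 */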
3068static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
3069			  struct irte_ga *irte)
3070{
3071	struct irq_remap_table *table;
3072	struct irte_ga *entry;
3073	unsigned long flags;
3074	u128 old;
3075
3076	table = get_irq_table(iommu, devid);
3077	if (!table)
3078		return -ENOMEM;
3079
3080	raw_spin_lock_irqsave(&table->lock, flags);
3081
3082	entry = (struct irte_ga *)table->table;
3083	entry = &entry[index];
3084
3085	/*
3086	 * We use cmpxchg128 to atomically update the 128-bit IRTE,
3087	 * and it cannot be updated by the hardware or other processors
3088	 * behind us, so the return value of cmpxchg128 should be the
3089	 * same as the old value.
3090	 */
3091	old = entry->irte;
3092	WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte));
3093
3094	raw_spin_unlock_irqrestore(&table->lock, flags);
3095
3096	iommu_flush_irt_and_complete(iommu, devid);
3097
3098	return 0;
3099}
3100
3101static int modify_irte(struct amd_iommu *iommu,
3102		       u16 devid, int index, union irte *irte)
3103{
3104	struct irq_remap_table *table;
3105	unsigned long flags;
3106
3107	table = get_irq_table(iommu, devid);
3108	if (!table)
3109		return -ENOMEM;
3110
3111	raw_spin_lock_irqsave(&table->lock, flags);
3112	table->table[index] = irte->val;
3113	raw_spin_unlock_irqrestore(&table->lock, flags);
3114
3115	iommu_flush_irt_and_complete(iommu, devid);
3116
3117	return 0;
3118}
3119
3120static void free_irte(struct amd_iommu *iommu, u16 devid, int index)
3121{
3122	struct irq_remap_table *table;
3123	unsigned long flags;
3124
3125	table = get_irq_table(iommu, devid);
3126	if (!table)
3127		return;
3128
3129	raw_spin_lock_irqsave(&table->lock, flags);
3130	iommu->irte_ops->clear_allocated(table, index);
3131	raw_spin_unlock_irqrestore(&table->lock, flags);
3132
3133	iommu_flush_irt_and_complete(iommu, devid);
3134}
3135
3136static void irte_prepare(void *entry,
3137			 u32 delivery_mode, bool dest_mode,
3138			 u8 vector, u32 dest_apicid, int devid)
3139{
3140	union irte *irte = (union irte *) entry;
3141
3142	irte->val                = 0;
3143	irte->fields.vector      = vector;
3144	irte->fields.int_type    = delivery_mode;
3145	irte->fields.destination = dest_apicid;
3146	irte->fields.dm          = dest_mode;
3147	irte->fields.valid       = 1;
3148}
3149
3150static void irte_ga_prepare(void *entry,
3151			    u32 delivery_mode, bool dest_mode,
3152			    u8 vector, u32 dest_apicid, int devid)
3153{
3154	struct irte_ga *irte = (struct irte_ga *) entry;
3155
3156	irte->lo.val                      = 0;
3157	irte->hi.val                      = 0;
3158	irte->lo.fields_remap.int_type    = delivery_mode;
3159	irte->lo.fields_remap.dm          = dest_mode;
3160	irte->hi.fields.vector            = vector;
3161	irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid);
3162	irte->hi.fields.destination       = APICID_TO_IRTE_DEST_HI(dest_apicid);
3163	irte->lo.fields_remap.valid       = 1;
3164}
3165
3166static void irte_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3167{
3168	union irte *irte = (union irte *) entry;
3169
3170	irte->fields.valid = 1;
3171	modify_irte(iommu, devid, index, irte);
3172}
3173
3174static void irte_ga_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3175{
3176	struct irte_ga *irte = (struct irte_ga *) entry;
3177
3178	irte->lo.fields_remap.valid = 1;
3179	modify_irte_ga(iommu, devid, index, irte);
3180}
3181
3182static void irte_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3183{
3184	union irte *irte = (union irte *) entry;
3185
3186	irte->fields.valid = 0;
3187	modify_irte(iommu, devid, index, irte);
3188}
3189
3190static void irte_ga_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3191{
3192	struct irte_ga *irte = (struct irte_ga *) entry;
3193
3194	irte->lo.fields_remap.valid = 0;
3195	modify_irte_ga(iommu, devid, index, irte);
3196}
3197
3198static void irte_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3199			      u8 vector, u32 dest_apicid)
3200{
3201	union irte *irte = (union irte *) entry;
3202
3203	irte->fields.vector = vector;
3204	irte->fields.destination = dest_apicid;
3205	modify_irte(iommu, devid, index, irte);
3206}
3207
3208static void irte_ga_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3209				 u8 vector, u32 dest_apicid)
3210{
3211	struct irte_ga *irte = (struct irte_ga *) entry;
3212
3213	if (!irte->lo.fields_remap.guest_mode) {
3214		irte->hi.fields.vector = vector;
3215		irte->lo.fields_remap.destination =
3216					APICID_TO_IRTE_DEST_LO(dest_apicid);
3217		irte->hi.fields.destination =
3218					APICID_TO_IRTE_DEST_HI(dest_apicid);
3219		modify_irte_ga(iommu, devid, index, irte);
3220	}
3221}
3222
3223#define IRTE_ALLOCATED (~1U)
3224static void irte_set_allocated(struct irq_remap_table *table, int index)
3225{
3226	table->table[index] = IRTE_ALLOCATED;
3227}
3228
3229static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
3230{
3231	struct irte_ga *ptr = (struct irte_ga *)table->table;
3232	struct irte_ga *irte = &ptr[index];
3233
3234	memset(&irte->lo.val, 0, sizeof(u64));
3235	memset(&irte->hi.val, 0, sizeof(u64));
3236	irte->hi.fields.vector = 0xff;
3237}
3238
3239static bool irte_is_allocated(struct irq_remap_table *table, int index)
3240{
3241	union irte *ptr = (union irte *)table->table;
3242	union irte *irte = &ptr[index];
3243
3244	return irte->val != 0;
3245}
3246
3247static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
3248{
3249	struct irte_ga *ptr = (struct irte_ga *)table->table;
3250	struct irte_ga *irte = &ptr[index];
3251
3252	return irte->hi.fields.vector != 0;
3253}
3254
3255static void irte_clear_allocated(struct irq_remap_table *table, int index)
3256{
3257	table->table[index] = 0;
3258}
3259
3260static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
3261{
3262	struct irte_ga *ptr = (struct irte_ga *)table->table;
3263	struct irte_ga *irte = &ptr[index];
3264
3265	memset(&irte->lo.val, 0, sizeof(u64));
3266	memset(&irte->hi.val, 0, sizeof(u64));
3267}
3268
3269static int get_devid(struct irq_alloc_info *info)
3270{
3271	switch (info->type) {
3272	case X86_IRQ_ALLOC_TYPE_IOAPIC:
3273		return get_ioapic_devid(info->devid);
3274	case X86_IRQ_ALLOC_TYPE_HPET:
3275		return get_hpet_devid(info->devid);
3276	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3277	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3278		return get_device_sbdf_id(msi_desc_to_dev(info->desc));
3279	default:
3280		WARN_ON_ONCE(1);
3281		return -1;
3282	}
3283}
3284
3285struct irq_remap_ops amd_iommu_irq_ops = {
3286	.prepare		= amd_iommu_prepare,
3287	.enable			= amd_iommu_enable,
3288	.disable		= amd_iommu_disable,
3289	.reenable		= amd_iommu_reenable,
3290	.enable_faulting	= amd_iommu_enable_faulting,
3291};
3292
3293static void fill_msi_msg(struct msi_msg *msg, u32 index)
3294{
3295	msg->data = index;
3296	msg->address_lo = 0;
3297	msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
3298	msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
3299}
3300
3301static void irq_remapping_prepare_irte(struct amd_ir_data *data,
3302				       struct irq_cfg *irq_cfg,
3303				       struct irq_alloc_info *info,
3304				       int devid, int index, int sub_handle)
3305{
3306	struct irq_2_irte *irte_info = &data->irq_2_irte;
3307	struct amd_iommu *iommu = data->iommu;
3308
3309	if (!iommu)
3310		return;
3311
3312	data->irq_2_irte.devid = devid;
3313	data->irq_2_irte.index = index + sub_handle;
3314	iommu->irte_ops->prepare(data->entry, apic->delivery_mode,
3315				 apic->dest_mode_logical, irq_cfg->vector,
3316				 irq_cfg->dest_apicid, devid);
3317
3318	switch (info->type) {
3319	case X86_IRQ_ALLOC_TYPE_IOAPIC:
3320	case X86_IRQ_ALLOC_TYPE_HPET:
3321	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3322	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3323		fill_msi_msg(&data->msi_entry, irte_info->index);
3324		break;
3325
3326	default:
3327		BUG_ON(1);
3328		break;
3329	}
3330}
3331
3332struct amd_irte_ops irte_32_ops = {
3333	.prepare = irte_prepare,
3334	.activate = irte_activate,
3335	.deactivate = irte_deactivate,
3336	.set_affinity = irte_set_affinity,
3337	.set_allocated = irte_set_allocated,
3338	.is_allocated = irte_is_allocated,
3339	.clear_allocated = irte_clear_allocated,
3340};
3341
3342struct amd_irte_ops irte_128_ops = {
3343	.prepare = irte_ga_prepare,
3344	.activate = irte_ga_activate,
3345	.deactivate = irte_ga_deactivate,
3346	.set_affinity = irte_ga_set_affinity,
3347	.set_allocated = irte_ga_set_allocated,
3348	.is_allocated = irte_ga_is_allocated,
3349	.clear_allocated = irte_ga_clear_allocated,
3350};
3351
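/*
 * irq_domain allocation callback: allocate the parent (vector) resources,
 * reserve IRT entries and attach an amd_ir_data to each irq_data. For
 * IOAPIC interrupts the first 32 table entries are pre-allocated so the
 * pin number can be used as the IRT index directly.
 */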
3352static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
3353			       unsigned int nr_irqs, void *arg)
3354{
3355	struct irq_alloc_info *info = arg;
3356	struct irq_data *irq_data;
3357	struct amd_ir_data *data = NULL;
3358	struct amd_iommu *iommu;
3359	struct irq_cfg *cfg;
3360	int i, ret, devid, seg, sbdf;
3361	int index;
3362
3363	if (!info)
3364		return -EINVAL;
3365	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI)
3366		return -EINVAL;
3367
3368	sbdf = get_devid(info);
3369	if (sbdf < 0)
3370		return -EINVAL;
3371
3372	seg = PCI_SBDF_TO_SEGID(sbdf);
3373	devid = PCI_SBDF_TO_DEVID(sbdf);
3374	iommu = __rlookup_amd_iommu(seg, devid);
3375	if (!iommu)
3376		return -EINVAL;
3377
3378	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
3379	if (ret < 0)
3380		return ret;
3381
3382	if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
3383		struct irq_remap_table *table;
3384
3385		table = alloc_irq_table(iommu, devid, NULL);
3386		if (table) {
3387			if (!table->min_index) {
3388				/*
3389				 * Keep the first 32 indexes free for IOAPIC
3390				 * interrupts.
3391				 */
3392				table->min_index = 32;
3393				for (i = 0; i < 32; ++i)
3394					iommu->irte_ops->set_allocated(table, i);
3395			}
3396			WARN_ON(table->min_index != 32);
3397			index = info->ioapic.pin;
3398		} else {
3399			index = -ENOMEM;
3400		}
3401	} else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI ||
3402		   info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) {
3403		bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI);
3404
3405		index = alloc_irq_index(iommu, devid, nr_irqs, align,
3406					msi_desc_to_pci_dev(info->desc));
3407	} else {
3408		index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL);
3409	}
3410
3411	if (index < 0) {
3412		pr_warn("Failed to allocate IRTE\n");
3413		ret = index;
3414		goto out_free_parent;
3415	}
3416
3417	for (i = 0; i < nr_irqs; i++) {
3418		irq_data = irq_domain_get_irq_data(domain, virq + i);
3419		cfg = irq_data ? irqd_cfg(irq_data) : NULL;
3420		if (!cfg) {
3421			ret = -EINVAL;
3422			goto out_free_data;
3423		}
3424
3425		ret = -ENOMEM;
3426		data = kzalloc(sizeof(*data), GFP_KERNEL);
3427		if (!data)
3428			goto out_free_data;
3429
3430		if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
3431			data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
3432		else
3433			data->entry = kzalloc(sizeof(struct irte_ga),
3434						     GFP_KERNEL);
3435		if (!data->entry) {
3436			kfree(data);
3437			goto out_free_data;
3438		}
3439
3440		data->iommu = iommu;
3441		irq_data->hwirq = (devid << 16) + i;
3442		irq_data->chip_data = data;
3443		irq_data->chip = &amd_ir_chip;
3444		irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
3445		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
3446	}
3447
3448	return 0;
3449
3450out_free_data:
3451	for (i--; i >= 0; i--) {
3452		irq_data = irq_domain_get_irq_data(domain, virq + i);
3453		if (irq_data)
3454			kfree(irq_data->chip_data);
3455	}
3456	for (i = 0; i < nr_irqs; i++)
3457		free_irte(iommu, devid, index + i);
3458out_free_parent:
3459	irq_domain_free_irqs_common(domain, virq, nr_irqs);
3460	return ret;
3461}
3462
3463static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
3464			       unsigned int nr_irqs)
3465{
3466	struct irq_2_irte *irte_info;
3467	struct irq_data *irq_data;
3468	struct amd_ir_data *data;
3469	int i;
3470
3471	for (i = 0; i < nr_irqs; i++) {
3472		irq_data = irq_domain_get_irq_data(domain, virq + i);
3473		if (irq_data && irq_data->chip_data) {
3474			data = irq_data->chip_data;
3475			irte_info = &data->irq_2_irte;
3476			free_irte(data->iommu, irte_info->devid, irte_info->index);
3477			kfree(data->entry);
3478			kfree(data);
3479		}
3480	}
3481	irq_domain_free_irqs_common(domain, virq, nr_irqs);
3482}
3483
3484static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3485			       struct amd_ir_data *ir_data,
3486			       struct irq_2_irte *irte_info,
3487			       struct irq_cfg *cfg);
3488
3489static int irq_remapping_activate(struct irq_domain *domain,
3490				  struct irq_data *irq_data, bool reserve)
3491{
3492	struct amd_ir_data *data = irq_data->chip_data;
3493	struct irq_2_irte *irte_info = &data->irq_2_irte;
3494	struct amd_iommu *iommu = data->iommu;
3495	struct irq_cfg *cfg = irqd_cfg(irq_data);
3496
3497	if (!iommu)
3498		return 0;
3499
3500	iommu->irte_ops->activate(iommu, data->entry, irte_info->devid,
3501				  irte_info->index);
3502	amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg);
3503	return 0;
3504}
3505
3506static void irq_remapping_deactivate(struct irq_domain *domain,
3507				     struct irq_data *irq_data)
3508{
3509	struct amd_ir_data *data = irq_data->chip_data;
3510	struct irq_2_irte *irte_info = &data->irq_2_irte;
3511	struct amd_iommu *iommu = data->iommu;
3512
3513	if (iommu)
3514		iommu->irte_ops->deactivate(iommu, data->entry, irte_info->devid,
3515					    irte_info->index);
3516}
3517
3518static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec,
3519				enum irq_domain_bus_token bus_token)
3520{
3521	struct amd_iommu *iommu;
3522	int devid = -1;
3523
3524	if (!amd_iommu_irq_remap)
3525		return 0;
3526
3527	if (x86_fwspec_is_ioapic(fwspec))
3528		devid = get_ioapic_devid(fwspec->param[0]);
3529	else if (x86_fwspec_is_hpet(fwspec))
3530		devid = get_hpet_devid(fwspec->param[0]);
3531
3532	if (devid < 0)
3533		return 0;
3534	iommu = __rlookup_amd_iommu((devid >> 16), (devid & 0xffff));
3535
3536	return iommu && iommu->ir_domain == d;
3537}
3538
3539static const struct irq_domain_ops amd_ir_domain_ops = {
3540	.select = irq_remapping_select,
3541	.alloc = irq_remapping_alloc,
3542	.free = irq_remapping_free,
3543	.activate = irq_remapping_activate,
3544	.deactivate = irq_remapping_deactivate,
3545};
3546
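/*
 * Rewrite an IRTE into guest (vAPIC) mode so the interrupt is delivered
 * through the guest's virtual APIC backing page instead of being remapped
 * to a host vector.
 */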
3547int amd_iommu_activate_guest_mode(void *data)
3548{
3549	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3550	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3551	u64 valid;
3552
3553	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry)
3554		return 0;
3555
3556	valid = entry->lo.fields_vapic.valid;
3557
3558	entry->lo.val = 0;
3559	entry->hi.val = 0;
3560
3561	entry->lo.fields_vapic.valid       = valid;
3562	entry->lo.fields_vapic.guest_mode  = 1;
3563	entry->lo.fields_vapic.ga_log_intr = 1;
3564	entry->hi.fields.ga_root_ptr       = ir_data->ga_root_ptr;
3565	entry->hi.fields.vector            = ir_data->ga_vector;
3566	entry->lo.fields_vapic.ga_tag      = ir_data->ga_tag;
3567
3568	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3569			      ir_data->irq_2_irte.index, entry);
3570}
3571EXPORT_SYMBOL(amd_iommu_activate_guest_mode);
3572
3573int amd_iommu_deactivate_guest_mode(void *data)
3574{
3575	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3576	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3577	struct irq_cfg *cfg = ir_data->cfg;
3578	u64 valid;
3579
3580	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3581	    !entry || !entry->lo.fields_vapic.guest_mode)
3582		return 0;
3583
3584	valid = entry->lo.fields_remap.valid;
3585
3586	entry->lo.val = 0;
3587	entry->hi.val = 0;
3588
3589	entry->lo.fields_remap.valid       = valid;
3590	entry->lo.fields_remap.dm          = apic->dest_mode_logical;
3591	entry->lo.fields_remap.int_type    = apic->delivery_mode;
3592	entry->hi.fields.vector            = cfg->vector;
3593	entry->lo.fields_remap.destination =
3594				APICID_TO_IRTE_DEST_LO(cfg->dest_apicid);
3595	entry->hi.fields.destination =
3596				APICID_TO_IRTE_DEST_HI(cfg->dest_apicid);
3597
3598	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3599			      ir_data->irq_2_irte.index, entry);
3600}
3601EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
3602
3603static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
3604{
3605	int ret;
3606	struct amd_iommu_pi_data *pi_data = vcpu_info;
3607	struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
3608	struct amd_ir_data *ir_data = data->chip_data;
3609	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3610	struct iommu_dev_data *dev_data;
3611
3612	if (ir_data->iommu == NULL)
3613		return -EINVAL;
3614
3615	dev_data = search_dev_data(ir_data->iommu, irte_info->devid);
3616
3617	/* Note:
3618	 * This device has never been set up for guest mode.
3619	 * we should not modify the IRTE
3620	 * We should not modify the IRTE.
3621	if (!dev_data || !dev_data->use_vapic)
3622		return 0;
3623
3624	ir_data->cfg = irqd_cfg(data);
3625	pi_data->ir_data = ir_data;
3626
3627	/* Note:
3628	 * SVM tries to set up for VAPIC mode, but the IOMMU is running in
3629	 * legacy interrupt remapping mode, so we force legacy mode instead.
3630	 */
3631	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
3632		pr_debug("%s: Fall back to using intr legacy remap\n",
3633			 __func__);
3634		pi_data->is_guest_mode = false;
3635	}
3636
3637	pi_data->prev_ga_tag = ir_data->cached_ga_tag;
3638	if (pi_data->is_guest_mode) {
3639		ir_data->ga_root_ptr = (pi_data->base >> 12);
3640		ir_data->ga_vector = vcpu_pi_info->vector;
3641		ir_data->ga_tag = pi_data->ga_tag;
3642		ret = amd_iommu_activate_guest_mode(ir_data);
3643		if (!ret)
3644			ir_data->cached_ga_tag = pi_data->ga_tag;
3645	} else {
3646		ret = amd_iommu_deactivate_guest_mode(ir_data);
3647
3648		/*
3649		 * This communicates the ga_tag back to the caller
3650		 * so that it can do all the necessary clean up.
3651		 */
3652		if (!ret)
3653			ir_data->cached_ga_tag = 0;
3654	}
3655
3656	return ret;
3657}
3658
3659
3660static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3661			       struct amd_ir_data *ir_data,
3662			       struct irq_2_irte *irte_info,
3663			       struct irq_cfg *cfg)
3664{
3665
3666	/*
3667	 * Atomically updates the IRTE with the new destination, vector
3668	 * and flushes the interrupt entry cache.
3669	 */
3670	iommu->irte_ops->set_affinity(iommu, ir_data->entry, irte_info->devid,
3671				      irte_info->index, cfg->vector,
3672				      cfg->dest_apicid);
3673}
3674
3675static int amd_ir_set_affinity(struct irq_data *data,
3676			       const struct cpumask *mask, bool force)
3677{
3678	struct amd_ir_data *ir_data = data->chip_data;
3679	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3680	struct irq_cfg *cfg = irqd_cfg(data);
3681	struct irq_data *parent = data->parent_data;
3682	struct amd_iommu *iommu = ir_data->iommu;
3683	int ret;
3684
3685	if (!iommu)
3686		return -ENODEV;
3687
3688	ret = parent->chip->irq_set_affinity(parent, mask, force);
3689	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
3690		return ret;
3691
3692	amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg);
3693	/*
3694	 * After this point, all the interrupts will start arriving
3695	 * at the new destination. So, time to cleanup the previous
3696	 * vector allocation.
3697	 */
3698	vector_schedule_cleanup(cfg);
3699
3700	return IRQ_SET_MASK_OK_DONE;
3701}
3702
3703static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
3704{
3705	struct amd_ir_data *ir_data = irq_data->chip_data;
3706
3707	*msg = ir_data->msi_entry;
3708}
3709
3710static struct irq_chip amd_ir_chip = {
3711	.name			= "AMD-IR",
3712	.irq_ack		= apic_ack_irq,
3713	.irq_set_affinity	= amd_ir_set_affinity,
3714	.irq_set_vcpu_affinity	= amd_ir_set_vcpu_affinity,
3715	.irq_compose_msi_msg	= ir_compose_msi_msg,
3716};
3717
3718static const struct msi_parent_ops amdvi_msi_parent_ops = {
3719	.supported_flags	= X86_VECTOR_MSI_FLAGS_SUPPORTED |
3720				  MSI_FLAG_MULTI_PCI_MSI |
3721				  MSI_FLAG_PCI_IMS,
3722	.prefix			= "IR-",
3723	.init_dev_msi_info	= msi_parent_init_dev_msi_info,
3724};
3725
3726static const struct msi_parent_ops virt_amdvi_msi_parent_ops = {
3727	.supported_flags	= X86_VECTOR_MSI_FLAGS_SUPPORTED |
3728				  MSI_FLAG_MULTI_PCI_MSI,
3729	.prefix			= "vIR-",
3730	.init_dev_msi_info	= msi_parent_init_dev_msi_info,
3731};
3732
3733int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
3734{
3735	struct fwnode_handle *fn;
3736
3737	fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index);
3738	if (!fn)
3739		return -ENOMEM;
3740	iommu->ir_domain = irq_domain_create_hierarchy(arch_get_ir_parent_domain(), 0, 0,
3741						       fn, &amd_ir_domain_ops, iommu);
3742	if (!iommu->ir_domain) {
3743		irq_domain_free_fwnode(fn);
3744		return -ENOMEM;
3745	}
3746
3747	irq_domain_update_bus_token(iommu->ir_domain, DOMAIN_BUS_AMDVI);
3748	iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT |
3749				   IRQ_DOMAIN_FLAG_ISOLATED_MSI;
3750
3751	if (amd_iommu_np_cache)
3752		iommu->ir_domain->msi_parent_ops = &virt_amdvi_msi_parent_ops;
3753	else
3754		iommu->ir_domain->msi_parent_ops = &amdvi_msi_parent_ops;
3755
3756	return 0;
3757}
3758
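/*
 * Retarget a guest-mode IRTE at the physical CPU a vCPU runs on and
 * update its is_run state; typically called from the KVM AVIC code on
 * vCPU load/put.
 */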
3759int amd_iommu_update_ga(int cpu, bool is_run, void *data)
3760{
3761	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3762	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3763
3764	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3765	    !entry || !entry->lo.fields_vapic.guest_mode)
3766		return 0;
3767
3768	if (!ir_data->iommu)
3769		return -ENODEV;
3770
3771	if (cpu >= 0) {
3772		entry->lo.fields_vapic.destination =
3773					APICID_TO_IRTE_DEST_LO(cpu);
3774		entry->hi.fields.destination =
3775					APICID_TO_IRTE_DEST_HI(cpu);
3776	}
3777	entry->lo.fields_vapic.is_run = is_run;
3778
3779	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3780			      ir_data->irq_2_irte.index, entry);
3781}
3782EXPORT_SYMBOL(amd_iommu_update_ga);
3783#endif
3784