// SPDX-License-Identifier: GPL-2.0
/*
 * intel-pasid.c - PASID idr, table and entry manipulation
 *
 * Copyright (C) 2018 Intel Corporation
 *
 * Author: Lu Baolu <baolu.lu@linux.intel.com>
 */

#define pr_fmt(fmt)	"DMAR: " fmt

#include <linux/bitops.h>
#include <linux/cpufeature.h>
#include <linux/dmar.h>
#include <linux/intel-iommu.h>
#include <linux/iommu.h>
#include <linux/memory.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/spinlock.h>

#include "pasid.h"

/*
 * Intel IOMMU system wide PASID name space:
 */
u32 intel_pasid_max_id = PASID_MAX;

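/*
 * vcmd_alloc_pasid()/vcmd_free_pasid() below drive the hardware virtual
 * command interface: write the command (and operand) to DMAR_VCMD_REG,
 * then poll DMAR_VCRSP_REG until the In Progress (IP) bit clears. The
 * status code, and the allocated PASID on a successful allocation, are
 * encoded in the response register.
 */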
int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid)
{
	unsigned long flags;
	u8 status_code;
	int ret = 0;
	u64 res;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC);
	IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq,
		      !(res & VCMD_VRSP_IP), res);
	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);

	status_code = VCMD_VRSP_SC(res);
	switch (status_code) {
	case VCMD_VRSP_SC_SUCCESS:
		*pasid = VCMD_VRSP_RESULT_PASID(res);
		break;
	case VCMD_VRSP_SC_NO_PASID_AVAIL:
		pr_info("IOMMU: %s: No PASID available\n", iommu->name);
		ret = -ENOSPC;
		break;
	default:
		ret = -ENODEV;
		pr_warn("IOMMU: %s: Unexpected error code %d\n",
			iommu->name, status_code);
	}

	return ret;
}

void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid)
{
	unsigned long flags;
	u8 status_code;
	u64 res;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	dmar_writeq(iommu->reg + DMAR_VCMD_REG,
		    VCMD_CMD_OPERAND(pasid) | VCMD_CMD_FREE);
	IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq,
		      !(res & VCMD_VRSP_IP), res);
	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);

	status_code = VCMD_VRSP_SC(res);
	switch (status_code) {
	case VCMD_VRSP_SC_SUCCESS:
		break;
	case VCMD_VRSP_SC_INVALID_PASID:
		pr_info("IOMMU: %s: Invalid PASID\n", iommu->name);
		break;
	default:
		pr_warn("IOMMU: %s: Unexpected error code %d\n",
			iommu->name, status_code);
	}
}

/*
 * Per device pasid table management:
 */
static inline void
device_attach_pasid_table(struct device_domain_info *info,
			  struct pasid_table *pasid_table)
{
	info->pasid_table = pasid_table;
	list_add(&info->table, &pasid_table->dev);
}

static inline void
device_detach_pasid_table(struct device_domain_info *info,
			  struct pasid_table *pasid_table)
{
	info->pasid_table = NULL;
	list_del(&info->table);
}

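/*
 * Search context for the DMA alias walk below: the alias's segment, bus
 * and devfn are the lookup key, *pasid_table receives the table of a
 * matching device that already has one.
 */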
struct pasid_table_opaque {
	struct pasid_table	**pasid_table;
	int			segment;
	int			bus;
	int			devfn;
};

static int search_pasid_table(struct device_domain_info *info, void *opaque)
{
	struct pasid_table_opaque *data = opaque;

	if (info->iommu->segment == data->segment &&
	    info->bus == data->bus &&
	    info->devfn == data->devfn &&
	    info->pasid_table) {
		*data->pasid_table = info->pasid_table;
		return 1;
	}

	return 0;
}

static int get_alias_pasid_table(struct pci_dev *pdev, u16 alias, void *opaque)
{
	struct pasid_table_opaque *data = opaque;

	data->segment = pci_domain_nr(pdev->bus);
	data->bus = PCI_BUS_NUM(alias);
	data->devfn = alias & 0xff;

	return for_each_device_domain(&search_pasid_table, data);
}

/*
 * Allocate a PASID table for @dev. Must be called in a single-threaded
 * context.
 */
int intel_pasid_alloc_table(struct device *dev)
{
	struct device_domain_info *info;
	struct pasid_table *pasid_table;
	struct pasid_table_opaque data;
	struct page *pages;
	u32 max_pasid = 0;
	int ret, order;
	int size;

	might_sleep();
	info = get_domain_info(dev);
	if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table))
		return -EINVAL;

	/* DMA alias device already has a pasid table, use it: */
	data.pasid_table = &pasid_table;
	ret = pci_for_each_dma_alias(to_pci_dev(dev),
				     &get_alias_pasid_table, &data);
	if (ret)
		goto attach_out;

	pasid_table = kzalloc(sizeof(*pasid_table), GFP_KERNEL);
	if (!pasid_table)
		return -ENOMEM;
	INIT_LIST_HEAD(&pasid_table->dev);

	if (info->pasid_supported)
		max_pasid = min_t(u32, pci_max_pasids(to_pci_dev(dev)),
				  intel_pasid_max_id);

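	/*
	 * Size of the PASID directory in bytes: each 8-byte directory
	 * entry covers 2^PASID_PDE_SHIFT PASIDs, so the directory needs
	 * (max_pasid >> PASID_PDE_SHIFT) entries, i.e.
	 * max_pasid >> (PASID_PDE_SHIFT - 3) bytes.
	 */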
	size = max_pasid >> (PASID_PDE_SHIFT - 3);
	order = size ? get_order(size) : 0;
	pages = alloc_pages_node(info->iommu->node,
				 GFP_KERNEL | __GFP_ZERO, order);
	if (!pages) {
		kfree(pasid_table);
		return -ENOMEM;
	}

	pasid_table->table = page_address(pages);
	pasid_table->order = order;
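	/*
	 * The allocation was rounded up to whole pages, so recompute the
	 * PASID limit from what was actually allocated: 2^(order + PAGE_SHIFT)
	 * bytes of 8-byte directory entries, each covering 64 PASIDs (the
	 * "+ 3" below assumes PASID_PDE_SHIFT is 6).
	 */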
	pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3);

	/*
	 * The DMA alias path above reuses an existing table, so only the
	 * newly allocated directory needs to be flushed on a non-coherent
	 * IOMMU before it is attached.
	 */
	if (!ecap_coherent(info->iommu->ecap))
		clflush_cache_range(pasid_table->table, (1 << order) * PAGE_SIZE);

attach_out:
	device_attach_pasid_table(info, pasid_table);

	return 0;
}

void intel_pasid_free_table(struct device *dev)
{
	struct device_domain_info *info;
	struct pasid_table *pasid_table;
	struct pasid_dir_entry *dir;
	struct pasid_entry *table;
	int i, max_pde;

	info = get_domain_info(dev);
	if (!info || !dev_is_pci(dev) || !info->pasid_table)
		return;

	pasid_table = info->pasid_table;
	device_detach_pasid_table(info, pasid_table);

	if (!list_empty(&pasid_table->dev))
		return;

	/* Free scalable mode PASID directory tables: */
	dir = pasid_table->table;
	max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT;
	for (i = 0; i < max_pde; i++) {
		table = get_pasid_table_from_pde(&dir[i]);
		free_pgtable_page(table);
	}

	free_pages((unsigned long)pasid_table->table, pasid_table->order);
	kfree(pasid_table);
}

struct pasid_table *intel_pasid_get_table(struct device *dev)
{
	struct device_domain_info *info;

	info = get_domain_info(dev);
	if (!info)
		return NULL;

	return info->pasid_table;
}

int intel_pasid_get_dev_max_id(struct device *dev)
{
	struct device_domain_info *info;

	info = get_domain_info(dev);
	if (!info || !info->pasid_table)
		return 0;

	return info->pasid_table->max_pasid;
}

struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	struct pasid_table *pasid_table;
	struct pasid_dir_entry *dir;
	struct pasid_entry *entries;
	int dir_index, index;

	pasid_table = intel_pasid_get_table(dev);
	if (WARN_ON(!pasid_table || pasid >= intel_pasid_get_dev_max_id(dev)))
		return NULL;

	dir = pasid_table->table;
	info = get_domain_info(dev);
	dir_index = pasid >> PASID_PDE_SHIFT;
	index = pasid & PASID_PTE_MASK;

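	/*
	 * The leaf PASID tables are allocated lazily: if the directory
	 * entry is not present yet, allocate a page for it here and
	 * install it with cmpxchg64() so that concurrent callers agree
	 * on a single table.
	 */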
retry:
	entries = get_pasid_table_from_pde(&dir[dir_index]);
	if (!entries) {
		entries = alloc_pgtable_page(info->iommu->node);
		if (!entries)
			return NULL;

		/*
		 * The PASID directory table entry is never freed once it
		 * has been populated, so there is no race with a concurrent
		 * free or clear. However, another thread may install its own
		 * table here while we are preparing ours; if the cmpxchg
		 * below loses, free our page and retry with theirs.
		 */
		if (cmpxchg64(&dir[dir_index].val, 0ULL,
			      (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) {
			free_pgtable_page(entries);
			goto retry;
		}
		if (!ecap_coherent(info->iommu->ecap)) {
			clflush_cache_range(entries, VTD_PAGE_SIZE);
			clflush_cache_range(&dir[dir_index].val, sizeof(*dir));
		}
	}

	return &entries[index];
}

/*
 * Interfaces for PASID table entry manipulation:
 */
static inline void pasid_clear_entry(struct pasid_entry *pe)
{
	WRITE_ONCE(pe->val[0], 0);
	WRITE_ONCE(pe->val[1], 0);
	WRITE_ONCE(pe->val[2], 0);
	WRITE_ONCE(pe->val[3], 0);
	WRITE_ONCE(pe->val[4], 0);
	WRITE_ONCE(pe->val[5], 0);
	WRITE_ONCE(pe->val[6], 0);
	WRITE_ONCE(pe->val[7], 0);
}

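/*
 * Clear the entry but leave the Fault Processing Disable (FPD) bit set,
 * so that faults against the no-longer-present entry are suppressed
 * rather than reported (used on the fault_ignore teardown path below).
 */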
static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe)
{
	WRITE_ONCE(pe->val[0], PASID_PTE_FPD);
	WRITE_ONCE(pe->val[1], 0);
	WRITE_ONCE(pe->val[2], 0);
	WRITE_ONCE(pe->val[3], 0);
	WRITE_ONCE(pe->val[4], 0);
	WRITE_ONCE(pe->val[5], 0);
	WRITE_ONCE(pe->val[6], 0);
	WRITE_ONCE(pe->val[7], 0);
}

static void
intel_pasid_clear_entry(struct device *dev, u32 pasid, bool fault_ignore)
{
	struct pasid_entry *pe;

	pe = intel_pasid_get_entry(dev, pasid);
	if (WARN_ON(!pe))
		return;

	if (fault_ignore && pasid_pte_is_present(pe))
		pasid_clear_entry_with_fpd(pe);
	else
		pasid_clear_entry(pe);
}

static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits)
{
	u64 old;

	old = READ_ONCE(*ptr);
	WRITE_ONCE(*ptr, (old & ~mask) | bits);
}

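/*
 * The bit positions quoted in the comments below follow the scalable
 * mode PASID entry layout and are numbered across the whole 512-bit
 * entry: pe->val[i] holds bits (64 * i) through (64 * i + 63).
 */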
/*
 * Setup the DID (Domain Identifier) field (Bit 64~79) of a scalable mode
 * PASID entry.
 */
static inline void
pasid_set_domain_id(struct pasid_entry *pe, u64 value)
{
	pasid_set_bits(&pe->val[1], GENMASK_ULL(15, 0), value);
}

/*
 * Get domain ID value of a scalable mode PASID entry.
 */
static inline u16
pasid_get_domain_id(struct pasid_entry *pe)
{
	return (u16)(READ_ONCE(pe->val[1]) & GENMASK_ULL(15, 0));
}

/*
 * Setup the SLPTPTR(Second Level Page Table Pointer) field (Bit 12~63)
 * of a scalable mode PASID entry.
 */
static inline void
pasid_set_slptr(struct pasid_entry *pe, u64 value)
{
	pasid_set_bits(&pe->val[0], VTD_PAGE_MASK, value);
}

/*
 * Setup the AW(Address Width) field (Bit 2~4) of a scalable mode PASID
 * entry.
 */
static inline void
pasid_set_address_width(struct pasid_entry *pe, u64 value)
{
	pasid_set_bits(&pe->val[0], GENMASK_ULL(4, 2), value << 2);
}

/*
 * Setup the PGTT(PASID Granular Translation Type) field (Bit 6~8)
 * of a scalable mode PASID entry.
 */
static inline void
pasid_set_translation_type(struct pasid_entry *pe, u64 value)
{
	pasid_set_bits(&pe->val[0], GENMASK_ULL(8, 6), value << 6);
}

/*
 * Enable fault processing by clearing the FPD(Fault Processing
 * Disable) field (Bit 1) of a scalable mode PASID entry.
 */
static inline void pasid_set_fault_enable(struct pasid_entry *pe)
{
	pasid_set_bits(&pe->val[0], 1 << 1, 0);
}

/*
 * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a
 * scalable mode PASID entry.
 */
static inline void pasid_set_sre(struct pasid_entry *pe)
{
	pasid_set_bits(&pe->val[2], 1 << 0, 1);
}

/*
 * Setup the P(Present) field (Bit 0) of a scalable mode PASID
 * entry.
 */
static inline void pasid_set_present(struct pasid_entry *pe)
{
	pasid_set_bits(&pe->val[0], 1 << 0, 1);
}

/*
 * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID
 * entry.
 */
static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value)
{
	pasid_set_bits(&pe->val[1], 1 << 23, value << 23);
}

/*
 * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode
 * PASID entry.
 */
static inline void
pasid_set_pgsnp(struct pasid_entry *pe)
{
	pasid_set_bits(&pe->val[1], 1ULL << 24, 1ULL << 24);
}

/*
 * Setup the First Level Page table Pointer field (Bit 140~191)
 * of a scalable mode PASID entry.
 */
static inline void
pasid_set_flptr(struct pasid_entry *pe, u64 value)
{
	pasid_set_bits(&pe->val[2], VTD_PAGE_MASK, value);
}

/*
 * Setup the First Level Paging Mode field (Bit 130~131) of a
 * scalable mode PASID entry.
 */
static inline void
pasid_set_flpm(struct pasid_entry *pe, u64 value)
{
	pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2);
}

/*
 * Setup the Extended Access Flag Enable (EAFE) field (Bit 135)
 * of a scalable mode PASID entry.
 */
static inline void
pasid_set_eafe(struct pasid_entry *pe)
{
	pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7);
}

static void
pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
				    u16 did, u32 pasid)
{
	struct qi_desc desc;

	desc.qw0 = QI_PC_DID(did) | QI_PC_GRAN(QI_PC_PASID_SEL) |
		QI_PC_PASID(pasid) | QI_PC_TYPE;
	desc.qw1 = 0;
	desc.qw2 = 0;
	desc.qw3 = 0;

	qi_submit_sync(iommu, &desc, 1, 0);
}

static void
devtlb_invalidation_with_pasid(struct intel_iommu *iommu,
			       struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	u16 sid, qdep, pfsid;

	info = get_domain_info(dev);
	if (!info || !info->ats_enabled)
		return;

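	/*
	 * sid is the requester ID (bus:devfn) of the endpoint whose device
	 * TLB is flushed; qdep is its ATS invalidation queue depth.
	 */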
	sid = info->bus << 8 | info->devfn;
	qdep = info->ats_qdep;
	pfsid = info->pfsid;

	/*
	 * PASID 0 is RID2PASID, i.e. DMA requests without a PASID, so the
	 * device TLB must be flushed without a PASID. For a non-zero PASID
	 * (e.g. SVA), the device may be doing DMA with several PASIDs at
	 * once, so it is more efficient to flush only the device TLB
	 * entries for that specific PASID.
	 */
	if (pasid == PASID_RID2PASID)
		qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, 0, 64 - VTD_PAGE_SHIFT);
	else
		qi_flush_dev_iotlb_pasid(iommu, sid, pfsid, pasid, qdep, 0, 64 - VTD_PAGE_SHIFT);
}

void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev,
				 u32 pasid, bool fault_ignore)
{
	struct pasid_entry *pte;
	u16 did, pgtt;

	pte = intel_pasid_get_entry(dev, pasid);
	if (WARN_ON(!pte))
		return;

	did = pasid_get_domain_id(pte);
	pgtt = pasid_pte_get_pgtt(pte);

	intel_pasid_clear_entry(dev, pasid, fault_ignore);

	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(pte, sizeof(*pte));

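	/*
	 * Invalidate the stale state: PASID cache first, then the IOTLB
	 * (PASID-based if first level translation was in use, domain
	 * selective otherwise), and finally the device TLB.
	 */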
	pasid_cache_invalidation_with_pasid(iommu, did, pasid);

	if (pgtt == PASID_ENTRY_PGTT_PT || pgtt == PASID_ENTRY_PGTT_FL_ONLY)
		qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);
	else
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);

	/* Device IOTLB doesn't need to be flushed in caching mode. */
	if (!cap_caching_mode(iommu->cap))
		devtlb_invalidation_with_pasid(iommu, dev, pasid);
}

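/*
 * Make a newly written PASID entry visible to the hardware. In caching
 * mode (e.g. a virtual IOMMU) even not-present entries may be cached,
 * so explicit PASID-cache and PASID-based IOTLB invalidations are
 * issued; otherwise flushing the write buffer is sufficient.
 */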
static void pasid_flush_caches(struct intel_iommu *iommu,
			       struct pasid_entry *pte,
			       u32 pasid, u16 did)
{
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(pte, sizeof(*pte));

	if (cap_caching_mode(iommu->cap)) {
		pasid_cache_invalidation_with_pasid(iommu, did, pasid);
		qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);
	} else {
		iommu_flush_write_buffer(iommu);
	}
}

/*
 * Set up the scalable mode PASID table entry for first-level-only
 * translation type.
 */
int intel_pasid_setup_first_level(struct intel_iommu *iommu,
				  struct device *dev, pgd_t *pgd,
				  u32 pasid, u16 did, int flags)
{
	struct pasid_entry *pte;

	if (!ecap_flts(iommu->ecap)) {
		pr_err("No first level translation support on %s\n",
		       iommu->name);
		return -EINVAL;
	}

	pte = intel_pasid_get_entry(dev, pasid);
	if (WARN_ON(!pte))
		return -EINVAL;

	pasid_clear_entry(pte);

	/* Setup the first level page table pointer: */
	pasid_set_flptr(pte, (u64)__pa(pgd));
	if (flags & PASID_FLAG_SUPERVISOR_MODE) {
		if (!ecap_srs(iommu->ecap)) {
			pr_err("No supervisor request support on %s\n",
			       iommu->name);
			return -EINVAL;
		}
		pasid_set_sre(pte);
	}

	if (flags & PASID_FLAG_FL5LP) {
		if (cap_5lp_support(iommu->cap)) {
			pasid_set_flpm(pte, 1);
		} else {
			pr_err("No 5-level paging support for first-level\n");
			pasid_clear_entry(pte);
			return -EINVAL;
		}
	}

	if (flags & PASID_FLAG_PAGE_SNOOP)
		pasid_set_pgsnp(pte);

	pasid_set_domain_id(pte, did);
	pasid_set_address_width(pte, iommu->agaw);
	pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));

	/* Setup Present and PASID Granular Translation Type: */
	pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY);
	pasid_set_present(pte);
	pasid_flush_caches(iommu, pte, pasid, did);

	return 0;
}

/*
 * Skip the top levels of the page tables for an IOMMU whose agaw is
 * smaller than the domain's agaw. Unnecessary for pass-through mode.
 */
static inline int iommu_skip_agaw(struct dmar_domain *domain,
				  struct intel_iommu *iommu,
				  struct dma_pte **pgd)
{
	int agaw;

	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
		*pgd = phys_to_virt(dma_pte_addr(*pgd));
		if (!dma_pte_present(*pgd))
			return -EINVAL;
	}

	return agaw;
}

/*
 * Set up the scalable mode PASID entry for second-level-only translation
 * type.
 */
int intel_pasid_setup_second_level(struct intel_iommu *iommu,
				   struct dmar_domain *domain,
				   struct device *dev, u32 pasid)
{
	struct pasid_entry *pte;
	struct dma_pte *pgd;
	u64 pgd_val;
	int agaw;
	u16 did;

	/*
	 * If hardware advertises no support for second level
	 * translation, return directly.
	 */
	if (!ecap_slts(iommu->ecap)) {
		pr_err("No second level translation support on %s\n",
		       iommu->name);
		return -EINVAL;
	}

	pgd = domain->pgd;
	agaw = iommu_skip_agaw(domain, iommu, &pgd);
	if (agaw < 0) {
		dev_err(dev, "Invalid domain page table\n");
		return -EINVAL;
	}

	pgd_val = virt_to_phys(pgd);
	did = domain->iommu_did[iommu->seq_id];

	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		dev_err(dev, "Failed to get pasid entry of PASID %d\n", pasid);
		return -ENODEV;
	}

	pasid_clear_entry(pte);
	pasid_set_domain_id(pte, did);
	pasid_set_slptr(pte, pgd_val);
	pasid_set_address_width(pte, agaw);
	pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
	pasid_set_fault_enable(pte);
	pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));

	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
		pasid_set_pgsnp(pte);

	/*
	 * Since it is a second level only translation setup, we should
	 * set SRE bit as well (addresses are expected to be GPAs).
	 */
	if (pasid != PASID_RID2PASID && ecap_srs(iommu->ecap))
		pasid_set_sre(pte);
	pasid_set_present(pte);
	pasid_flush_caches(iommu, pte, pasid, did);

	return 0;
}

/*
 * Set up the scalable mode pasid entry for passthrough translation type.
 */
int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
				   struct dmar_domain *domain,
				   struct device *dev, u32 pasid)
{
	u16 did = FLPT_DEFAULT_DID;
	struct pasid_entry *pte;

	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		dev_err(dev, "Failed to get pasid entry of PASID %d\n", pasid);
		return -ENODEV;
	}

	pasid_clear_entry(pte);
	pasid_set_domain_id(pte, did);
	pasid_set_address_width(pte, iommu->agaw);
	pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT);
	pasid_set_fault_enable(pte);
	pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));

	/*
	 * We should set SRE bit as well since the addresses are expected
	 * to be GPAs.
	 */
	if (ecap_srs(iommu->ecap))
		pasid_set_sre(pte);
	pasid_set_present(pte);
	pasid_flush_caches(iommu, pte, pasid, did);

	return 0;
}

static int
intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte,
			    struct iommu_gpasid_bind_data_vtd *pasid_data)
{
	/*
	 * Not all guest PASID table entry fields are passed down during bind;
	 * here we only set up the ones that depend on guest settings.
	 * Execution related bits such as NXE, SMEP are not supported.
	 * Other fields, such as snoop related, are set based on host needs
	 * regardless of guest settings.
	 */
	if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_SRE) {
		if (!ecap_srs(iommu->ecap)) {
			pr_err_ratelimited("No supervisor request support on %s\n",
					   iommu->name);
			return -EINVAL;
		}
		pasid_set_sre(pte);
	}

	if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) {
		if (!ecap_eafs(iommu->ecap)) {
			pr_err_ratelimited("No extended access flag support on %s\n",
					   iommu->name);
			return -EINVAL;
		}
		pasid_set_eafe(pte);
	}

	/*
	 * Memory type is only applicable to devices inside processor coherent
	 * domain. Will add MTS support once coherent devices are available.
	 */
	if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_MTS_MASK) {
		pr_warn_ratelimited("No memory type support on %s\n",
				    iommu->name);
		return -EINVAL;
	}

	return 0;
}

/**
 * intel_pasid_setup_nested() - Set up PASID entry for nested translation.
 * This could be used for guest shared virtual address. In this case, the
 * first level page tables are used for GVA-GPA translation in the guest,
 * while the second level page tables are used for GPA-HPA translation.
 *
 * @iommu:      IOMMU which the device belongs to
 * @dev:        Device to be set up for translation
 * @gpgd:       FLPTPTR: First Level Page translation pointer in GPA
 * @pasid:      PASID to be programmed in the device PASID table
 * @pasid_data: Additional PASID info from the guest bind request
 * @domain:     Domain info for setting up second level page tables
 * @addr_width: Address width of the first level (guest)
 */
int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
			     pgd_t *gpgd, u32 pasid,
			     struct iommu_gpasid_bind_data_vtd *pasid_data,
			     struct dmar_domain *domain, int addr_width)
{
	struct pasid_entry *pte;
	struct dma_pte *pgd;
	int ret = 0;
	u64 pgd_val;
	int agaw;
	u16 did;

	if (!ecap_nest(iommu->ecap)) {
		pr_err_ratelimited("IOMMU: %s: No nested translation support\n",
				   iommu->name);
		return -EINVAL;
	}

	if (!(domain->flags & DOMAIN_FLAG_NESTING_MODE)) {
		pr_err_ratelimited("Domain is not in nesting mode, %x\n",
				   domain->flags);
		return -EINVAL;
	}

	pte = intel_pasid_get_entry(dev, pasid);
	if (WARN_ON(!pte))
		return -EINVAL;

	/*
	 * Caller must ensure the PASID entry is not in use, i.e. the same
	 * PASID is not bound to the same device twice.
	 */
	if (pasid_pte_is_present(pte))
		return -EBUSY;

	pasid_clear_entry(pte);

	/*
	 * Sanity check performed by the caller to make sure the address
	 * widths match in two dimensions:
	 * 1. CPU vs. IOMMU
	 * 2. Guest vs. Host.
	 */
	switch (addr_width) {
#ifdef CONFIG_X86
	case ADDR_WIDTH_5LEVEL:
		if (!cpu_feature_enabled(X86_FEATURE_LA57) ||
		    !cap_5lp_support(iommu->cap)) {
			dev_err_ratelimited(dev,
					    "5-level paging not supported\n");
			return -EINVAL;
		}

		pasid_set_flpm(pte, 1);
		break;
#endif
	case ADDR_WIDTH_4LEVEL:
		pasid_set_flpm(pte, 0);
		break;
	default:
		dev_err_ratelimited(dev, "Invalid guest address width %d\n",
				    addr_width);
		return -EINVAL;
	}

	/* The first level PGD is a GPA and must be covered by the second level. */
	if ((uintptr_t)gpgd > domain->max_addr) {
		dev_err_ratelimited(dev,
				    "Guest PGD %lx not supported, max %llx\n",
				    (uintptr_t)gpgd, domain->max_addr);
		return -EINVAL;
	}
	pasid_set_flptr(pte, (uintptr_t)gpgd);

	ret = intel_pasid_setup_bind_data(iommu, pte, pasid_data);
	if (ret)
		return ret;

	/* Setup the second level based on the given domain */
	pgd = domain->pgd;

	agaw = iommu_skip_agaw(domain, iommu, &pgd);
	if (agaw < 0) {
		dev_err_ratelimited(dev, "Invalid domain page table\n");
		return -EINVAL;
	}
	pgd_val = virt_to_phys(pgd);
	pasid_set_slptr(pte, pgd_val);
	pasid_set_fault_enable(pte);

	did = domain->iommu_did[iommu->seq_id];
	pasid_set_domain_id(pte, did);

	pasid_set_address_width(pte, agaw);
	pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));

	pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
	pasid_set_present(pte);
	pasid_flush_caches(iommu, pte, pasid, did);

	return ret;
}
