// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2015 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/intel-iommu.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/intel-svm.h>
#include <linux/rculist.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
#include <linux/ioasid.h>
#include <asm/page.h>
#include <asm/fpu/api.h>

#include "pasid.h"

static irqreturn_t prq_event_thread(int irq, void *d);
static void intel_svm_drain_prq(struct device *dev, u32 pasid);

#define PRQ_ORDER 0

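/*
 * Set up the page request queue (PRQ) for an IOMMU. With PRQ_ORDER 0 the
 * queue is a single zeroed page, i.e. 128 descriptors of 32 bytes each.
 * A threaded IRQ handler (prq_event_thread) services the queue, and the
 * queue base, head and tail are programmed into the DMAR_PQA/PQH/PQT
 * registers.
 */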
int intel_svm_enable_prq(struct intel_iommu *iommu)
{
	struct page *pages;
	int irq, ret;

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
	if (!pages) {
		pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
			iommu->name);
		return -ENOMEM;
	}
	iommu->prq = page_address(pages);

	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0) {
		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
		       iommu->name);
		ret = -EINVAL;
	err:
		free_pages((unsigned long)iommu->prq, PRQ_ORDER);
		iommu->prq = NULL;
		return ret;
	}
	iommu->pr_irq = irq;

	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);

	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
				   iommu->prq_name, iommu);
	if (ret) {
		pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
		       iommu->name);
		dmar_free_hwirq(irq);
		iommu->pr_irq = 0;
		goto err;
	}
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);

	init_completion(&iommu->prq_complete);

	return 0;
}

int intel_svm_finish_prq(struct intel_iommu *iommu)
{
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);

	if (iommu->pr_irq) {
		free_irq(iommu->pr_irq, iommu);
		dmar_free_hwirq(iommu->pr_irq);
		iommu->pr_irq = 0;
	}

	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	iommu->prq = NULL;

	return 0;
}

static inline bool intel_svm_capable(struct intel_iommu *iommu)
{
	return iommu->flags & VTD_FLAG_SVM_CAPABLE;
}

void intel_svm_check(struct intel_iommu *iommu)
{
	if (!pasid_supported(iommu))
		return;

	if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
	    !cap_fl1gp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible 1GB page capability\n",
		       iommu->name);
		return;
	}

	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
	    !cap_5lp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible paging mode\n",
		       iommu->name);
		return;
	}

	iommu->flags |= VTD_FLAG_SVM_CAPABLE;
}

static void __flush_svm_range_dev(struct intel_svm *svm,
				  struct intel_svm_dev *sdev,
				  unsigned long address,
				  unsigned long pages, int ih)
{
	struct device_domain_info *info = get_domain_info(sdev->dev);

	if (WARN_ON(!pages))
		return;

	qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih);
	if (info->ats_enabled)
		qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid,
					 svm->pasid, sdev->qdep, address,
					 order_base_2(pages));
}

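/*
 * PASID-based invalidation descriptors express the range as a power-of-two
 * number of pages starting at a naturally aligned address. Split an
 * arbitrary range into aligned power-of-two chunks and flush each chunk
 * separately.
 */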
static void intel_flush_svm_range_dev(struct intel_svm *svm,
				      struct intel_svm_dev *sdev,
				      unsigned long address,
				      unsigned long pages, int ih)
{
	unsigned long shift = ilog2(__roundup_pow_of_two(pages));
	unsigned long align = (1ULL << (VTD_PAGE_SHIFT + shift));
	unsigned long start = ALIGN_DOWN(address, align);
	unsigned long end = ALIGN(address + (pages << VTD_PAGE_SHIFT), align);

	while (start < end) {
		__flush_svm_range_dev(svm, sdev, start, align >> VTD_PAGE_SHIFT, ih);
		start += align;
	}
}

static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
				unsigned long pages, int ih)
{
	struct intel_svm_dev *sdev;

	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
	rcu_read_unlock();
}

/* Pages have been freed at this point */
static void intel_invalidate_range(struct mmu_notifier *mn,
				   struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

	intel_flush_svm_range(svm, start,
			      (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
}

static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
	struct intel_svm_dev *sdev;

	/* This might end up being called from exit_mmap(), *before* the page
	 * tables are cleared. And __mmu_notifier_release() will delete us from
	 * the list of notifiers so that our invalidate_range() callback doesn't
	 * get called when the page tables are cleared. So we need to protect
	 * against hardware accessing those page tables.
	 *
	 * We do it by clearing the entry in the PASID table and then flushing
	 * the IOTLB and the PASID table caches. This might upset hardware;
	 * perhaps we'll want to point the PASID to a dummy PGD (like the zero
	 * page) so that we end up taking a fault that the hardware really
	 * *has* to handle gracefully without affecting other processes.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_pasid_tear_down_entry(sdev->iommu, sdev->dev,
					    svm->pasid, true);
	rcu_read_unlock();
}

static const struct mmu_notifier_ops intel_mmuops = {
	.release = intel_mm_release,
	.invalidate_range = intel_invalidate_range,
};

static DEFINE_MUTEX(pasid_mutex);
static LIST_HEAD(global_svm_list);

#define for_each_svm_dev(sdev, svm, d)			\
	list_for_each_entry((sdev), &(svm)->devs, list)	\
		if ((d) != (sdev)->dev) {} else

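/*
 * Look up the intel_svm for @pasid and, if @dev is already bound to it,
 * the corresponding intel_svm_dev. Returns 0 with *rsvm and *rsdev set
 * (either may be NULL), or a negative errno for an invalid PASID.
 */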
static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
			     struct intel_svm **rsvm,
			     struct intel_svm_dev **rsdev)
{
	struct intel_svm_dev *d, *sdev = NULL;
	struct intel_svm *svm;

	/* The caller should hold the pasid_mutex lock */
	if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
		return -EINVAL;

	if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
		return -EINVAL;

	svm = ioasid_find(NULL, pasid, NULL);
	if (IS_ERR(svm))
		return PTR_ERR(svm);

	if (!svm)
		goto out;

	/*
	 * If we found an svm for the PASID, there must be at least one device
	 * bound to it.
	 */
	if (WARN_ON(list_empty(&svm->devs)))
		return -EINVAL;

	rcu_read_lock();
	list_for_each_entry_rcu(d, &svm->devs, list) {
		if (d->dev == dev) {
			sdev = d;
			break;
		}
	}
	rcu_read_unlock();

out:
	*rsvm = svm;
	*rsdev = sdev;

	return 0;
}

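/*
 * Bind a guest PASID to @dev: allocate the svm/sdev tracking structures if
 * needed and program the PASID entry in nested translation mode, with the
 * guest-provided first-level page table (data->gpgd) on top of the host
 * second-level tables of @domain.
 */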
int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
			  struct iommu_gpasid_bind_data *data)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev = NULL;
	struct dmar_domain *dmar_domain;
	struct device_domain_info *info;
	struct intel_svm *svm = NULL;
	unsigned long iflags;
	int ret = 0;

	if (WARN_ON(!iommu) || !data)
		return -EINVAL;

	if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
		return -EINVAL;

	/* IOMMU core ensures argsz is more than the start of the union */
	if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd))
		return -EINVAL;

	/* Make sure no undefined flags are used in vendor data */
	if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1))
		return -EINVAL;

	if (!dev_is_pci(dev))
		return -ENOTSUPP;

	/* VT-d supports devices with full 20 bit PASIDs only */
	if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
		return -EINVAL;

	/*
	 * We only check the host PASID range; we have no way to check the
	 * guest PASID range.
	 */
	if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
		return -EINVAL;

	info = get_domain_info(dev);
	if (!info)
		return -EINVAL;

	dmar_domain = to_dmar_domain(domain);

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		/*
		 * Do not allow multiple bindings of the same device-PASID since
		 * there is only one set of second-level page tables per PASID.
		 * We may revisit this once sharing a PGD across domains is
		 * supported.
		 */
		dev_warn_ratelimited(dev, "Already bound with PASID %u\n",
				     svm->pasid);
		ret = -EBUSY;
		goto out;
	}

	if (!svm) {
		/* We come here when the PASID has never been bound to a device. */
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			goto out;
		}
		/* REVISIT: the upper layer/VFIO can track the host process that
		 * binds the PASID. ioasid_set = mm might be sufficient for VFIO
		 * to check PASID VMM ownership. We can drop the following line
		 * once the VFIO and IOASID set check is in place.
		 */
		svm->mm = get_task_mm(current);
		svm->pasid = data->hpasid;
		if (data->flags & IOMMU_SVA_GPASID_VAL) {
			svm->gpasid = data->gpasid;
			svm->flags |= SVM_FLAG_GUEST_PASID;
		}
		ioasid_set_data(data->hpasid, svm);
		INIT_LIST_HEAD_RCU(&svm->devs);
		mmput(svm->mm);
	}
	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	sdev->iommu = iommu;

	/* Only count users if device has aux domains */
	if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
		sdev->users = 1;

	/* Set up device context entry for PASID if not enabled already */
	ret = intel_iommu_enable_pasid(iommu, sdev->dev);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
		kfree(sdev);
		goto out;
	}

	/*
	 * The PASID table is per device for better security. Therefore, for
	 * each bind of a new device, even with an existing PASID, we need to
	 * call the nested-mode setup function here.
	 */
	spin_lock_irqsave(&iommu->lock, iflags);
	ret = intel_pasid_setup_nested(iommu, dev,
				       (pgd_t *)(uintptr_t)data->gpgd,
				       data->hpasid, &data->vendor.vtd, dmar_domain,
				       data->addr_width);
	spin_unlock_irqrestore(&iommu->lock, iflags);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
				    data->hpasid, ret);
		/*
		 * The PASID entry should already be in the cleared state if
		 * nested-mode setup failed, so we only need to clear the
		 * IOASID tracking data for the subsequent free call to succeed.
		 */
		kfree(sdev);
		goto out;
	}

	svm->flags |= SVM_FLAG_GUEST_MODE;

	init_rcu_head(&sdev->rcu);
	list_add_rcu(&sdev->list, &svm->devs);
 out:
	if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
		ioasid_set_data(data->hpasid, NULL);
		kfree(svm);
	}

	mutex_unlock(&pasid_mutex);
	return ret;
}

int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev;
	struct intel_svm *svm;
	int ret;

	if (WARN_ON(!iommu))
		return -EINVAL;

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
			sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				/*
				 * We do not free the IOASID here because the
				 * IOMMU driver did not allocate it. Unlike
				 * native SVM, the IOASID for guest use was
				 * allocated prior to the bind call. In any
				 * case, if the free call comes before the
				 * unbind, the IOMMU driver will get notified
				 * and perform the cleanup.
				 */
				ioasid_set_data(pasid, NULL);
				kfree(svm);
			}
		}
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}

static void _load_pasid(void *unused)
{
	update_pasid();
}

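/*
 * Make the mm's PASID value visible and propagate it to the PASID MSR of
 * every CPU currently running tasks of this mm. Called when a PASID is
 * first allocated for the mm and when it is torn down again.
 */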
static void load_pasid(struct mm_struct *mm, u32 pasid)
{
	mutex_lock(&mm->context.lock);

	/* Synchronize with READ_ONCE in update_pasid(). */
	smp_store_release(&mm->pasid, pasid);

	/* Update PASID MSR on all CPUs running the mm's tasks. */
	on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);

	mutex_unlock(&mm->context.lock);
}

/* Caller must hold pasid_mutex, mm reference */
static int
intel_svm_bind_mm(struct device *dev, unsigned int flags,
		  struct svm_dev_ops *ops,
		  struct mm_struct *mm, struct intel_svm_dev **sd)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct device_domain_info *info;
	struct intel_svm_dev *sdev;
	struct intel_svm *svm = NULL;
	unsigned long iflags;
	int pasid_max;
	int ret;

	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!intel_svm_capable(iommu))
		return -ENOTSUPP;

	if (dev_is_pci(dev)) {
		pasid_max = pci_max_pasids(to_pci_dev(dev));
		if (pasid_max < 0)
			return -EINVAL;
	} else
		pasid_max = 1 << 20;

	/* Binding a supervisor PASID should have mm == NULL */
	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
		if (!ecap_srs(iommu->ecap) || mm) {
			pr_err("Supervisor PASID with user provided mm.\n");
			return -EINVAL;
		}
	}

	if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
		struct intel_svm *t;

		list_for_each_entry(t, &global_svm_list, list) {
			if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
				continue;

			svm = t;
			if (svm->pasid >= pasid_max) {
				dev_warn(dev,
					 "Limited PASID width. Cannot use existing PASID %d\n",
					 svm->pasid);
				ret = -ENOSPC;
				goto out;
			}

			/* Find the matching device in the svm list */
			for_each_svm_dev(sdev, svm, dev) {
				if (sdev->ops != ops) {
					ret = -EBUSY;
					goto out;
				}
				sdev->users++;
				goto success;
			}

			break;
		}
	}

	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->iommu = iommu;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret) {
		kfree(sdev);
		goto out;
	}

	info = get_domain_info(dev);
	sdev->did = FLPT_DEFAULT_DID;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}

	/* Finish the setup now that we know we're keeping it */
	sdev->users = 1;
	sdev->ops = ops;
	init_rcu_head(&sdev->rcu);

	if (!svm) {
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			kfree(sdev);
			goto out;
		}

		if (pasid_max > intel_pasid_max_id)
			pasid_max = intel_pasid_max_id;

		/* Do not use PASID 0, reserved for RID to PASID */
		svm->pasid = ioasid_alloc(NULL, PASID_MIN,
					  pasid_max - 1, svm);
		if (svm->pasid == INVALID_IOASID) {
			kfree(svm);
			kfree(sdev);
			ret = -ENOSPC;
			goto out;
		}
		svm->notifier.ops = &intel_mmuops;
		svm->mm = mm;
		svm->flags = flags;
		INIT_LIST_HEAD_RCU(&svm->devs);
		INIT_LIST_HEAD(&svm->list);
		ret = -ENOMEM;
		if (mm) {
			ret = mmu_notifier_register(&svm->notifier, mm);
			if (ret) {
				ioasid_free(svm->pasid);
				kfree(svm);
				kfree(sdev);
				goto out;
			}
		}

		spin_lock_irqsave(&iommu->lock, iflags);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock_irqrestore(&iommu->lock, iflags);
		if (ret) {
			if (mm)
				mmu_notifier_unregister(&svm->notifier, mm);
			ioasid_free(svm->pasid);
			kfree(svm);
			kfree(sdev);
			goto out;
		}

		list_add_tail(&svm->list, &global_svm_list);
		if (mm) {
			/* The newly allocated PASID is loaded into the mm. */
			load_pasid(mm, svm->pasid);
		}
	} else {
		/*
		 * Binding a new device to an existing PASID; we only need to
		 * set up the PASID entry.
		 */
		spin_lock_irqsave(&iommu->lock, iflags);
		ret = intel_pasid_setup_first_level(iommu, dev,
						mm ? mm->pgd : init_mm.pgd,
						svm->pasid, FLPT_DEFAULT_DID,
						(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
						(cpu_feature_enabled(X86_FEATURE_LA57) ?
						PASID_FLAG_FL5LP : 0));
		spin_unlock_irqrestore(&iommu->lock, iflags);
		if (ret) {
			kfree(sdev);
			goto out;
		}
	}
	list_add_rcu(&sdev->list, &svm->devs);
success:
	sdev->pasid = svm->pasid;
	sdev->sva.dev = dev;
	if (sd)
		*sd = sdev;
	ret = 0;
out:
	return ret;
}

642
643/* Caller must hold pasid_mutex */
644static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
645{
646	struct intel_svm_dev *sdev;
647	struct intel_iommu *iommu;
648	struct intel_svm *svm;
649	int ret = -EINVAL;
650
651	iommu = device_to_iommu(dev, NULL, NULL);
652	if (!iommu)
653		goto out;
654
655	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
656	if (ret)
657		goto out;
658
659	if (sdev) {
660		sdev->users--;
661		if (!sdev->users) {
662			list_del_rcu(&sdev->list);
663			/* Flush the PASID cache and IOTLB for this device.
664			 * Note that we do depend on the hardware *not* using
665			 * the PASID any more. Just as we depend on other
666			 * devices never using PASIDs that they have no right
667			 * to use. We have a *shared* PASID table, because it's
668			 * large and has to be physically contiguous. So it's
669			 * hard to be as defensive as we might like. */
670			intel_pasid_tear_down_entry(iommu, dev,
671						    svm->pasid, false);
672			intel_svm_drain_prq(dev, svm->pasid);
673			kfree_rcu(sdev, rcu);
674
675			if (list_empty(&svm->devs)) {
676				ioasid_free(svm->pasid);
677				if (svm->mm) {
678					mmu_notifier_unregister(&svm->notifier, svm->mm);
679					/* Clear mm's pasid. */
680					load_pasid(svm->mm, PASID_DISABLED);
681				}
682				list_del(&svm->list);
683				/* We mandate that no page faults may be outstanding
684				 * for the PASID when intel_svm_unbind_mm() is called.
685				 * If that is not obeyed, subtle errors will happen.
686				 * Let's make them less subtle... */
687				memset(svm, 0x6b, sizeof(*svm));
688				kfree(svm);
689			}
690		}
691	}
692out:
693	return ret;
694}
695
/* Page request queue descriptor */
struct page_req_dsc {
	union {
		struct {
			u64 type:8;
			u64 pasid_present:1;
			u64 priv_data_present:1;
			u64 rsvd:6;
			u64 rid:16;
			u64 pasid:20;
			u64 exe_req:1;
			u64 pm_req:1;
			u64 rsvd2:10;
		};
		u64 qw_0;
	};
	union {
		struct {
			u64 rd_req:1;
			u64 wr_req:1;
			u64 lpig:1;
			u64 prg_index:9;
			u64 addr:52;
		};
		u64 qw_1;
	};
	u64 priv_data[2];
};

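/*
 * The hardware head and tail registers hold byte offsets into the queue.
 * Masking with PRQ_RING_MASK wraps them at the queue size and keeps them
 * aligned to the 32-byte descriptor size.
 */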
#define PRQ_RING_MASK	((0x1000 << PRQ_ORDER) - 0x20)

static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
{
	unsigned long requested = 0;

	if (req->exe_req)
		requested |= VM_EXEC;

	if (req->rd_req)
		requested |= VM_READ;

	if (req->wr_req)
		requested |= VM_WRITE;

	return (requested & ~vma->vm_flags) != 0;
}

static bool is_canonical_address(u64 addr)
{
	int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	long saddr = (long) addr;

	return (((saddr << shift) >> shift) == saddr);
}

/**
 * intel_svm_drain_prq - Drain page requests and responses for a pasid
 * @dev: target device
 * @pasid: pasid for draining
 *
 * Drain all pending page requests and responses related to @pasid in both
 * software and hardware. This is supposed to be called after the device
 * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
 * and DevTLB have been invalidated.
 *
 * It waits until all pending page requests for @pasid in the page fault
 * queue are completed by the prq handling thread. Then follow the steps
 * described in VT-d spec CH7.10 to drain all page requests and page
 * responses pending in the hardware.
 */
static void intel_svm_drain_prq(struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct qi_desc desc[3];
	struct pci_dev *pdev;
	int head, tail;
	u16 sid, did;
	int qdep;

	info = get_domain_info(dev);
	if (WARN_ON(!info || !dev_is_pci(dev)))
		return;

	if (!info->pri_enabled)
		return;

	iommu = info->iommu;
	domain = info->domain;
	pdev = to_pci_dev(dev);
	sid = PCI_DEVID(info->bus, info->devfn);
	did = domain->iommu_did[iommu->seq_id];
	qdep = pci_ats_queue_depth(pdev);

	/*
	 * Check and wait until all pending page requests in the queue are
	 * handled by the prq handling thread.
	 */
prq_retry:
	reinit_completion(&iommu->prq_complete);
	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct page_req_dsc *req;

		req = &iommu->prq[head / sizeof(*req)];
		if (!req->pasid_present || req->pasid != pasid) {
			head = (head + sizeof(*req)) & PRQ_RING_MASK;
			continue;
		}

		wait_for_completion(&iommu->prq_complete);
		goto prq_retry;
	}

	/*
	 * Perform steps described in VT-d spec CH7.10 to drain page
	 * requests and responses in hardware.
	 */
	memset(desc, 0, sizeof(desc));
	desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
			QI_IWD_FENCE |
			QI_IWD_TYPE;
	desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
			QI_EIOTLB_DID(did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
	desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
			QI_DEV_EIOTLB_SID(sid) |
			QI_DEV_EIOTLB_QDEP(qdep) |
			QI_DEIOTLB_TYPE |
			QI_DEV_IOTLB_PFSID(info->pfsid);
qi_retry:
	reinit_completion(&iommu->prq_complete);
	qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		wait_for_completion(&iommu->prq_complete);
		goto qi_retry;
	}
}

static int prq_to_iommu_prot(struct page_req_dsc *req)
{
	int prot = 0;

	if (req->rd_req)
		prot |= IOMMU_FAULT_PERM_READ;
	if (req->wr_req)
		prot |= IOMMU_FAULT_PERM_WRITE;
	if (req->exe_req)
		prot |= IOMMU_FAULT_PERM_EXEC;
	if (req->pm_req)
		prot |= IOMMU_FAULT_PERM_PRIV;

	return prot;
}

static int
intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
{
	struct iommu_fault_event event;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	/* Fill in event data for device specific processing */
	memset(&event, 0, sizeof(struct iommu_fault_event));
	event.fault.type = IOMMU_FAULT_PAGE_REQ;
	event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
	event.fault.prm.pasid = desc->pasid;
	event.fault.prm.grpid = desc->prg_index;
	event.fault.prm.perm = prq_to_iommu_prot(desc);

	if (desc->lpig)
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
	if (desc->pasid_present) {
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
	}
	if (desc->priv_data_present) {
		/*
		 * If private data is present, a page response is required just
		 * as it is for LPIG, so set the last-page-in-group bit.
		 * iommu_report_device_fault() doesn't understand this vendor
		 * specific requirement, thus we set last_page as a workaround.
		 */
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
		memcpy(event.fault.prm.private_data, desc->priv_data,
		       sizeof(desc->priv_data));
	}

	return iommu_report_device_fault(dev, &event);
}

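/*
 * IRQ thread for the page request queue. Walks the descriptors between the
 * head and tail pointers: requests for guest-mode PASIDs are forwarded via
 * intel_svm_prq_report(), native requests are resolved with
 * handle_mm_fault() on the bound mm, and a page group response is sent
 * whenever the descriptor has LPIG or private data set.
 */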
static irqreturn_t prq_event_thread(int irq, void *d)
{
	struct intel_svm_dev *sdev = NULL;
	struct intel_iommu *iommu = d;
	struct intel_svm *svm = NULL;
	int head, tail, handled = 0;

	/* Clear PPR bit before reading head/tail registers, to
	 * ensure that we get a new interrupt if needed. */
	writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);

	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct vm_area_struct *vma;
		struct page_req_dsc *req;
		struct qi_desc resp;
		int result;
		vm_fault_t ret;
		u64 address;

		handled = 1;

		req = &iommu->prq[head / sizeof(*req)];

		result = QI_RESP_FAILURE;
		address = (u64)req->addr << VTD_PAGE_SHIFT;
		if (!req->pasid_present) {
			pr_err("%s: Page request without PASID: %08llx %08llx\n",
			       iommu->name, ((unsigned long long *)req)[0],
			       ((unsigned long long *)req)[1]);
			goto no_pasid;
		}
		/* We should not receive page requests for supervisor SVM */
		if (req->pm_req && (req->rd_req | req->wr_req)) {
			pr_err("Unexpected page request in Privilege Mode\n");
			/* No need to find the matching sdev as for bad_req */
			goto no_pasid;
		}
		/* DMA read with exec request is not supported. */
		if (req->exe_req && req->rd_req) {
			pr_err("Execution request not supported\n");
			goto no_pasid;
		}
		if (!svm || svm->pasid != req->pasid) {
			rcu_read_lock();
			svm = ioasid_find(NULL, req->pasid, NULL);
			/* It *can't* go away, because the driver is not permitted
			 * to unbind the mm while any page faults are outstanding.
			 * So we only need RCU to protect the internal idr code. */
			rcu_read_unlock();
			if (IS_ERR_OR_NULL(svm)) {
				pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
				       iommu->name, req->pasid, ((unsigned long long *)req)[0],
				       ((unsigned long long *)req)[1]);
				goto no_pasid;
			}
		}

		if (!sdev || sdev->sid != req->rid) {
			struct intel_svm_dev *t;

			sdev = NULL;
			rcu_read_lock();
			list_for_each_entry_rcu(t, &svm->devs, list) {
				if (t->sid == req->rid) {
					sdev = t;
					break;
				}
			}
			rcu_read_unlock();
		}

		result = QI_RESP_INVALID;
		/* Since we're using init_mm.pgd directly, we should never take
		 * any faults on kernel addresses. */
		if (!svm->mm)
			goto bad_req;

		/* If the address is not canonical, return an invalid response */
		if (!is_canonical_address(address))
			goto bad_req;

		/*
		 * If the page request is to be handled outside the IOMMU driver
		 * by the receiver of the fault notifiers, skip the page
		 * response here.
		 */
		if (svm->flags & SVM_FLAG_GUEST_MODE) {
			if (sdev && !intel_svm_prq_report(sdev->dev, req))
				goto prq_advance;
			else
				goto bad_req;
		}

		/* If the mm is already defunct, don't handle faults. */
		if (!mmget_not_zero(svm->mm))
			goto bad_req;

		mmap_read_lock(svm->mm);
		vma = find_extend_vma(svm->mm, address);
		if (!vma || address < vma->vm_start)
			goto invalid;

		if (access_error(vma, req))
			goto invalid;

		ret = handle_mm_fault(vma, address,
				      req->wr_req ? FAULT_FLAG_WRITE : 0,
				      NULL);
		if (ret & VM_FAULT_ERROR)
			goto invalid;

		result = QI_RESP_SUCCESS;
invalid:
		mmap_read_unlock(svm->mm);
		mmput(svm->mm);
bad_req:
		WARN_ON(!sdev);
		if (sdev && sdev->ops && sdev->ops->fault_cb) {
			int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
				(req->exe_req << 1) | (req->pm_req);
			sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
					    req->priv_data, rwxp, result);
		}
		/* We get here in the error case where the PASID lookup failed,
		   and these can be NULL. Do not use them below this point! */
		sdev = NULL;
		svm = NULL;
no_pasid:
		if (req->lpig || req->priv_data_present) {
			/*
			 * Per VT-d spec. v3.0 ch7.7, system software must
			 * respond with a page group response if private data
			 * is present (PDP) or the last page in group (LPIG)
			 * bit is set. This is an additional VT-d feature
			 * beyond the PCI ATS spec.
			 */
			resp.qw0 = QI_PGRP_PASID(req->pasid) |
				QI_PGRP_DID(req->rid) |
				QI_PGRP_PASID_P(req->pasid_present) |
				QI_PGRP_PDP(req->priv_data_present) |
				QI_PGRP_RESP_CODE(result) |
				QI_PGRP_RESP_TYPE;
			resp.qw1 = QI_PGRP_IDX(req->prg_index) |
				QI_PGRP_LPIG(req->lpig);
			resp.qw2 = 0;
			resp.qw3 = 0;

			if (req->priv_data_present)
				memcpy(&resp.qw2, req->priv_data,
				       sizeof(req->priv_data));
			qi_submit_sync(iommu, &resp, 1, 0);
		}
prq_advance:
		head = (head + sizeof(*req)) & PRQ_RING_MASK;
	}

	dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);

	/*
	 * Clear the page request overflow bit and wake up all threads that
	 * are waiting for the completion of this handling.
	 */
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n",
				    iommu->name);
		head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
		tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
		if (head == tail) {
			writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
			pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared\n",
					    iommu->name);
		}
	}

	if (!completion_done(&iommu->prq_complete))
		complete(&iommu->prq_complete);

	return IRQ_RETVAL(handled);
}

#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
struct iommu_sva *
intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
{
	struct iommu_sva *sva = ERR_PTR(-EINVAL);
	struct intel_svm_dev *sdev = NULL;
	unsigned int flags = 0;
	int ret;

	/*
	 * TODO: Consolidate with generic iommu-sva bind after it is merged.
	 * It will require shared SVM data structures, i.e. combine io_mm
	 * and intel_svm etc.
	 */
	if (drvdata)
		flags = *(unsigned int *)drvdata;
	mutex_lock(&pasid_mutex);
	ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
	if (ret)
		sva = ERR_PTR(ret);
	else if (sdev)
		sva = &sdev->sva;
	else
		WARN(!sdev, "SVM bind succeeded with no sdev!\n");

	mutex_unlock(&pasid_mutex);

	return sva;
}

void intel_svm_unbind(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	intel_svm_unbind_mm(sdev->dev, sdev->pasid);
	mutex_unlock(&pasid_mutex);
}

u32 intel_svm_get_pasid(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;
	u32 pasid;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	pasid = sdev->pasid;
	mutex_unlock(&pasid_mutex);

	return pasid;
}

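/**
 * intel_svm_page_response - send back a response for a page request
 * @dev: the device that reported the fault
 * @evt: the original fault event reported to the handler
 * @msg: the response message, including the response code
 *
 * Turn a page response coming from the fault handler (e.g. a guest via
 * VFIO) into a page group response descriptor and submit it through the
 * invalidation queue.
 */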
int intel_svm_page_response(struct device *dev,
			    struct iommu_fault_event *evt,
			    struct iommu_page_response *msg)
{
	struct iommu_fault_page_request *prm;
	struct intel_svm_dev *sdev = NULL;
	struct intel_svm *svm = NULL;
	struct intel_iommu *iommu;
	bool private_present;
	bool pasid_present;
	bool last_page;
	u8 bus, devfn;
	int ret = 0;
	u16 sid;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!msg || !evt)
		return -EINVAL;

	mutex_lock(&pasid_mutex);

	prm = &evt->fault.prm;
	sid = PCI_DEVID(bus, devfn);
	pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
	private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
	last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;

	if (!pasid_present) {
		ret = -EINVAL;
		goto out;
	}

	if (prm->pasid == 0 || prm->pasid >= PASID_MAX) {
		ret = -EINVAL;
		goto out;
	}

	ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);
	if (ret || !sdev) {
		ret = -ENODEV;
		goto out;
	}

	/*
	 * For responses from userspace, we need to make sure that the
	 * PASID has been bound to its mm.
	 */
	if (svm->flags & SVM_FLAG_GUEST_MODE) {
		struct mm_struct *mm;

		mm = get_task_mm(current);
		if (!mm) {
			ret = -EINVAL;
			goto out;
		}

		if (mm != svm->mm) {
			ret = -ENODEV;
			mmput(mm);
			goto out;
		}

		mmput(mm);
	}

	/*
	 * Per VT-d spec. v3.0 ch7.7, system software must respond
	 * with page group response if private data is present (PDP)
	 * or last page in group (LPIG) bit is set. This is an
	 * additional VT-d requirement beyond PCI ATS spec.
	 */
	if (last_page || private_present) {
		struct qi_desc desc;

		desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
				QI_PGRP_PASID_P(pasid_present) |
				QI_PGRP_PDP(private_present) |
				QI_PGRP_RESP_CODE(msg->code) |
				QI_PGRP_RESP_TYPE;
		desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
		desc.qw2 = 0;
		desc.qw3 = 0;
		if (private_present)
			memcpy(&desc.qw2, prm->private_data,
			       sizeof(prm->private_data));

		qi_submit_sync(iommu, &desc, 1, 0);
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}