// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/mm.h>

#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION  "0.1"
#define DRIVER_AUTHOR   "aik@ozlabs.ru"
#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

struct tce_iommu_group {
	struct list_head next;
	struct iommu_group *grp;
};

/*
 * A container needs to remember which preregistered regions it has
 * referenced so it can do proper cleanup at userspace process exit.
 */
struct tce_iommu_prereg {
	struct list_head next;
	struct mm_iommu_table_group_mem_t *mem;
};

/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;
	bool enabled;
	bool v2;
	bool def_window_pending;
	unsigned long locked_pages;
	struct mm_struct *mm;
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;
	struct list_head prereg_list;
};

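/*
 * Pin the calling process's mm to the container.  The first caller wins;
 * a later caller with a different mm gets -EPERM.  The reference taken
 * via mmgrab() is dropped in tce_iommu_release().
 */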
static long tce_iommu_mm_set(struct tce_container *container)
{
	if (container->mm) {
		if (container->mm == current->mm)
			return 0;
		return -EPERM;
	}
	BUG_ON(!current->mm);
	container->mm = current->mm;
	mmgrab(container->mm);

	return 0;
}

static long tce_iommu_prereg_free(struct tce_container *container,
		struct tce_iommu_prereg *tcemem)
{
	long ret;

	ret = mm_iommu_put(container->mm, tcemem->mem);
	if (ret)
		return ret;

	list_del(&tcemem->next);
	kfree(tcemem);

	return 0;
}

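/*
 * Handle VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: look up the preregistered
 * region by its userspace address and size, and drop the reference this
 * container holds on it.
 */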
static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;
	struct tce_iommu_prereg *tcemem;
	bool found = false;
	long ret;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	list_for_each_entry(tcemem, &container->prereg_list, next) {
		if (tcemem->mem == mem) {
			found = true;
			break;
		}
	}

	if (!found)
		ret = -ENOENT;
	else
		ret = tce_iommu_prereg_free(container, tcemem);

	mm_iommu_put(container->mm, mem);

	return ret;
}

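/*
 * Handle VFIO_IOMMU_SPAPR_REGISTER_MEMORY: preregister a chunk of userspace
 * memory so that v2 DMA mappings can later translate userspace addresses to
 * host physical addresses via the per-mm cache instead of pinning pages in
 * the map path.  Registering the same region twice returns -EBUSY.
 */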
static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	struct tce_iommu_prereg *tcemem;
	unsigned long entries = size >> PAGE_SHIFT;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	mem = mm_iommu_get(container->mm, vaddr, entries);
	if (mem) {
		list_for_each_entry(tcemem, &container->prereg_list, next) {
			if (tcemem->mem == mem) {
				ret = -EBUSY;
				goto put_exit;
			}
		}
	} else {
		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
		if (ret)
			return ret;
	}

	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
	if (!tcemem) {
		ret = -ENOMEM;
		goto put_exit;
	}

	tcemem->mem = mem;
	list_add(&tcemem->next, &container->prereg_list);

	container->enabled = true;

	return 0;

put_exit:
	mm_iommu_put(container->mm, mem);
	return ret;
}

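/*
 * Check that the host page backing @hpa is at least as big as one IOMMU
 * page (1 << it_page_shift) so a TCE entry cannot give the device access
 * beyond the memory that was pinned for it.
 */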
static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
		unsigned int it_page_shift)
{
	struct page *page;
	unsigned long size = 0;

	if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
		return size == (1UL << it_page_shift);

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return page_shift(compound_head(page)) >= it_page_shift;
}

static inline bool tce_groups_attached(struct tce_container *container)
{
	return !list_empty(&container->group_list);
}

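/*
 * Find the DMA window (iommu_table) covering @ioba: returns the table
 * index and sets *ptbl, or -1 if no window contains the address.
 */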
static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}

static int tce_iommu_find_free_table(struct tce_container *container)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!container->tables[i])
			return i;
	}

	return -ENOSPC;
}

static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap.  For powerpc, the map/unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult or impossible
	 * to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled.  The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also, we don't have a nice way to fail on H_PUT_TCE due to ulimits;
	 * that would effectively kill the guest at random points, so it is
	 * much better to enforce the limit based on the maximum that the
	 * guest can map.
	 *
	 * Unfortunately at the moment it counts whole tables, no matter how
	 * much memory the guest has. I.e. for a 4GB guest and 4 IOMMU groups,
	 * each with a 2GB DMA window, 8GB will be counted here. The reason for
	 * this is that we cannot tell here the amount of RAM used by the guest
	 * as this information is only available from KVM and VFIO is
	 * KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	ret = tce_iommu_mm_set(container);
	if (ret)
		return ret;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = account_locked_vm(container->mm, locked, true);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}

static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	BUG_ON(!container->mm);
	account_locked_vm(container->mm, container->locked_pages, false);
}

static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->group_list);
	INIT_LIST_HEAD_RCU(&container->prereg_list);

	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

	return container;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl);

static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	struct tce_iommu_group *tcegrp;
	struct tce_iommu_prereg *tcemem, *tmtmp;
	long i;

	while (tce_groups_attached(container)) {
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		tce_iommu_detach_group(iommu_data, tcegrp->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed of by
	 * tce_iommu_detach_group(), so do it now.
	 */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(container, tbl);
	}

	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
		WARN_ON(tce_iommu_prereg_free(container, tcemem));

	tce_iommu_disable(container);
	if (container->mm)
		mmdrop(container->mm);
	mutex_destroy(&container->lock);

	kfree(container);
}

static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long hpa)
{
	struct page *page;

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	unpin_user_page(page);
}

static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
		unsigned long tce, unsigned long shift,
		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem;

	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
	if (!mem)
		return -EINVAL;

	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
	if (ret)
		return -EINVAL;

	*pmem = mem;

	return 0;
}

static void tce_iommu_unuse_page_v2(struct tce_container *container,
		struct iommu_table *tbl, unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	int ret;
	unsigned long hpa = 0;
	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);

	if (!pua)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
			tbl->it_page_shift, &hpa, &mem);
	if (ret)
		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
				__func__, be64_to_cpu(*pua), entry, ret);
	if (mem)
		mm_iommu_mapped_dec(mem);

	*pua = cpu_to_be64(0);
}

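/*
 * Clear @pages TCE entries starting at @entry: exchange each entry with an
 * empty one and release the host page (or, in the v2 case, the reference on
 * the preregistered memory region) that was mapped there.
 */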
static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;
	unsigned long lastentry = entry + pages, firstentry = entry;

	for ( ; entry < lastentry; ++entry) {
		if (tbl->it_indirect_levels && tbl->it_userspace) {
			/*
			 * For multilevel tables, we can take a shortcut here
			 * and skip some TCEs as we know that the userspace
			 * address cache is a mirror of the real TCE table,
			 * and if it is missing some indirect levels, then
			 * the hardware table does not have them allocated
			 * either and therefore does not require updating.
			 */
			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
					entry);
			if (!pua) {
				/* align to level_size, which is a power of two */
				entry |= tbl->it_level_size - 1;
				continue;
			}
		}

		cond_resched();

		direction = DMA_NONE;
		oldhpa = 0;
		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
				&direction);
		if (ret)
			continue;

		if (direction == DMA_NONE)
			continue;

		if (container->v2) {
			tce_iommu_unuse_page_v2(container, tbl, entry);
			continue;
		}

		tce_iommu_unuse_page(container, oldhpa);
	}

	iommu_tce_kill(tbl, firstentry, pages);

	return 0;
}

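/*
 * Pin a single userspace page for a v1 mapping and return its host
 * physical address in @hpa.
 */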
static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (pin_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
			&page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}

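/*
 * v1 map path: pin each userspace page and program it into @pages
 * consecutive TCE entries starting at @entry.  On failure, the entries
 * already programmed are cleared again.
 */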
static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		if (!tce_page_is_contained(container->mm, hpa,
				tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
				&hpa, &dirtmp);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(container, hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);
	else
		iommu_tce_kill(tbl, entry, pages);

	return ret;
}

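/*
 * v2 map path: translate userspace addresses through the preregistered
 * memory list (no pinning here), program the TCEs and remember the
 * userspace address in the table's userspace view for later unmapping.
 */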
static long tce_iommu_build_v2(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		struct mm_iommu_table_group_mem_t *mem = NULL;
		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);

		ret = tce_iommu_prereg_ua_to_hpa(container,
				tce, tbl->it_page_shift, &hpa, &mem);
		if (ret)
			break;

		if (!tce_page_is_contained(container->mm, hpa,
				tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		/* Preserve offset within IOMMU page */
		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
		dirtmp = direction;

		/* The registered region is being unregistered */
		if (mm_iommu_mapped_inc(mem))
			break;

		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
				&hpa, &dirtmp);
		if (ret) {
			/* dirtmp cannot be DMA_NONE here */
			tce_iommu_unuse_page_v2(container, tbl, entry + i);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page_v2(container, tbl, entry + i);

		*pua = cpu_to_be64(tce);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);
	else
		iommu_tce_kill(tbl, entry, pages);

	return ret;
}

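/*
 * Allocate a TCE table via the platform ops, charging the table's memory
 * to the container owner's locked_vm.
 */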
static long tce_iommu_create_table(struct tce_container *container,
			struct iommu_table_group *table_group,
			int num,
			__u32 page_shift,
			__u64 window_size,
			__u32 levels,
			struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));

	return ret;
}

static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl)
{
	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

	iommu_tce_table_put(tbl);
	account_locked_vm(container->mm, pages, false);
}

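/*
 * Create a DMA window in a free table slot and program it into every
 * attached group; the window's bus address is returned in @start_addr.
 */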
static long tce_iommu_create_window(struct tce_container *container,
		__u32 page_shift, __u64 window_size, __u32 levels,
		__u64 *start_addr)
{
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;
	struct iommu_table *tbl = NULL;
	long ret, num;

	num = tce_iommu_find_free_table(container);
	if (num < 0)
		return num;

	/* Get the first group for ops::create_table */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -EFAULT;

	if (!(table_group->pgsizes & (1ULL << page_shift)))
		return -EINVAL;

	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
			!table_group->ops->get_table_size ||
			!table_group->ops->create_table)
		return -EPERM;

	/* Create TCE table */
	ret = tce_iommu_create_table(container, table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	BUG_ON(!tbl->it_ops->free);

	/*
	 * Program the table into every group.
	 * Groups have been tested for compatibility at attach time.
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			goto unset_exit;
	}

	container->tables[num] = tbl;

	/* Return start address assigned by platform in create_table() */
	*start_addr = tbl->it_offset << tbl->it_page_shift;

	return 0;

unset_exit:
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		table_group->ops->unset_window(table_group, num);
	}
	tce_iommu_free_table(container, tbl);

	return ret;
}

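/*
 * Remove the DMA window starting at @start_addr: unprogram it from every
 * attached group, clear its TCEs and free the table.
 */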
static long tce_iommu_remove_window(struct tce_container *container,
		__u64 start_addr)
{
	struct iommu_table_group *table_group = NULL;
	struct iommu_table *tbl;
	struct tce_iommu_group *tcegrp;
	int num;

	num = tce_iommu_find_table(container, start_addr, &tbl);
	if (num < 0)
		return -EINVAL;

	BUG_ON(!tbl->it_size);

	/* Detach groups from IOMMUs */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		/*
		 * SPAPR TCE IOMMU exposes the default DMA window to
		 * the guest via dma32_window_start/size of
		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
		 * userspace to remove this window and some do not, so
		 * here we check for the platform capability.
		 */
		if (!table_group->ops || !table_group->ops->unset_window)
			return -EPERM;

		table_group->ops->unset_window(table_group, num);
	}

	/* Free table */
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
	tce_iommu_free_table(container, tbl);
	container->tables[num] = NULL;

	return 0;
}

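/*
 * Create the 32-bit default DMA window if one was requested at attach
 * time but has not been created yet; its creation is deferred until the
 * first map/unmap or window create request needs it.
 */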
static long tce_iommu_create_default_window(struct tce_container *container)
{
	long ret;
	__u64 start_addr = 0;
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;

	if (!container->def_window_pending)
		return 0;

	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
			table_group->tce32_size, 1, &start_addr);
	WARN_ON_ONCE(!ret && start_addr);

	if (!ret)
		container->def_window_pending = false;

	return ret;
}

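/*
 * Main ioctl handler for the SPAPR TCE IOMMU backend.
 *
 * As a rough, illustrative userspace sequence for the v2 flavour (error
 * handling omitted):
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);
 *	ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 *
 * The v1 flavour skips memory preregistration and window creation and
 * instead brackets the mappings with VFIO_IOMMU_ENABLE/VFIO_IOMMU_DISABLE.
 */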
static long tce_iommu_ioctl(void *iommu_data,
				 unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;
	}

	/*
	 * Sanity check to prevent one userspace from manipulating
	 * another userspace's mm.
	 */
	BUG_ON(!container);
	if (container->mm && container->mm != current->mm)
		return -EPERM;

	switch (cmd) {
	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		if (!container->mm)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
					cmd, arg);
			if (ret)
				return ret;
		}
		return ret;
	}

	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_create_default_window(container);
		if (!ret)
			ret = tce_iommu_create_window(container,
					create.page_shift,
					create.window_size, create.levels,
					&create.start_addr);

		mutex_unlock(&container->lock);

		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		if (container->def_window_pending && !remove.start_addr) {
			container->def_window_pending = false;
			return 0;
		}

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}

static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		if (tbl->it_map)
			iommu_release_ownership(tbl);

		container->tables[i] = NULL;
	}
}

static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = iommu_take_ownership(tbl);
		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		container->tables[i] = table_group->tables[i];

	return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		if (container->tables[i])
			table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i, ret = 0;

	if (!table_group->ops->create_table || !table_group->ops->set_window ||
			!table_group->ops->release_ownership) {
		WARN_ON_ONCE(1);
		return -EFAULT;
	}

	table_group->ops->take_ownership(table_group);

	/* Set all windows to the new group */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		ret = table_group->ops->set_window(table_group, i, tbl);
		if (ret)
			goto release_exit;
	}

	return 0;

release_exit:
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);

	return ret;
}

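/*
 * Attach an IOMMU group to the container: verify it is compatible with any
 * already attached groups, then take ownership of its tables either
 * directly (v1, no dynamic window ops) or via the platform's ownership ops
 * (v2 with dynamic DMA windows).
 */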
static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	int ret = 0;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp = NULL;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	table_group = iommu_group_get_iommudata(iommu_group);
	if (!table_group) {
		ret = -ENODEV;
		goto unlock_exit;
	}

	if (tce_groups_attached(container) && (!table_group->ops ||
			!table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)) {
		ret = -EBUSY;
		goto unlock_exit;
	}

	/* Check if new group has the same iommu_ops (i.e. compatible) */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *table_group_tmp;

		if (tcegrp->grp == iommu_group) {
			pr_warn("tce_vfio: Group %d is already attached\n",
					iommu_group_id(iommu_group));
			ret = -EBUSY;
			goto unlock_exit;
		}
		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
		if (table_group_tmp->ops->create_table !=
				table_group->ops->create_table) {
			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
					iommu_group_id(iommu_group),
					iommu_group_id(tcegrp->grp));
			ret = -EPERM;
			goto unlock_exit;
		}
	}

	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
	if (!tcegrp) {
		ret = -ENOMEM;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership) {
		if (container->v2) {
			ret = -EPERM;
			goto free_exit;
		}
		ret = tce_iommu_take_ownership(container, table_group);
	} else {
		if (!container->v2) {
			ret = -EPERM;
			goto free_exit;
		}
		ret = tce_iommu_take_ownership_ddw(container, table_group);
		if (!tce_groups_attached(container) && !container->tables[0])
			container->def_window_pending = true;
	}

	if (!ret) {
		tcegrp->grp = iommu_group;
		list_add(&tcegrp->next, &container->group_list);
	}

free_exit:
	if (ret && tcegrp)
		kfree(tcegrp);

unlock_exit:
	mutex_unlock(&container->lock);

	return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	bool found = false;
	struct tce_iommu_group *tcegrp;

	mutex_lock(&container->lock);

	list_for_each_entry(tcegrp, &container->group_list, next) {
		if (tcegrp->grp == iommu_group) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_warn("tce_vfio: detaching unattached group #%u\n",
				iommu_group_id(iommu_group));
		goto unlock_exit;
	}

	list_del(&tcegrp->next);
	kfree(tcegrp);

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}

static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);