1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * VFIO: IOMMU DMA mapping support for TCE on POWER
4 *
5 * Copyright (C) 2013 IBM Corp.  All rights reserved.
6 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
7 * Copyright Gavin Shan, IBM Corporation 2014.
8 *
9 * Derived from original vfio_iommu_type1.c:
10 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
11 *     Author: Alex Williamson <alex.williamson@redhat.com>
12 */
13
14#include <linux/module.h>
15#include <linux/pci.h>
16#include <linux/slab.h>
17#include <linux/uaccess.h>
18#include <linux/err.h>
19#include <linux/vfio.h>
20#include <linux/vmalloc.h>
21#include <linux/sched/mm.h>
22#include <linux/sched/signal.h>
23#include <linux/mm.h>
24#include "vfio.h"
25
26#include <asm/iommu.h>
27#include <asm/tce.h>
28#include <asm/mmu_context.h>
29
30#define DRIVER_VERSION  "0.1"
31#define DRIVER_AUTHOR   "aik@ozlabs.ru"
32#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
33
34static void tce_iommu_detach_group(void *iommu_data,
35		struct iommu_group *iommu_group);
36
37/*
38 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
39 *
40 * This code handles mapping and unmapping of user data buffers
41 * into DMA'ble space using the IOMMU
42 */
43
/* Links one attached IOMMU group into a container's group_list */
struct tce_iommu_group {
	struct list_head next;		/* entry in tce_container::group_list */
	struct iommu_group *grp;	/* the attached IOMMU group */
};
48
49/*
50 * A container needs to remember which preregistered region  it has
51 * referenced to do proper cleanup at the userspace process exit.
52 */
struct tce_iommu_prereg {
	struct list_head next;	/* entry in tce_container::prereg_list */
	/* Preregistered memory descriptor, refcounted via mm_iommu_get/put */
	struct mm_iommu_table_group_mem_t *mem;
};
57
58/*
59 * The container descriptor supports only a single group per container.
60 * Required by the API as the container is not supplied with the IOMMU group
61 * at the moment of initialization.
62 */
struct tce_container {
	struct mutex lock;		/* serializes container state changes */
	/* Set by VFIO_IOMMU_ENABLE (v1) or by memory preregistration (v2) */
	bool enabled;
	bool v2;			/* opened as VFIO_SPAPR_TCE_v2_IOMMU */
	/* Default DMA window not created yet, see
	 * tce_iommu_create_default_window() */
	bool def_window_pending;
	/* Pages charged to mm via account_locked_vm(), undone on disable */
	unsigned long locked_pages;
	/* Owner mm, set once in tce_iommu_mm_set() and held via mmgrab() */
	struct mm_struct *mm;
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;	/* attached tce_iommu_group entries */
	struct list_head prereg_list;	/* tce_iommu_prereg entries */
};
74
75static long tce_iommu_mm_set(struct tce_container *container)
76{
77	if (container->mm) {
78		if (container->mm == current->mm)
79			return 0;
80		return -EPERM;
81	}
82	BUG_ON(!current->mm);
83	container->mm = current->mm;
84	mmgrab(container->mm);
85
86	return 0;
87}
88
89static long tce_iommu_prereg_free(struct tce_container *container,
90		struct tce_iommu_prereg *tcemem)
91{
92	long ret;
93
94	ret = mm_iommu_put(container->mm, tcemem->mem);
95	if (ret)
96		return ret;
97
98	list_del(&tcemem->next);
99	kfree(tcemem);
100
101	return 0;
102}
103
/*
 * Unregister a previously preregistered region [vaddr, vaddr + size).
 * Both vaddr and size must be page aligned and the exact region must
 * have been registered on this container.
 */
static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;
	struct tce_iommu_prereg *tcemem;
	bool found = false;
	long ret;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	/* Takes a temporary reference, dropped at the bottom */
	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	/* Only regions registered via this container may be unregistered */
	list_for_each_entry(tcemem, &container->prereg_list, next) {
		if (tcemem->mem == mem) {
			found = true;
			break;
		}
	}

	if (!found)
		ret = -ENOENT;
	else
		ret = tce_iommu_prereg_free(container, tcemem);

	/* Drop the reference taken by mm_iommu_get() above */
	mm_iommu_put(container->mm, mem);

	return ret;
}
135
/*
 * Preregister (pin) the userspace region [vaddr, vaddr + size) so the
 * v2 map path can translate without pinning at map time. Registering
 * the same region twice on one container returns -EBUSY. A successful
 * registration also marks the container enabled.
 */
static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	struct tce_iommu_prereg *tcemem;
	unsigned long entries = size >> PAGE_SHIFT;

	/* Page-aligned, non-wrapping regions only */
	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	/* Either reuse an existing registration or create a new one */
	mem = mm_iommu_get(container->mm, vaddr, entries);
	if (mem) {
		list_for_each_entry(tcemem, &container->prereg_list, next) {
			if (tcemem->mem == mem) {
				/* Already on this container's list */
				ret = -EBUSY;
				goto put_exit;
			}
		}
	} else {
		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
		if (ret)
			return ret;
	}

	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
	if (!tcemem) {
		ret = -ENOMEM;
		goto put_exit;
	}

	/* The reference taken above is kept until unregistration */
	tcemem->mem = mem;
	list_add(&tcemem->next, &container->prereg_list);

	container->enabled = true;

	return 0;

put_exit:
	mm_iommu_put(container->mm, mem);
	return ret;
}
179
/*
 * Check that a TCE of 1 << it_page_shift bytes pointing at @hpa is
 * fully backed by a single physical page (or devmem chunk), so the
 * device cannot DMA beyond the memory actually granted.
 */
static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
		unsigned int it_page_shift)
{
	struct page *page;
	unsigned long size = 0;

	/* Device memory is validated against the devmem chunk size */
	if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
		return size == (1UL << it_page_shift);

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk that it should.
	 */
	return page_shift(compound_head(page)) >= it_page_shift;
}
197
/* Return true when at least one IOMMU group is attached to @container */
static inline bool tce_groups_attached(struct tce_container *container)
{
	return !list_empty(&container->group_list);
}
202
203static long tce_iommu_find_table(struct tce_container *container,
204		phys_addr_t ioba, struct iommu_table **ptbl)
205{
206	long i;
207
208	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
209		struct iommu_table *tbl = container->tables[i];
210
211		if (tbl) {
212			unsigned long entry = ioba >> tbl->it_page_shift;
213			unsigned long start = tbl->it_offset;
214			unsigned long end = start + tbl->it_size;
215
216			if ((start <= entry) && (entry < end)) {
217				*ptbl = tbl;
218				return i;
219			}
220		}
221	}
222
223	return -1;
224}
225
226static int tce_iommu_find_free_table(struct tce_container *container)
227{
228	int i;
229
230	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
231		if (!container->tables[i])
232			return i;
233	}
234
235	return -ENOSPC;
236}
237
/*
 * Handle VFIO_IOMMU_ENABLE (v1): charge the worst case locked memory
 * (the whole 32bit DMA window) to the owner mm and mark the container
 * enabled. Requires at least one attached group so the window size is
 * known; undone by tce_iommu_disable().
 */
static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap.  For powerpc, the map unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult to impossible
	 * to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled.  The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
	 * that would effectively kill the guest at random points, much better
	 * enforcing the limit based on the max that the guest can map.
	 *
	 * Unfortunately at the moment it counts whole tables, no matter how
	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
	 * each with 2GB DMA window, 8GB will be counted here. The reason for
	 * this is that we cannot tell here the amount of RAM used by the guest
	 * as this information is only available from KVM and VFIO is
	 * KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
	if (!tce_groups_attached(container))
		return -ENODEV;

	/* Use the first group's 32bit window for the worst-case estimate */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	/* Bind the container to current->mm for locked-vm accounting */
	ret = tce_iommu_mm_set(container);
	if (ret)
		return ret;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = account_locked_vm(container->mm, locked, true);
	if (ret)
		return ret;

	/* Remember the charge so tce_iommu_disable() can undo it */
	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}
304
305static void tce_iommu_disable(struct tce_container *container)
306{
307	if (!container->enabled)
308		return;
309
310	container->enabled = false;
311
312	BUG_ON(!container->mm);
313	account_locked_vm(container->mm, container->locked_pages, false);
314}
315
/*
 * VFIO backend "open": allocate and initialize a container for either
 * the v1 or v2 SPAPR TCE IOMMU flavor. Returns the container or an
 * ERR_PTR on failure.
 */
static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->group_list);
	INIT_LIST_HEAD_RCU(&container->prereg_list);

	/* The requested type selects the v1 or v2 API behavior */
	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

	return container;
}
337
338static int tce_iommu_clear(struct tce_container *container,
339		struct iommu_table *tbl,
340		unsigned long entry, unsigned long pages);
341static void tce_iommu_free_table(struct tce_container *container,
342		struct iommu_table *tbl);
343
/*
 * VFIO backend "release": detach all groups, clear and dispose any
 * tables still owned by the container, free preregistered regions,
 * undo locked-vm accounting and drop the mm reference.
 */
static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	struct tce_iommu_group *tcegrp;
	struct tce_iommu_prereg *tcemem, *tmtmp;
	long i;

	/* Detach every group still on the list */
	while (tce_groups_attached(container)) {
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		tce_iommu_detach_group(iommu_data, tcegrp->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed
	 * by tce_iommu_detach_group() so do it now.
	 */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(container, tbl);
	}

	/* Drop every remaining preregistered region */
	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
		WARN_ON(tce_iommu_prereg_free(container, tcemem));

	tce_iommu_disable(container);
	if (container->mm)
		mmdrop(container->mm);
	mutex_destroy(&container->lock);

	kfree(container);
}
381
382static void tce_iommu_unuse_page(unsigned long hpa)
383{
384	struct page *page;
385
386	page = pfn_to_page(hpa >> PAGE_SHIFT);
387	unpin_user_page(page);
388}
389
390static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
391		unsigned long tce, unsigned long shift,
392		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
393{
394	long ret = 0;
395	struct mm_iommu_table_group_mem_t *mem;
396
397	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
398	if (!mem)
399		return -EINVAL;
400
401	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
402	if (ret)
403		return -EINVAL;
404
405	*pmem = mem;
406
407	return 0;
408}
409
/*
 * Release a v2 mapping at @entry: look up the preregistered region via
 * the cached userspace address, decrement its mapped counter and clear
 * the cached address. Quietly skips entries with no cached address.
 */
static void tce_iommu_unuse_page_v2(struct tce_container *container,
		struct iommu_table *tbl, unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	int ret;
	unsigned long hpa = 0;
	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);

	if (!pua)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
			tbl->it_page_shift, &hpa, &mem);
	if (ret)
		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
				__func__, be64_to_cpu(*pua), entry, ret);
	if (mem)
		mm_iommu_mapped_dec(mem);

	/* Invalidate the cached userspace address for this entry */
	*pua = cpu_to_be64(0);
}
431
/*
 * Clear @pages TCE entries starting at @entry in @tbl: exchange each
 * entry with an empty one and release the page it referenced (unpin
 * for v1, mapped-counter decrement for v2). Entries which fail to
 * exchange are skipped; always returns 0. Notifies the hardware once
 * for the whole range via iommu_tce_kill().
 */
static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;
	unsigned long lastentry = entry + pages, firstentry = entry;

	for ( ; entry < lastentry; ++entry) {
		if (tbl->it_indirect_levels && tbl->it_userspace) {
			/*
			 * For multilevel tables, we can take a shortcut here
			 * and skip some TCEs as we know that the userspace
			 * addresses cache is a mirror of the real TCE table
			 * and if it is missing some indirect levels, then
			 * the hardware table does not have them allocated
			 * either and therefore does not require updating.
			 */
			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
					entry);
			if (!pua) {
				/* align to level_size which is power of two */
				entry |= tbl->it_level_size - 1;
				continue;
			}
		}

		/* The range may be large; do not hog the CPU */
		cond_resched();

		direction = DMA_NONE;
		oldhpa = 0;
		/* Swap in an empty TCE; oldhpa/direction get the old value */
		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
				&direction);
		if (ret)
			continue;

		/* DMA_NONE means the entry was already clear */
		if (direction == DMA_NONE)
			continue;

		if (container->v2) {
			tce_iommu_unuse_page_v2(container, tbl, entry);
			continue;
		}

		tce_iommu_unuse_page(oldhpa);
	}

	iommu_tce_kill(tbl, firstentry, pages);

	return 0;
}
484
/*
 * Pin the single userspace page backing @tce and return its host
 * physical address in *hpa. The pin is writable unless the TCE
 * permission bits request a DMA_TO_DEVICE (read-only) mapping.
 */
static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (pin_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
			&page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}
499
/*
 * v1 map path: pin userspace pages one by one and program @pages TCEs
 * starting at @entry. On any failure the entries programmed so far are
 * cleared (and their pages unpinned); on success the hardware is
 * notified once via iommu_tce_kill().
 */
static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		/* Offset of tce within an IOMMU page, below system page size */
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		if (!tce_page_is_contained(container->mm, hpa,
				tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
				&hpa, &dirtmp);
		if (ret) {
			/* Exchange failed: drop the pin we just took */
			tce_iommu_unuse_page(hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		/* A non-empty old entry was displaced: release its page */
		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);
	else
		iommu_tce_kill(tbl, entry, pages);

	return ret;
}
547
/*
 * v2 map path: translate each TCE through the preregistered memory
 * (no pinning at map time), program the hardware entry and cache the
 * userspace address for later cleanup. On failure the entries
 * programmed so far are cleared.
 */
static long tce_iommu_build_v2(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		struct mm_iommu_table_group_mem_t *mem = NULL;
		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);

		ret = tce_iommu_prereg_ua_to_hpa(container,
				tce, tbl->it_page_shift, &hpa, &mem);
		if (ret)
			break;

		if (!tce_page_is_contained(container->mm, hpa,
				tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		/* Preserve offset within IOMMU page */
		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
		dirtmp = direction;

		/* The registered region is being unregistered */
		if (mm_iommu_mapped_inc(mem))
			break;

		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
				&hpa, &dirtmp);
		if (ret) {
			/* dirtmp cannot be DMA_NONE here */
			tce_iommu_unuse_page_v2(container, tbl, entry + i);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		/* A non-empty old entry was displaced: release its region */
		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page_v2(container, tbl, entry + i);

		/* Cache the userspace address for tce_iommu_unuse_page_v2() */
		*pua = cpu_to_be64(tce);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);
	else
		iommu_tce_kill(tbl, entry, pages);

	return ret;
}
606
/*
 * Allocate a TCE table via the platform ops, charging the table's own
 * memory footprint against the owner mm's locked memory limit first.
 * On success *ptbl holds the new table.
 */
static long tce_iommu_create_table(struct tce_container *container,
			struct iommu_table_group *table_group,
			int num,
			__u32 page_shift,
			__u64 window_size,
			__u32 levels,
			struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	/* Account the table memory before allocating it */
	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	/* Sanity: the table must be freeable and fit the accounted size */
	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));

	return ret;
}
634
635static void tce_iommu_free_table(struct tce_container *container,
636		struct iommu_table *tbl)
637{
638	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
639
640	iommu_tce_table_put(tbl);
641	account_locked_vm(container->mm, pages, false);
642}
643
/*
 * Create a DMA window in the first free table slot and program it into
 * every attached group. On success *start_addr receives the window's
 * bus address chosen by the platform. On failure the window is unset
 * from all groups and the table is freed.
 */
static long tce_iommu_create_window(struct tce_container *container,
		__u32 page_shift, __u64 window_size, __u32 levels,
		__u64 *start_addr)
{
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;
	struct iommu_table *tbl = NULL;
	long ret, num;

	num = tce_iommu_find_free_table(container);
	if (num < 0)
		return num;

	/* Get the first group for ops::create_table */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -EFAULT;

	/* The requested IOMMU page size must be supported by the hardware */
	if (!(table_group->pgsizes & (1ULL << page_shift)))
		return -EINVAL;

	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
			!table_group->ops->get_table_size ||
			!table_group->ops->create_table)
		return -EPERM;

	/* Create TCE table */
	ret = tce_iommu_create_table(container, table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	BUG_ON(!tbl->it_ops->free);

	/*
	 * Program the table to every group.
	 * Groups have been tested for compatibility at the attach time.
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			goto unset_exit;
	}

	container->tables[num] = tbl;

	/* Return start address assigned by platform in create_table() */
	*start_addr = tbl->it_offset << tbl->it_page_shift;

	return 0;

unset_exit:
	/* Roll back: unset the window from every group and free the table */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		table_group->ops->unset_window(table_group, num);
	}
	tce_iommu_free_table(container, tbl);

	return ret;
}
708
/*
 * Remove the DMA window starting at @start_addr: unset it from every
 * attached group, clear all its TCEs and free the table.
 */
static long tce_iommu_remove_window(struct tce_container *container,
		__u64 start_addr)
{
	struct iommu_table_group *table_group = NULL;
	struct iommu_table *tbl;
	struct tce_iommu_group *tcegrp;
	int num;

	num = tce_iommu_find_table(container, start_addr, &tbl);
	if (num < 0)
		return -EINVAL;

	BUG_ON(!tbl->it_size);

	/* Detach groups from IOMMUs */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		/*
		 * SPAPR TCE IOMMU exposes the default DMA window to
		 * the guest via dma32_window_start/size of
		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
		 * the userspace to remove this window, some do not so
		 * here we check for the platform capability.
		 */
		if (!table_group->ops || !table_group->ops->unset_window)
			return -EPERM;

		table_group->ops->unset_window(table_group, num);
	}

	/* Free table */
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
	tce_iommu_free_table(container, tbl);
	container->tables[num] = NULL;

	return 0;
}
747
/*
 * Create the default 32bit 4K-page DMA window if one is still pending.
 * Called lazily from the map/unmap/create paths; a no-op when no
 * default window is pending. The default window is expected to start
 * at bus address 0.
 */
static long tce_iommu_create_default_window(struct tce_container *container)
{
	long ret;
	__u64 start_addr = 0;
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;

	if (!container->def_window_pending)
		return 0;

	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
			table_group->tce32_size, 1, &start_addr);
	/* The default window must start at bus address 0 */
	WARN_ON_ONCE(!ret && start_addr);

	if (!ret)
		container->def_window_pending = false;

	return ret;
}
776
777static long vfio_spapr_ioctl_eeh_pe_op(struct iommu_group *group,
778				       unsigned long arg)
779{
780	struct eeh_pe *pe;
781	struct vfio_eeh_pe_op op;
782	unsigned long minsz;
783
784	pe = eeh_iommu_group_to_pe(group);
785	if (!pe)
786		return -ENODEV;
787
788	minsz = offsetofend(struct vfio_eeh_pe_op, op);
789	if (copy_from_user(&op, (void __user *)arg, minsz))
790		return -EFAULT;
791	if (op.argsz < minsz || op.flags)
792		return -EINVAL;
793
794	switch (op.op) {
795	case VFIO_EEH_PE_DISABLE:
796		return eeh_pe_set_option(pe, EEH_OPT_DISABLE);
797	case VFIO_EEH_PE_ENABLE:
798		return eeh_pe_set_option(pe, EEH_OPT_ENABLE);
799	case VFIO_EEH_PE_UNFREEZE_IO:
800		return eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO);
801	case VFIO_EEH_PE_UNFREEZE_DMA:
802		return eeh_pe_set_option(pe, EEH_OPT_THAW_DMA);
803	case VFIO_EEH_PE_GET_STATE:
804		return eeh_pe_get_state(pe);
805		break;
806	case VFIO_EEH_PE_RESET_DEACTIVATE:
807		return eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true);
808	case VFIO_EEH_PE_RESET_HOT:
809		return eeh_pe_reset(pe, EEH_RESET_HOT, true);
810	case VFIO_EEH_PE_RESET_FUNDAMENTAL:
811		return eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true);
812	case VFIO_EEH_PE_CONFIGURE:
813		return eeh_pe_configure(pe);
814	case VFIO_EEH_PE_INJECT_ERR:
815		minsz = offsetofend(struct vfio_eeh_pe_op, err.mask);
816		if (op.argsz < minsz)
817			return -EINVAL;
818		if (copy_from_user(&op, (void __user *)arg, minsz))
819			return -EFAULT;
820
821		return eeh_pe_inject_err(pe, op.err.type, op.err.func,
822					 op.err.addr, op.err.mask);
823	default:
824		return -EINVAL;
825	}
826}
827
/*
 * Main ioctl dispatcher for the SPAPR TCE container. Handles the
 * common VFIO ioctls plus the SPAPR-specific memory preregistration
 * and dynamic DMA window calls. All mm-touching paths are restricted
 * to the mm the container was bound to.
 */
static long tce_iommu_ioctl(void *iommu_data,
				 unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			return 1;
		case VFIO_EEH:
			return eeh_enabled();
		default:
			return 0;
		}
	}

	/*
	 * Sanity check to prevent one userspace from manipulating
	 * another userspace mm.
	 */
	BUG_ON(!container);
	if (container->mm && container->mm != current->mm)
		return -EPERM;

	switch (cmd) {
	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		/* Report the first group's 32bit window and DDW properties */
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		/* Dynamic DMA windows are a v2-only feature */
		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		/* Copy the DDW part only if userspace provided room for it */
		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		/* Lazily create the default window on first map */
		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		/* Size and vaddr must be aligned to the window's page size */
		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		/* Lazily create the default window so the lookup can succeed */
		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		/* Memory preregistration is a v2-only concept */
		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		/* Bind the container to current->mm before pinning */
		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		/* Nothing can have been registered without an mm */
		if (!container->mm)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		/* Explicit enable/disable exists on the v1 API only */
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;


	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		/* Apply the EEH operation to every attached group's PE */
		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_ioctl_eeh_pe_op(tcegrp->grp, arg);
			if (ret)
				return ret;
		}
		return ret;
	}

	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		/* The default window must exist before any additional one */
		ret = tce_iommu_create_default_window(container);
		if (!ret)
			ret = tce_iommu_create_window(container,
					create.page_shift,
					create.window_size, create.levels,
					&create.start_addr);

		mutex_unlock(&container->lock);

		/* Return the assigned window start address to userspace */
		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		/*
		 * Removing the not-yet-created default window just cancels
		 * its pending creation.
		 */
		if (container->def_window_pending && !remove.start_addr) {
			container->def_window_pending = false;
			return 0;
		}

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}
1189
1190static void tce_iommu_release_ownership(struct tce_container *container,
1191		struct iommu_table_group *table_group)
1192{
1193	long i;
1194
1195	if (!table_group->ops->unset_window) {
1196		WARN_ON_ONCE(1);
1197		return;
1198	}
1199
1200	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1201		if (container->tables[i])
1202			table_group->ops->unset_window(table_group, i);
1203}
1204
1205static long tce_iommu_take_ownership(struct tce_container *container,
1206		struct iommu_table_group *table_group)
1207{
1208	long i, ret = 0;
1209
1210	/* Set all windows to the new group */
1211	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1212		struct iommu_table *tbl = container->tables[i];
1213
1214		if (!tbl)
1215			continue;
1216
1217		ret = table_group->ops->set_window(table_group, i, tbl);
1218		if (ret)
1219			goto release_exit;
1220	}
1221
1222	return 0;
1223
1224release_exit:
1225	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1226		table_group->ops->unset_window(table_group, i);
1227
1228	return ret;
1229}
1230
1231static int tce_iommu_attach_group(void *iommu_data,
1232		struct iommu_group *iommu_group, enum vfio_group_type type)
1233{
1234	int ret = 0;
1235	struct tce_container *container = iommu_data;
1236	struct iommu_table_group *table_group;
1237	struct tce_iommu_group *tcegrp = NULL;
1238
1239	if (type == VFIO_EMULATED_IOMMU)
1240		return -EINVAL;
1241
1242	mutex_lock(&container->lock);
1243
1244	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
1245			iommu_group_id(iommu_group), iommu_group); */
1246	table_group = iommu_group_get_iommudata(iommu_group);
1247	if (!table_group) {
1248		ret = -ENODEV;
1249		goto unlock_exit;
1250	}
1251
1252	/* v2 requires full support of dynamic DMA windows */
1253	if (container->v2 && table_group->max_dynamic_windows_supported == 0) {
1254		ret = -EINVAL;
1255		goto unlock_exit;
1256	}
1257
1258	/* v1 reuses TCE tables and does not share them among PEs */
1259	if (!container->v2 && tce_groups_attached(container)) {
1260		ret = -EBUSY;
1261		goto unlock_exit;
1262	}
1263
1264	/*
1265	 * Check if new group has the same iommu_table_group_ops
1266	 * (i.e. compatible)
1267	 */
1268	list_for_each_entry(tcegrp, &container->group_list, next) {
1269		struct iommu_table_group *table_group_tmp;
1270
1271		if (tcegrp->grp == iommu_group) {
1272			pr_warn("tce_vfio: Group %d is already attached\n",
1273					iommu_group_id(iommu_group));
1274			ret = -EBUSY;
1275			goto unlock_exit;
1276		}
1277		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
1278		if (table_group_tmp->ops->create_table !=
1279				table_group->ops->create_table) {
1280			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
1281					iommu_group_id(iommu_group),
1282					iommu_group_id(tcegrp->grp));
1283			ret = -EPERM;
1284			goto unlock_exit;
1285		}
1286	}
1287
1288	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
1289	if (!tcegrp) {
1290		ret = -ENOMEM;
1291		goto unlock_exit;
1292	}
1293
1294	ret = tce_iommu_take_ownership(container, table_group);
1295	if (!tce_groups_attached(container) && !container->tables[0])
1296		container->def_window_pending = true;
1297
1298	if (!ret) {
1299		tcegrp->grp = iommu_group;
1300		list_add(&tcegrp->next, &container->group_list);
1301	}
1302
1303	if (ret && tcegrp)
1304		kfree(tcegrp);
1305
1306unlock_exit:
1307	mutex_unlock(&container->lock);
1308
1309	return ret;
1310}
1311
1312static void tce_iommu_detach_group(void *iommu_data,
1313		struct iommu_group *iommu_group)
1314{
1315	struct tce_container *container = iommu_data;
1316	struct iommu_table_group *table_group;
1317	bool found = false;
1318	struct tce_iommu_group *tcegrp;
1319
1320	mutex_lock(&container->lock);
1321
1322	list_for_each_entry(tcegrp, &container->group_list, next) {
1323		if (tcegrp->grp == iommu_group) {
1324			found = true;
1325			break;
1326		}
1327	}
1328
1329	if (!found) {
1330		pr_warn("tce_vfio: detaching unattached group #%u\n",
1331				iommu_group_id(iommu_group));
1332		goto unlock_exit;
1333	}
1334
1335	list_del(&tcegrp->next);
1336	kfree(tcegrp);
1337
1338	table_group = iommu_group_get_iommudata(iommu_group);
1339	BUG_ON(!table_group);
1340
1341	tce_iommu_release_ownership(container, table_group);
1342
1343unlock_exit:
1344	mutex_unlock(&container->lock);
1345}
1346
/* Callbacks registered with the VFIO core for the SPAPR TCE backend */
static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};
1356
/* Module init: register this driver as a VFIO IOMMU backend */
static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}
1361
/* Module exit: unregister the VFIO IOMMU backend */
static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}
1366
module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

/* Module metadata; DRIVER_* macros are defined at the top of this file */
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
1374
1375