// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top of the data structure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
 *
 * The data structure uses iopt_pages to optimize the storage of the PFNs
 * between the domains and the xarray.
 */
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/iommu.h>
#include <linux/sched/mm.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/errno.h>

#include "io_pagetable.h"
#include "double_span.h"

struct iopt_pages_list {
	struct iopt_pages *pages;
	struct iopt_area *area;
	struct list_head next;
	unsigned long start_byte;
	unsigned long length;
};

struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
					struct io_pagetable *iopt,
					unsigned long iova,
					unsigned long last_iova)
{
	lockdep_assert_held(&iopt->iova_rwsem);

	iter->cur_iova = iova;
	iter->last_iova = last_iova;
	iter->area = iopt_area_iter_first(iopt, iova, iova);
	if (!iter->area)
		return NULL;
	if (!iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
	unsigned long last_iova;

	if (!iter->area)
		return NULL;
	last_iova = iopt_area_last_iova(iter->area);
	if (iter->last_iova <= last_iova)
		return NULL;

	iter->cur_iova = last_iova + 1;
	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
					 iter->last_iova);
	if (!iter->area)
		return NULL;
	if (iter->cur_iova != iopt_area_iova(iter->area) ||
	    !iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}
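
/*
 * Illustrative sketch (hypothetical caller, not part of this file): walking a
 * range with the contiguous-area iterator. This mirrors the pattern used by
 * iopt_get_pages() below; locking and error handling stay with the caller and
 * process() is a placeholder for whatever the caller does per area.
 *
 *	down_read(&iopt->iova_rwsem);
 *	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
 *		unsigned long last = min(last_iova, iopt_area_last_iova(area));
 *
 *		process(area, iter.cur_iova, last);
 *	}
 *	if (!iopt_area_contig_done(&iter))
 *		rc = -ENOENT;
 *	up_read(&iopt->iova_rwsem);
 */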

static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_used || span->last_hole - span->start_hole < length - 1)
		return false;

	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
			   page_offset;
	if (span->start_hole > span->last_hole ||
	    span->last_hole - span->start_hole < length - 1)
		return false;
	return true;
}

static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_hole || span->last_used - span->start_used < length - 1)
		return false;

	span->start_used = ALIGN(span->start_used, iova_alignment) |
			   page_offset;
	if (span->start_used > span->last_used ||
	    span->last_used - span->start_used < length - 1)
		return false;
	return true;
}
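
/*
 * In both helpers above the candidate IOVA keeps the same offset within a
 * page as the user pointer: ALIGN() rounds the start of the span up to
 * iova_alignment and then page_offset is OR'd back in. For example
 * (illustrative numbers): with iova_alignment = 0x1000 and page_offset =
 * 0x800, a hole starting at 0x20000 yields the candidate IOVA 0x20800.
 */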

/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long uptr, unsigned long length)
{
	unsigned long page_offset = uptr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep any alignment present in the uptr when building the IOVA; this
	 * increases the chance we can map a THP.
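	 *
	 * For example (illustrative numbers): a 2MiB-aligned uptr with
	 * length = 4MiB gives iova_alignment = min(4MiB, 2MiB) = 2MiB, so the
	 * allocated IOVA is 2MiB aligned and a THP behind the uptr can be
	 * mapped with large IOPTEs when the domain supports them.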
	 */
	if (!uptr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(uptr));

	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}

static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
			   unsigned long length)
{
	unsigned long last;

	lockdep_assert_held(&iopt->iova_rwsem);

	if ((iova & (iopt->iova_alignment - 1)))
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &last))
		return -EOVERFLOW;

	/* No reserved IOVA intersects the range */
	if (iopt_reserved_iter_first(iopt, iova, last))
		return -EINVAL;

	/* Check that there is not already a mapping in the range */
	if (iopt_area_iter_first(iopt, iova, last))
		return -EEXIST;
	return 0;
}

/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

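	/*
	 * pages_node indexes the area into the iopt_pages in units of
	 * PAGE_SIZE. For example (illustrative numbers, 4K pages):
	 * start_byte = 12288 and length = 8192 cover pages 3 and 4, so
	 * pages_node spans [3, 4].
	 */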
	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with a NULL pages indicating it is not fully
	 * initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}

static struct iopt_area *iopt_area_alloc(void)
{
	struct iopt_area *area;

	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
	if (!area)
		return NULL;
	RB_CLEAR_NODE(&area->node.rb);
	RB_CLEAR_NODE(&area->pages_node.rb);
	return area;
}

static int iopt_alloc_area_pages(struct io_pagetable *iopt,
				 struct list_head *pages_list,
				 unsigned long length, unsigned long *dst_iova,
				 int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	unsigned long iova;
	int rc = 0;

	list_for_each_entry(elm, pages_list, next) {
		elm->area = iopt_area_alloc();
		if (!elm->area)
			return -ENOMEM;
	}

	down_write(&iopt->iova_rwsem);
	if ((length & (iopt->iova_alignment - 1)) || !length) {
		rc = -EINVAL;
		goto out_unlock;
	}

	if (flags & IOPT_ALLOC_IOVA) {
		/* Use the first entry to guess the ideal IOVA alignment */
		elm = list_first_entry(pages_list, struct iopt_pages_list,
				       next);
		rc = iopt_alloc_iova(
			iopt, dst_iova,
			(uintptr_t)elm->pages->uptr + elm->start_byte, length);
		if (rc)
			goto out_unlock;
		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
			rc = -EINVAL;
			goto out_unlock;
		}
	} else {
		rc = iopt_check_iova(iopt, *dst_iova, length);
		if (rc)
			goto out_unlock;
	}

	/*
	 * Areas are created with a NULL pages so that the IOVA space is
	 * reserved and we can unlock the iova_rwsem.
	 */
	iova = *dst_iova;
	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
				      elm->start_byte, elm->length, iommu_prot);
		if (rc)
			goto out_unlock;
		iova += elm->length;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	return rc;
}

static void iopt_abort_area(struct iopt_area *area)
{
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		WARN_ON(area->pages);
	if (area->iopt) {
		down_write(&area->iopt->iova_rwsem);
		interval_tree_remove(&area->node, &area->iopt->area_itree);
		up_write(&area->iopt->iova_rwsem);
	}
	kfree(area);
}

void iopt_free_pages_list(struct list_head *pages_list)
{
	struct iopt_pages_list *elm;

	while ((elm = list_first_entry_or_null(pages_list,
					       struct iopt_pages_list, next))) {
		if (elm->area)
			iopt_abort_area(elm->area);
		if (elm->pages)
			iopt_put_pages(elm->pages);
		list_del(&elm->next);
		kfree(elm);
	}
}

static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *undo_elm;
	struct iopt_pages_list *elm;
	int rc;

	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_area_fill_domains(elm->area, elm->pages);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	list_for_each_entry(undo_elm, pages_list, next) {
		if (undo_elm == elm)
			break;
		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
	}
	return rc;
}

int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
		   unsigned long length, unsigned long *dst_iova,
		   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	int rc;

	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
				   iommu_prot, flags);
	if (rc)
		return rc;

	down_read(&iopt->domains_rwsem);
	rc = iopt_fill_domains_pages(pages_list);
	if (rc)
		goto out_unlock_domains;

	down_write(&iopt->iova_rwsem);
	list_for_each_entry(elm, pages_list, next) {
		/*
		 * area->pages must be set inside the domains_rwsem to ensure
		 * any newly added domains will get filled. Moves the reference
		 * in from the list.
		 */
		elm->area->pages = elm->pages;
		elm->pages = NULL;
		elm->area = NULL;
	}
	up_write(&iopt->iova_rwsem);
out_unlock_domains:
	up_read(&iopt->domains_rwsem);
	return rc;
}

/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise is the iova to map to on input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only setup a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
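 *
 * An illustrative (hypothetical) caller, with error handling elided:
 *
 *	unsigned long iova;
 *
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, uptr, length,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	...
 *	rc = iopt_unmap_iova(iopt, iova, length, NULL);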
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, void __user *uptr,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages_list elm = {};
	LIST_HEAD(pages_list);
	int rc;

	elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
	if (IS_ERR(elm.pages))
		return PTR_ERR(elm.pages);
	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
	elm.start_byte = uptr - elm.pages->uptr;
	elm.length = length;
	list_add(&elm.next, &pages_list);

	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
	if (rc) {
		if (elm.area)
			iopt_abort_area(elm.area);
		if (elm.pages)
			iopt_put_pages(elm.pages);
		return rc;
	}
	return 0;
}

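/*
 * Build a list of iopt_pages slices covering the IOVA range. Every element in
 * the returned pages_list holds a reference on its iopt_pages; the caller
 * releases the list with iopt_free_pages_list(). The range must be fully
 * covered by existing areas or -ENOENT is returned.
 */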
int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
		   unsigned long length, struct list_head *pages_list)
{
	struct iopt_area_contig_iter iter;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		struct iopt_pages_list *elm;
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
		if (!elm) {
			rc = -ENOMEM;
			goto err_free;
		}
		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
		elm->pages = area->pages;
		elm->length = (last - iter.cur_iova) + 1;
		kref_get(&elm->pages->kref);
		list_add_tail(&elm->next, pages_list);
	}
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_free;
	}
	up_read(&iopt->iova_rwsem);
	return 0;
err_free:
	up_read(&iopt->iova_rwsem);
	iopt_free_pages_list(pages_list);
	return rc;
}

static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	int rc = -ENOENT;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmap of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
			area->prevent_access = true;
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100))
				return -EDEADLOCK;
			goto again;
		}

		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);
	}
	if (unmapped_bytes)
		rc = 0;

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}

/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
		    unsigned long length, unsigned long *unmapped)
{
	unsigned long iova_last;

	if (!length)
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &iova_last))
		return -EOVERFLOW;

	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}

int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	int rc;

	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
	/* If the IOVAs are empty then unmap all succeeds */
	if (rc == -ENOENT)
		return 0;
	return rc;
}

/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova(struct io_pagetable *iopt,
			struct rb_root_cached *allowed_iova)
{
	struct iopt_allowed *allowed;

	down_write(&iopt->iova_rwsem);
	swap(*allowed_iova, iopt->allowed_itree);

	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
		if (iopt_reserved_iter_first(iopt, allowed->node.start,
					     allowed->node.last)) {
			swap(*allowed_iova, iopt->allowed_itree);
			up_write(&iopt->iova_rwsem);
			return -EADDRINUSE;
		}
	}
	up_write(&iopt->iova_rwsem);
	return 0;
}

int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
		      unsigned long last, void *owner)
{
	struct iopt_reserved *reserved;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iopt_area_iter_first(iopt, start, last) ||
	    iopt_allowed_iter_first(iopt, start, last))
		return -EADDRINUSE;

	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
	if (!reserved)
		return -ENOMEM;
	reserved->node.start = start;
	reserved->node.last = last;
	reserved->owner = owner;
	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
	return 0;
}

static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	struct iopt_reserved *reserved, *next;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
	     reserved = next) {
		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

		if (reserved->owner == owner) {
			interval_tree_remove(&reserved->node,
					     &iopt->reserved_itree);
			kfree(reserved);
		}
	}
}

void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}

void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopts start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}

void iopt_destroy_table(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		iopt_remove_reserved_iova(iopt, NULL);

	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
						ULONG_MAX))) {
		interval_tree_remove(node, &iopt->allowed_itree);
		kfree(container_of(node, struct iopt_allowed, node));
	}

	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
	WARN_ON(!xa_empty(&iopt->domains));
	WARN_ON(!xa_empty(&iopt->access_list));
	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}

/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from
 * the domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is still holding all the PFNs; rapidly unmap this
	 * domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			mutex_unlock(&pages->mutex);

			iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
}

/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			mutex_unlock(&pages->mutex);
			goto out_unfill;
		}
		if (!area->storage_domain) {
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
		mutex_unlock(&pages->mutex);
	}
	return 0;

out_unfill:
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}

/* All existing areas conform to an increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
				     unsigned long new_iova_alignment)
{
	unsigned long align_mask = new_iova_alignment - 1;
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
		if ((iopt_area_iova(area) & align_mask) ||
		    (iopt_area_length(area) & align_mask) ||
		    (area->page_offset & align_mask))
			return -EADDRINUSE;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
		struct iommufd_access *access;
		unsigned long index;

		xa_for_each(&iopt->access_list, index, access)
			if (WARN_ON(access->iova_alignment >
				    new_iova_alignment))
				return -EADDRINUSE;
	}
	return 0;
}

int iopt_table_add_domain(struct io_pagetable *iopt,
			  struct iommu_domain *domain)
{
	const struct iommu_domain_geometry *geometry = &domain->geometry;
	struct iommu_domain *iter_domain;
	unsigned int new_iova_alignment;
	unsigned long index;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain) {
		if (WARN_ON(iter_domain == domain)) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}

	/*
	 * The io page size drives the iova_alignment. Internally the iopt_pages
	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
	 * objects into the iommu_domain.
	 *
	 * An iommu_domain must always be able to accept PAGE_SIZE to be
	 * compatible as we can't guarantee higher contiguity.
	 */
	new_iova_alignment = max_t(unsigned long,
				   1UL << __ffs(domain->pgsize_bitmap),
				   iopt->iova_alignment);
	if (new_iova_alignment > PAGE_SIZE) {
		rc = -EINVAL;
		goto out_unlock;
	}
	if (new_iova_alignment != iopt->iova_alignment) {
		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			goto out_unlock;
	}

	/* No area exists that is outside the allowed domain aperture */
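	/*
	 * For example (hypothetical geometry): an aperture of
	 * [0x1000, 0xffffffff] reserves [0, 0xfff] and
	 * [0x100000000, ULONG_MAX] with the domain as owner.
	 */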
	if (geometry->aperture_start != 0) {
		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
				       domain);
		if (rc)
			goto out_reserved;
	}
	if (geometry->aperture_end != ULONG_MAX) {
		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
				       ULONG_MAX, domain);
		if (rc)
			goto out_reserved;
	}

	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
	if (rc)
		goto out_reserved;

	rc = iopt_fill_domain(iopt, domain);
	if (rc)
		goto out_release;

	iopt->iova_alignment = new_iova_alignment;
	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
	iopt->next_domain_id++;
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return 0;
out_release:
	xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
	__iopt_remove_reserved_iova(iopt, domain);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
{
	unsigned long new_iova_alignment;
	struct iommufd_access *access;
	struct iommu_domain *domain;
	unsigned long index;

	lockdep_assert_held_write(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	/* See batch_iommu_map_small() */
	if (iopt->disable_large_pages)
		new_iova_alignment = PAGE_SIZE;
	else
		new_iova_alignment = 1;

	xa_for_each(&iopt->domains, index, domain)
		new_iova_alignment = max_t(unsigned long,
					   1UL << __ffs(domain->pgsize_bitmap),
					   new_iova_alignment);
	xa_for_each(&iopt->access_list, index, access)
		new_iova_alignment = max_t(unsigned long,
					   access->iova_alignment,
					   new_iova_alignment);

	if (new_iova_alignment > iopt->iova_alignment) {
		int rc;

		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			return rc;
	}
	iopt->iova_alignment = new_iova_alignment;
	return 0;
}

void iopt_table_remove_domain(struct io_pagetable *iopt,
			      struct iommu_domain *domain)
{
	struct iommu_domain *iter_domain = NULL;
	unsigned long index;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain)
		if (iter_domain == domain)
			break;
	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
		goto out_unlock;

	/*
	 * Compress the xarray to keep it linear by swapping the entry to erase
	 * with the tail entry and shrinking the tail.
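	 *
	 * For example (hypothetical contents): with domains [A, B, C] and A
	 * being removed, C is erased from index 2 and stored at index 0,
	 * leaving [C, B] with next_domain_id = 2.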
	 */
	iopt->next_domain_id--;
	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
	if (index != iopt->next_domain_id)
		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

	iopt_unfill_domain(iopt, domain);
	__iopt_remove_reserved_iova(iopt, domain);

	WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/**
 * iopt_area_split - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last of a new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
	unsigned long alignment = area->iopt->iova_alignment;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned long start_iova = iopt_area_iova(area);
	unsigned long new_start = iova + 1;
	struct io_pagetable *iopt = area->iopt;
	struct iopt_pages *pages = area->pages;
	struct iopt_area *lhs;
	struct iopt_area *rhs;
	int rc;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iova == start_iova || iova == last_iova)
		return 0;

	if (!pages || area->prevent_access)
		return -EBUSY;

	if (new_start & (alignment - 1) ||
	    iopt_area_start_byte(area, new_start) & (alignment - 1))
		return -EINVAL;

	lhs = iopt_area_alloc();
	if (!lhs)
		return -ENOMEM;

	rhs = iopt_area_alloc();
	if (!rhs) {
		rc = -ENOMEM;
		goto err_free_lhs;
	}

	mutex_lock(&pages->mutex);
	/*
	 * Splitting is not permitted if an access exists; we don't track
	 * enough information to split existing accesses.
	 */
	if (area->num_accesses) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/*
	 * Splitting is not permitted if a domain could have been mapped with
	 * huge pages.
	 */
	if (area->storage_domain && !iopt->disable_large_pages) {
		rc = -EINVAL;
		goto err_unlock;
	}

	interval_tree_remove(&area->node, &iopt->area_itree);
	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
			      iopt_area_start_byte(area, start_iova),
			      (new_start - 1) - start_iova + 1,
			      area->iommu_prot);
	if (WARN_ON(rc))
		goto err_insert;

	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
			      iopt_area_start_byte(area, new_start),
			      last_iova - new_start + 1, area->iommu_prot);
	if (WARN_ON(rc))
		goto err_remove_lhs;

	/*
	 * If the original area has filled a domain, domains_itree has to be
	 * updated.
	 */
	if (area->storage_domain) {
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
	}

	lhs->storage_domain = area->storage_domain;
	lhs->pages = area->pages;
	rhs->storage_domain = area->storage_domain;
	rhs->pages = area->pages;
	kref_get(&rhs->pages->kref);
	kfree(area);
	mutex_unlock(&pages->mutex);

	/*
	 * No change to domains or accesses because the iopt_pages has not
	 * changed.
	 */
	return 0;

err_remove_lhs:
	interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
	interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
	mutex_unlock(&pages->mutex);
	kfree(rhs);
err_free_lhs:
	kfree(lhs);
	return rc;
}

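/*
 * Split any area containing one of @iovas at that iova. Part of the VFIO
 * compatibility support: after the cut, each listed iova is the last iova of
 * an area, so a hole can later be punched with an unmap ending exactly there.
 * IOVAs that do not fall inside an existing area are skipped.
 */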
int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
		  size_t num_iovas)
{
	int rc = 0;
	int i;

	down_write(&iopt->iova_rwsem);
	for (i = 0; i < num_iovas; i++) {
		struct iopt_area *area;

		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
		if (!area)
			continue;
		rc = iopt_area_split(area, iovas[i]);
		if (rc)
			break;
	}
	up_write(&iopt->iova_rwsem);
	return rc;
}

void iopt_enable_large_pages(struct io_pagetable *iopt)
{
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WRITE_ONCE(iopt->disable_large_pages, false);
	rc = iopt_calculate_iova_alignment(iopt);
	WARN_ON(rc);
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

int iopt_disable_large_pages(struct io_pagetable *iopt)
{
	int rc = 0;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	if (iopt->disable_large_pages)
		goto out_unlock;

	/* Won't do it if domains already have pages mapped in them */
	if (!xa_empty(&iopt->domains) &&
	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
		rc = -EINVAL;
		goto out_unlock;
	}

	WRITE_ONCE(iopt->disable_large_pages, true);
	rc = iopt_calculate_iova_alignment(iopt);
	if (rc)
		WRITE_ONCE(iopt->disable_large_pages, false);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
{
	u32 new_id;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
		      GFP_KERNEL_ACCOUNT);

	if (rc)
		goto out_unlock;

	rc = iopt_calculate_iova_alignment(iopt);
	if (rc) {
		xa_erase(&iopt->access_list, new_id);
		goto out_unlock;
	}
	access->iopt_access_list_id = new_id;

out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

void iopt_remove_access(struct io_pagetable *iopt,
			struct iommufd_access *access,
			u32 iopt_access_list_id)
{
	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
	WARN_ON(iopt_calculate_iova_alignment(iopt));
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/* Narrow the valid_iova_itree to include reserved ranges from a device. */
int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
					struct device *dev,
					phys_addr_t *sw_msi_start)
{
	struct iommu_resv_region *resv;
	LIST_HEAD(resv_regions);
	unsigned int num_hw_msi = 0;
	unsigned int num_sw_msi = 0;
	int rc;

	if (iommufd_should_fail())
		return -EINVAL;

	down_write(&iopt->iova_rwsem);
	/* FIXME: drivers allocate memory but there is no failure propagated */
	iommu_get_resv_regions(dev, &resv_regions);

	list_for_each_entry(resv, &resv_regions, list) {
		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
			num_hw_msi++;
		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
			*sw_msi_start = resv->start;
			num_sw_msi++;
		}

		rc = iopt_reserve_iova(iopt, resv->start,
				       resv->length - 1 + resv->start, dev);
		if (rc)
			goto out_reserved;
	}

	/* Drivers must offer sane combinations of regions */
	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
		rc = -EINVAL;
		goto out_reserved;
	}

	rc = 0;
	goto out_free_resv;

out_reserved:
	__iopt_remove_reserved_iova(iopt, dev);
out_free_resv:
	iommu_put_resv_regions(dev, &resv_regions);
	up_write(&iopt->iova_rwsem);
	return rc;
}