162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */ 262306a36Sopenharmony_ci/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. 362306a36Sopenharmony_ci * 462306a36Sopenharmony_ci */ 562306a36Sopenharmony_ci#ifndef __IO_PAGETABLE_H 662306a36Sopenharmony_ci#define __IO_PAGETABLE_H 762306a36Sopenharmony_ci 862306a36Sopenharmony_ci#include <linux/interval_tree.h> 962306a36Sopenharmony_ci#include <linux/mutex.h> 1062306a36Sopenharmony_ci#include <linux/kref.h> 1162306a36Sopenharmony_ci#include <linux/xarray.h> 1262306a36Sopenharmony_ci 1362306a36Sopenharmony_ci#include "iommufd_private.h" 1462306a36Sopenharmony_ci 1562306a36Sopenharmony_cistruct iommu_domain; 1662306a36Sopenharmony_ci 1762306a36Sopenharmony_ci/* 1862306a36Sopenharmony_ci * Each io_pagetable is composed of intervals of areas which cover regions of 1962306a36Sopenharmony_ci * the iova that are backed by something. iova not covered by areas is not 2062306a36Sopenharmony_ci * populated in the page table. Each area is fully populated with pages. 2162306a36Sopenharmony_ci * 2262306a36Sopenharmony_ci * iovas are in byte units, but must be iopt->iova_alignment aligned. 2362306a36Sopenharmony_ci * 2462306a36Sopenharmony_ci * pages can be NULL, this means some other thread is still working on setting 2562306a36Sopenharmony_ci * up or tearing down the area. When observed under the write side of the 2662306a36Sopenharmony_ci * domain_rwsem a NULL pages must mean the area is still being setup and no 2762306a36Sopenharmony_ci * domains are filled. 2862306a36Sopenharmony_ci * 2962306a36Sopenharmony_ci * storage_domain points at an arbitrary iommu_domain that is holding the PFNs 3062306a36Sopenharmony_ci * for this area. It is locked by the pages->mutex. This simplifies the locking 3162306a36Sopenharmony_ci * as the pages code can rely on the storage_domain without having to get the 3262306a36Sopenharmony_ci * iopt->domains_rwsem. 3362306a36Sopenharmony_ci * 3462306a36Sopenharmony_ci * The io_pagetable::iova_rwsem protects node 3562306a36Sopenharmony_ci * The iopt_pages::mutex protects pages_node 3662306a36Sopenharmony_ci * iopt and iommu_prot are immutable 3762306a36Sopenharmony_ci * The pages::mutex protects num_accesses 3862306a36Sopenharmony_ci */ 3962306a36Sopenharmony_cistruct iopt_area { 4062306a36Sopenharmony_ci struct interval_tree_node node; 4162306a36Sopenharmony_ci struct interval_tree_node pages_node; 4262306a36Sopenharmony_ci struct io_pagetable *iopt; 4362306a36Sopenharmony_ci struct iopt_pages *pages; 4462306a36Sopenharmony_ci struct iommu_domain *storage_domain; 4562306a36Sopenharmony_ci /* How many bytes into the first page the area starts */ 4662306a36Sopenharmony_ci unsigned int page_offset; 4762306a36Sopenharmony_ci /* IOMMU_READ, IOMMU_WRITE, etc */ 4862306a36Sopenharmony_ci int iommu_prot; 4962306a36Sopenharmony_ci bool prevent_access : 1; 5062306a36Sopenharmony_ci unsigned int num_accesses; 5162306a36Sopenharmony_ci}; 5262306a36Sopenharmony_ci 5362306a36Sopenharmony_cistruct iopt_allowed { 5462306a36Sopenharmony_ci struct interval_tree_node node; 5562306a36Sopenharmony_ci}; 5662306a36Sopenharmony_ci 5762306a36Sopenharmony_cistruct iopt_reserved { 5862306a36Sopenharmony_ci struct interval_tree_node node; 5962306a36Sopenharmony_ci void *owner; 6062306a36Sopenharmony_ci}; 6162306a36Sopenharmony_ci 6262306a36Sopenharmony_ciint iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages); 6362306a36Sopenharmony_civoid iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages); 6462306a36Sopenharmony_ci 6562306a36Sopenharmony_ciint iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain); 6662306a36Sopenharmony_civoid iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, 6762306a36Sopenharmony_ci struct iommu_domain *domain); 6862306a36Sopenharmony_civoid iopt_area_unmap_domain(struct iopt_area *area, 6962306a36Sopenharmony_ci struct iommu_domain *domain); 7062306a36Sopenharmony_ci 7162306a36Sopenharmony_cistatic inline unsigned long iopt_area_index(struct iopt_area *area) 7262306a36Sopenharmony_ci{ 7362306a36Sopenharmony_ci return area->pages_node.start; 7462306a36Sopenharmony_ci} 7562306a36Sopenharmony_ci 7662306a36Sopenharmony_cistatic inline unsigned long iopt_area_last_index(struct iopt_area *area) 7762306a36Sopenharmony_ci{ 7862306a36Sopenharmony_ci return area->pages_node.last; 7962306a36Sopenharmony_ci} 8062306a36Sopenharmony_ci 8162306a36Sopenharmony_cistatic inline unsigned long iopt_area_iova(struct iopt_area *area) 8262306a36Sopenharmony_ci{ 8362306a36Sopenharmony_ci return area->node.start; 8462306a36Sopenharmony_ci} 8562306a36Sopenharmony_ci 8662306a36Sopenharmony_cistatic inline unsigned long iopt_area_last_iova(struct iopt_area *area) 8762306a36Sopenharmony_ci{ 8862306a36Sopenharmony_ci return area->node.last; 8962306a36Sopenharmony_ci} 9062306a36Sopenharmony_ci 9162306a36Sopenharmony_cistatic inline size_t iopt_area_length(struct iopt_area *area) 9262306a36Sopenharmony_ci{ 9362306a36Sopenharmony_ci return (area->node.last - area->node.start) + 1; 9462306a36Sopenharmony_ci} 9562306a36Sopenharmony_ci 9662306a36Sopenharmony_ci/* 9762306a36Sopenharmony_ci * Number of bytes from the start of the iopt_pages that the iova begins. 9862306a36Sopenharmony_ci * iopt_area_start_byte() / PAGE_SIZE encodes the starting page index 9962306a36Sopenharmony_ci * iopt_area_start_byte() % PAGE_SIZE encodes the offset within that page 10062306a36Sopenharmony_ci */ 10162306a36Sopenharmony_cistatic inline unsigned long iopt_area_start_byte(struct iopt_area *area, 10262306a36Sopenharmony_ci unsigned long iova) 10362306a36Sopenharmony_ci{ 10462306a36Sopenharmony_ci if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) 10562306a36Sopenharmony_ci WARN_ON(iova < iopt_area_iova(area) || 10662306a36Sopenharmony_ci iova > iopt_area_last_iova(area)); 10762306a36Sopenharmony_ci return (iova - iopt_area_iova(area)) + area->page_offset + 10862306a36Sopenharmony_ci iopt_area_index(area) * PAGE_SIZE; 10962306a36Sopenharmony_ci} 11062306a36Sopenharmony_ci 11162306a36Sopenharmony_cistatic inline unsigned long iopt_area_iova_to_index(struct iopt_area *area, 11262306a36Sopenharmony_ci unsigned long iova) 11362306a36Sopenharmony_ci{ 11462306a36Sopenharmony_ci return iopt_area_start_byte(area, iova) / PAGE_SIZE; 11562306a36Sopenharmony_ci} 11662306a36Sopenharmony_ci 11762306a36Sopenharmony_ci#define __make_iopt_iter(name) \ 11862306a36Sopenharmony_ci static inline struct iopt_##name *iopt_##name##_iter_first( \ 11962306a36Sopenharmony_ci struct io_pagetable *iopt, unsigned long start, \ 12062306a36Sopenharmony_ci unsigned long last) \ 12162306a36Sopenharmony_ci { \ 12262306a36Sopenharmony_ci struct interval_tree_node *node; \ 12362306a36Sopenharmony_ci \ 12462306a36Sopenharmony_ci lockdep_assert_held(&iopt->iova_rwsem); \ 12562306a36Sopenharmony_ci node = interval_tree_iter_first(&iopt->name##_itree, start, \ 12662306a36Sopenharmony_ci last); \ 12762306a36Sopenharmony_ci if (!node) \ 12862306a36Sopenharmony_ci return NULL; \ 12962306a36Sopenharmony_ci return container_of(node, struct iopt_##name, node); \ 13062306a36Sopenharmony_ci } \ 13162306a36Sopenharmony_ci static inline struct iopt_##name *iopt_##name##_iter_next( \ 13262306a36Sopenharmony_ci struct iopt_##name *last_node, unsigned long start, \ 13362306a36Sopenharmony_ci unsigned long last) \ 13462306a36Sopenharmony_ci { \ 13562306a36Sopenharmony_ci struct interval_tree_node *node; \ 13662306a36Sopenharmony_ci \ 13762306a36Sopenharmony_ci node = interval_tree_iter_next(&last_node->node, start, last); \ 13862306a36Sopenharmony_ci if (!node) \ 13962306a36Sopenharmony_ci return NULL; \ 14062306a36Sopenharmony_ci return container_of(node, struct iopt_##name, node); \ 14162306a36Sopenharmony_ci } 14262306a36Sopenharmony_ci 14362306a36Sopenharmony_ci__make_iopt_iter(area) 14462306a36Sopenharmony_ci__make_iopt_iter(allowed) 14562306a36Sopenharmony_ci__make_iopt_iter(reserved) 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_cistruct iopt_area_contig_iter { 14862306a36Sopenharmony_ci unsigned long cur_iova; 14962306a36Sopenharmony_ci unsigned long last_iova; 15062306a36Sopenharmony_ci struct iopt_area *area; 15162306a36Sopenharmony_ci}; 15262306a36Sopenharmony_cistruct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter, 15362306a36Sopenharmony_ci struct io_pagetable *iopt, 15462306a36Sopenharmony_ci unsigned long iova, 15562306a36Sopenharmony_ci unsigned long last_iova); 15662306a36Sopenharmony_cistruct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter); 15762306a36Sopenharmony_ci 15862306a36Sopenharmony_cistatic inline bool iopt_area_contig_done(struct iopt_area_contig_iter *iter) 15962306a36Sopenharmony_ci{ 16062306a36Sopenharmony_ci return iter->area && iter->last_iova <= iopt_area_last_iova(iter->area); 16162306a36Sopenharmony_ci} 16262306a36Sopenharmony_ci 16362306a36Sopenharmony_ci/* 16462306a36Sopenharmony_ci * Iterate over a contiguous list of areas that span the iova,last_iova range. 16562306a36Sopenharmony_ci * The caller must check iopt_area_contig_done() after the loop to see if 16662306a36Sopenharmony_ci * contiguous areas existed. 16762306a36Sopenharmony_ci */ 16862306a36Sopenharmony_ci#define iopt_for_each_contig_area(iter, area, iopt, iova, last_iova) \ 16962306a36Sopenharmony_ci for (area = iopt_area_contig_init(iter, iopt, iova, last_iova); area; \ 17062306a36Sopenharmony_ci area = iopt_area_contig_next(iter)) 17162306a36Sopenharmony_ci 17262306a36Sopenharmony_cienum { 17362306a36Sopenharmony_ci IOPT_PAGES_ACCOUNT_NONE = 0, 17462306a36Sopenharmony_ci IOPT_PAGES_ACCOUNT_USER = 1, 17562306a36Sopenharmony_ci IOPT_PAGES_ACCOUNT_MM = 2, 17662306a36Sopenharmony_ci}; 17762306a36Sopenharmony_ci 17862306a36Sopenharmony_ci/* 17962306a36Sopenharmony_ci * This holds a pinned page list for multiple areas of IO address space. The 18062306a36Sopenharmony_ci * pages always originate from a linear chunk of userspace VA. Multiple 18162306a36Sopenharmony_ci * io_pagetable's, through their iopt_area's, can share a single iopt_pages 18262306a36Sopenharmony_ci * which avoids multi-pinning and double accounting of page consumption. 18362306a36Sopenharmony_ci * 18462306a36Sopenharmony_ci * indexes in this structure are measured in PAGE_SIZE units, are 0 based from 18562306a36Sopenharmony_ci * the start of the uptr and extend to npages. pages are pinned dynamically 18662306a36Sopenharmony_ci * according to the intervals in the access_itree and domains_itree, npinned 18762306a36Sopenharmony_ci * records the current number of pages pinned. 18862306a36Sopenharmony_ci */ 18962306a36Sopenharmony_cistruct iopt_pages { 19062306a36Sopenharmony_ci struct kref kref; 19162306a36Sopenharmony_ci struct mutex mutex; 19262306a36Sopenharmony_ci size_t npages; 19362306a36Sopenharmony_ci size_t npinned; 19462306a36Sopenharmony_ci size_t last_npinned; 19562306a36Sopenharmony_ci struct task_struct *source_task; 19662306a36Sopenharmony_ci struct mm_struct *source_mm; 19762306a36Sopenharmony_ci struct user_struct *source_user; 19862306a36Sopenharmony_ci void __user *uptr; 19962306a36Sopenharmony_ci bool writable:1; 20062306a36Sopenharmony_ci u8 account_mode; 20162306a36Sopenharmony_ci 20262306a36Sopenharmony_ci struct xarray pinned_pfns; 20362306a36Sopenharmony_ci /* Of iopt_pages_access::node */ 20462306a36Sopenharmony_ci struct rb_root_cached access_itree; 20562306a36Sopenharmony_ci /* Of iopt_area::pages_node */ 20662306a36Sopenharmony_ci struct rb_root_cached domains_itree; 20762306a36Sopenharmony_ci}; 20862306a36Sopenharmony_ci 20962306a36Sopenharmony_cistruct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, 21062306a36Sopenharmony_ci bool writable); 21162306a36Sopenharmony_civoid iopt_release_pages(struct kref *kref); 21262306a36Sopenharmony_cistatic inline void iopt_put_pages(struct iopt_pages *pages) 21362306a36Sopenharmony_ci{ 21462306a36Sopenharmony_ci kref_put(&pages->kref, iopt_release_pages); 21562306a36Sopenharmony_ci} 21662306a36Sopenharmony_ci 21762306a36Sopenharmony_civoid iopt_pages_fill_from_xarray(struct iopt_pages *pages, unsigned long start, 21862306a36Sopenharmony_ci unsigned long last, struct page **out_pages); 21962306a36Sopenharmony_ciint iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start, 22062306a36Sopenharmony_ci unsigned long last, struct page **out_pages); 22162306a36Sopenharmony_civoid iopt_pages_unfill_xarray(struct iopt_pages *pages, unsigned long start, 22262306a36Sopenharmony_ci unsigned long last); 22362306a36Sopenharmony_ci 22462306a36Sopenharmony_ciint iopt_area_add_access(struct iopt_area *area, unsigned long start, 22562306a36Sopenharmony_ci unsigned long last, struct page **out_pages, 22662306a36Sopenharmony_ci unsigned int flags); 22762306a36Sopenharmony_civoid iopt_area_remove_access(struct iopt_area *area, unsigned long start, 22862306a36Sopenharmony_ci unsigned long last); 22962306a36Sopenharmony_ciint iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, 23062306a36Sopenharmony_ci void *data, unsigned long length, unsigned int flags); 23162306a36Sopenharmony_ci 23262306a36Sopenharmony_ci/* 23362306a36Sopenharmony_ci * Each interval represents an active iopt_access_pages(), it acts as an 23462306a36Sopenharmony_ci * interval lock that keeps the PFNs pinned and stored in the xarray. 23562306a36Sopenharmony_ci */ 23662306a36Sopenharmony_cistruct iopt_pages_access { 23762306a36Sopenharmony_ci struct interval_tree_node node; 23862306a36Sopenharmony_ci unsigned int users; 23962306a36Sopenharmony_ci}; 24062306a36Sopenharmony_ci 24162306a36Sopenharmony_ci#endif 242