18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * Copyright 2013 Red Hat Inc. 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Authors: Jérôme Glisse <jglisse@redhat.com> 68c2ecf20Sopenharmony_ci */ 78c2ecf20Sopenharmony_ci/* 88c2ecf20Sopenharmony_ci * Refer to include/linux/hmm.h for information about heterogeneous memory 98c2ecf20Sopenharmony_ci * management or HMM for short. 108c2ecf20Sopenharmony_ci */ 118c2ecf20Sopenharmony_ci#include <linux/pagewalk.h> 128c2ecf20Sopenharmony_ci#include <linux/hmm.h> 138c2ecf20Sopenharmony_ci#include <linux/init.h> 148c2ecf20Sopenharmony_ci#include <linux/rmap.h> 158c2ecf20Sopenharmony_ci#include <linux/swap.h> 168c2ecf20Sopenharmony_ci#include <linux/slab.h> 178c2ecf20Sopenharmony_ci#include <linux/sched.h> 188c2ecf20Sopenharmony_ci#include <linux/mmzone.h> 198c2ecf20Sopenharmony_ci#include <linux/pagemap.h> 208c2ecf20Sopenharmony_ci#include <linux/swapops.h> 218c2ecf20Sopenharmony_ci#include <linux/hugetlb.h> 228c2ecf20Sopenharmony_ci#include <linux/memremap.h> 238c2ecf20Sopenharmony_ci#include <linux/sched/mm.h> 248c2ecf20Sopenharmony_ci#include <linux/jump_label.h> 258c2ecf20Sopenharmony_ci#include <linux/dma-mapping.h> 268c2ecf20Sopenharmony_ci#include <linux/mmu_notifier.h> 278c2ecf20Sopenharmony_ci#include <linux/memory_hotplug.h> 288c2ecf20Sopenharmony_ci 298c2ecf20Sopenharmony_cistruct hmm_vma_walk { 308c2ecf20Sopenharmony_ci struct hmm_range *range; 318c2ecf20Sopenharmony_ci unsigned long last; 328c2ecf20Sopenharmony_ci}; 338c2ecf20Sopenharmony_ci 348c2ecf20Sopenharmony_cienum { 358c2ecf20Sopenharmony_ci HMM_NEED_FAULT = 1 << 0, 368c2ecf20Sopenharmony_ci HMM_NEED_WRITE_FAULT = 1 << 1, 378c2ecf20Sopenharmony_ci HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT, 388c2ecf20Sopenharmony_ci}; 398c2ecf20Sopenharmony_ci 408c2ecf20Sopenharmony_cistatic int hmm_pfns_fill(unsigned long addr, unsigned long end, 418c2ecf20Sopenharmony_ci struct hmm_range *range, unsigned long cpu_flags) 428c2ecf20Sopenharmony_ci{ 438c2ecf20Sopenharmony_ci unsigned long i = (addr - range->start) >> PAGE_SHIFT; 448c2ecf20Sopenharmony_ci 458c2ecf20Sopenharmony_ci for (; addr < end; addr += PAGE_SIZE, i++) 468c2ecf20Sopenharmony_ci range->hmm_pfns[i] = cpu_flags; 478c2ecf20Sopenharmony_ci return 0; 488c2ecf20Sopenharmony_ci} 498c2ecf20Sopenharmony_ci 508c2ecf20Sopenharmony_ci/* 518c2ecf20Sopenharmony_ci * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s) 528c2ecf20Sopenharmony_ci * @addr: range virtual start address (inclusive) 538c2ecf20Sopenharmony_ci * @end: range virtual end address (exclusive) 548c2ecf20Sopenharmony_ci * @required_fault: HMM_NEED_* flags 558c2ecf20Sopenharmony_ci * @walk: mm_walk structure 568c2ecf20Sopenharmony_ci * Return: -EBUSY after page fault, or page fault error 578c2ecf20Sopenharmony_ci * 588c2ecf20Sopenharmony_ci * This function will be called whenever pmd_none() or pte_none() returns true, 598c2ecf20Sopenharmony_ci * or whenever there is no page directory covering the virtual address range. 608c2ecf20Sopenharmony_ci */ 618c2ecf20Sopenharmony_cistatic int hmm_vma_fault(unsigned long addr, unsigned long end, 628c2ecf20Sopenharmony_ci unsigned int required_fault, struct mm_walk *walk) 638c2ecf20Sopenharmony_ci{ 648c2ecf20Sopenharmony_ci struct hmm_vma_walk *hmm_vma_walk = walk->private; 658c2ecf20Sopenharmony_ci struct vm_area_struct *vma = walk->vma; 668c2ecf20Sopenharmony_ci unsigned int fault_flags = FAULT_FLAG_REMOTE; 678c2ecf20Sopenharmony_ci 688c2ecf20Sopenharmony_ci WARN_ON_ONCE(!required_fault); 698c2ecf20Sopenharmony_ci hmm_vma_walk->last = addr; 708c2ecf20Sopenharmony_ci 718c2ecf20Sopenharmony_ci if (required_fault & HMM_NEED_WRITE_FAULT) { 728c2ecf20Sopenharmony_ci if (!(vma->vm_flags & VM_WRITE)) 738c2ecf20Sopenharmony_ci return -EPERM; 748c2ecf20Sopenharmony_ci fault_flags |= FAULT_FLAG_WRITE; 758c2ecf20Sopenharmony_ci } 768c2ecf20Sopenharmony_ci 778c2ecf20Sopenharmony_ci for (; addr < end; addr += PAGE_SIZE) 788c2ecf20Sopenharmony_ci if (handle_mm_fault(vma, addr, fault_flags, NULL) & 798c2ecf20Sopenharmony_ci VM_FAULT_ERROR) 808c2ecf20Sopenharmony_ci return -EFAULT; 818c2ecf20Sopenharmony_ci return -EBUSY; 828c2ecf20Sopenharmony_ci} 838c2ecf20Sopenharmony_ci 848c2ecf20Sopenharmony_cistatic unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, 858c2ecf20Sopenharmony_ci unsigned long pfn_req_flags, 868c2ecf20Sopenharmony_ci unsigned long cpu_flags) 878c2ecf20Sopenharmony_ci{ 888c2ecf20Sopenharmony_ci struct hmm_range *range = hmm_vma_walk->range; 898c2ecf20Sopenharmony_ci 908c2ecf20Sopenharmony_ci /* 918c2ecf20Sopenharmony_ci * So we not only consider the individual per page request we also 928c2ecf20Sopenharmony_ci * consider the default flags requested for the range. The API can 938c2ecf20Sopenharmony_ci * be used 2 ways. The first one where the HMM user coalesces 948c2ecf20Sopenharmony_ci * multiple page faults into one request and sets flags per pfn for 958c2ecf20Sopenharmony_ci * those faults. The second one where the HMM user wants to pre- 968c2ecf20Sopenharmony_ci * fault a range with specific flags. For the latter one it is a 978c2ecf20Sopenharmony_ci * waste to have the user pre-fill the pfn arrays with a default 988c2ecf20Sopenharmony_ci * flags value. 998c2ecf20Sopenharmony_ci */ 1008c2ecf20Sopenharmony_ci pfn_req_flags &= range->pfn_flags_mask; 1018c2ecf20Sopenharmony_ci pfn_req_flags |= range->default_flags; 1028c2ecf20Sopenharmony_ci 1038c2ecf20Sopenharmony_ci /* We aren't ask to do anything ... */ 1048c2ecf20Sopenharmony_ci if (!(pfn_req_flags & HMM_PFN_REQ_FAULT)) 1058c2ecf20Sopenharmony_ci return 0; 1068c2ecf20Sopenharmony_ci 1078c2ecf20Sopenharmony_ci /* Need to write fault ? */ 1088c2ecf20Sopenharmony_ci if ((pfn_req_flags & HMM_PFN_REQ_WRITE) && 1098c2ecf20Sopenharmony_ci !(cpu_flags & HMM_PFN_WRITE)) 1108c2ecf20Sopenharmony_ci return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT; 1118c2ecf20Sopenharmony_ci 1128c2ecf20Sopenharmony_ci /* If CPU page table is not valid then we need to fault */ 1138c2ecf20Sopenharmony_ci if (!(cpu_flags & HMM_PFN_VALID)) 1148c2ecf20Sopenharmony_ci return HMM_NEED_FAULT; 1158c2ecf20Sopenharmony_ci return 0; 1168c2ecf20Sopenharmony_ci} 1178c2ecf20Sopenharmony_ci 1188c2ecf20Sopenharmony_cistatic unsigned int 1198c2ecf20Sopenharmony_cihmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, 1208c2ecf20Sopenharmony_ci const unsigned long hmm_pfns[], unsigned long npages, 1218c2ecf20Sopenharmony_ci unsigned long cpu_flags) 1228c2ecf20Sopenharmony_ci{ 1238c2ecf20Sopenharmony_ci struct hmm_range *range = hmm_vma_walk->range; 1248c2ecf20Sopenharmony_ci unsigned int required_fault = 0; 1258c2ecf20Sopenharmony_ci unsigned long i; 1268c2ecf20Sopenharmony_ci 1278c2ecf20Sopenharmony_ci /* 1288c2ecf20Sopenharmony_ci * If the default flags do not request to fault pages, and the mask does 1298c2ecf20Sopenharmony_ci * not allow for individual pages to be faulted, then 1308c2ecf20Sopenharmony_ci * hmm_pte_need_fault() will always return 0. 1318c2ecf20Sopenharmony_ci */ 1328c2ecf20Sopenharmony_ci if (!((range->default_flags | range->pfn_flags_mask) & 1338c2ecf20Sopenharmony_ci HMM_PFN_REQ_FAULT)) 1348c2ecf20Sopenharmony_ci return 0; 1358c2ecf20Sopenharmony_ci 1368c2ecf20Sopenharmony_ci for (i = 0; i < npages; ++i) { 1378c2ecf20Sopenharmony_ci required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i], 1388c2ecf20Sopenharmony_ci cpu_flags); 1398c2ecf20Sopenharmony_ci if (required_fault == HMM_NEED_ALL_BITS) 1408c2ecf20Sopenharmony_ci return required_fault; 1418c2ecf20Sopenharmony_ci } 1428c2ecf20Sopenharmony_ci return required_fault; 1438c2ecf20Sopenharmony_ci} 1448c2ecf20Sopenharmony_ci 1458c2ecf20Sopenharmony_cistatic int hmm_vma_walk_hole(unsigned long addr, unsigned long end, 1468c2ecf20Sopenharmony_ci __always_unused int depth, struct mm_walk *walk) 1478c2ecf20Sopenharmony_ci{ 1488c2ecf20Sopenharmony_ci struct hmm_vma_walk *hmm_vma_walk = walk->private; 1498c2ecf20Sopenharmony_ci struct hmm_range *range = hmm_vma_walk->range; 1508c2ecf20Sopenharmony_ci unsigned int required_fault; 1518c2ecf20Sopenharmony_ci unsigned long i, npages; 1528c2ecf20Sopenharmony_ci unsigned long *hmm_pfns; 1538c2ecf20Sopenharmony_ci 1548c2ecf20Sopenharmony_ci i = (addr - range->start) >> PAGE_SHIFT; 1558c2ecf20Sopenharmony_ci npages = (end - addr) >> PAGE_SHIFT; 1568c2ecf20Sopenharmony_ci hmm_pfns = &range->hmm_pfns[i]; 1578c2ecf20Sopenharmony_ci required_fault = 1588c2ecf20Sopenharmony_ci hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0); 1598c2ecf20Sopenharmony_ci if (!walk->vma) { 1608c2ecf20Sopenharmony_ci if (required_fault) 1618c2ecf20Sopenharmony_ci return -EFAULT; 1628c2ecf20Sopenharmony_ci return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR); 1638c2ecf20Sopenharmony_ci } 1648c2ecf20Sopenharmony_ci if (required_fault) 1658c2ecf20Sopenharmony_ci return hmm_vma_fault(addr, end, required_fault, walk); 1668c2ecf20Sopenharmony_ci return hmm_pfns_fill(addr, end, range, 0); 1678c2ecf20Sopenharmony_ci} 1688c2ecf20Sopenharmony_ci 1698c2ecf20Sopenharmony_cistatic inline unsigned long hmm_pfn_flags_order(unsigned long order) 1708c2ecf20Sopenharmony_ci{ 1718c2ecf20Sopenharmony_ci return order << HMM_PFN_ORDER_SHIFT; 1728c2ecf20Sopenharmony_ci} 1738c2ecf20Sopenharmony_ci 1748c2ecf20Sopenharmony_cistatic inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range, 1758c2ecf20Sopenharmony_ci pmd_t pmd) 1768c2ecf20Sopenharmony_ci{ 1778c2ecf20Sopenharmony_ci if (pmd_protnone(pmd)) 1788c2ecf20Sopenharmony_ci return 0; 1798c2ecf20Sopenharmony_ci return (pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : 1808c2ecf20Sopenharmony_ci HMM_PFN_VALID) | 1818c2ecf20Sopenharmony_ci hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT); 1828c2ecf20Sopenharmony_ci} 1838c2ecf20Sopenharmony_ci 1848c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1858c2ecf20Sopenharmony_cistatic int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, 1868c2ecf20Sopenharmony_ci unsigned long end, unsigned long hmm_pfns[], 1878c2ecf20Sopenharmony_ci pmd_t pmd) 1888c2ecf20Sopenharmony_ci{ 1898c2ecf20Sopenharmony_ci struct hmm_vma_walk *hmm_vma_walk = walk->private; 1908c2ecf20Sopenharmony_ci struct hmm_range *range = hmm_vma_walk->range; 1918c2ecf20Sopenharmony_ci unsigned long pfn, npages, i; 1928c2ecf20Sopenharmony_ci unsigned int required_fault; 1938c2ecf20Sopenharmony_ci unsigned long cpu_flags; 1948c2ecf20Sopenharmony_ci 1958c2ecf20Sopenharmony_ci npages = (end - addr) >> PAGE_SHIFT; 1968c2ecf20Sopenharmony_ci cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); 1978c2ecf20Sopenharmony_ci required_fault = 1988c2ecf20Sopenharmony_ci hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags); 1998c2ecf20Sopenharmony_ci if (required_fault) 2008c2ecf20Sopenharmony_ci return hmm_vma_fault(addr, end, required_fault, walk); 2018c2ecf20Sopenharmony_ci 2028c2ecf20Sopenharmony_ci pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 2038c2ecf20Sopenharmony_ci for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) 2048c2ecf20Sopenharmony_ci hmm_pfns[i] = pfn | cpu_flags; 2058c2ecf20Sopenharmony_ci return 0; 2068c2ecf20Sopenharmony_ci} 2078c2ecf20Sopenharmony_ci#else /* CONFIG_TRANSPARENT_HUGEPAGE */ 2088c2ecf20Sopenharmony_ci/* stub to allow the code below to compile */ 2098c2ecf20Sopenharmony_ciint hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, 2108c2ecf20Sopenharmony_ci unsigned long end, unsigned long hmm_pfns[], pmd_t pmd); 2118c2ecf20Sopenharmony_ci#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2128c2ecf20Sopenharmony_ci 2138c2ecf20Sopenharmony_cistatic inline bool hmm_is_device_private_entry(struct hmm_range *range, 2148c2ecf20Sopenharmony_ci swp_entry_t entry) 2158c2ecf20Sopenharmony_ci{ 2168c2ecf20Sopenharmony_ci return is_device_private_entry(entry) && 2178c2ecf20Sopenharmony_ci device_private_entry_to_page(entry)->pgmap->owner == 2188c2ecf20Sopenharmony_ci range->dev_private_owner; 2198c2ecf20Sopenharmony_ci} 2208c2ecf20Sopenharmony_ci 2218c2ecf20Sopenharmony_cistatic inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range, 2228c2ecf20Sopenharmony_ci pte_t pte) 2238c2ecf20Sopenharmony_ci{ 2248c2ecf20Sopenharmony_ci if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte)) 2258c2ecf20Sopenharmony_ci return 0; 2268c2ecf20Sopenharmony_ci return pte_write(pte) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; 2278c2ecf20Sopenharmony_ci} 2288c2ecf20Sopenharmony_ci 2298c2ecf20Sopenharmony_cistatic int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, 2308c2ecf20Sopenharmony_ci unsigned long end, pmd_t *pmdp, pte_t *ptep, 2318c2ecf20Sopenharmony_ci unsigned long *hmm_pfn) 2328c2ecf20Sopenharmony_ci{ 2338c2ecf20Sopenharmony_ci struct hmm_vma_walk *hmm_vma_walk = walk->private; 2348c2ecf20Sopenharmony_ci struct hmm_range *range = hmm_vma_walk->range; 2358c2ecf20Sopenharmony_ci unsigned int required_fault; 2368c2ecf20Sopenharmony_ci unsigned long cpu_flags; 2378c2ecf20Sopenharmony_ci pte_t pte = *ptep; 2388c2ecf20Sopenharmony_ci uint64_t pfn_req_flags = *hmm_pfn; 2398c2ecf20Sopenharmony_ci 2408c2ecf20Sopenharmony_ci if (pte_none(pte)) { 2418c2ecf20Sopenharmony_ci required_fault = 2428c2ecf20Sopenharmony_ci hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); 2438c2ecf20Sopenharmony_ci if (required_fault) 2448c2ecf20Sopenharmony_ci goto fault; 2458c2ecf20Sopenharmony_ci *hmm_pfn = 0; 2468c2ecf20Sopenharmony_ci return 0; 2478c2ecf20Sopenharmony_ci } 2488c2ecf20Sopenharmony_ci 2498c2ecf20Sopenharmony_ci if (!pte_present(pte)) { 2508c2ecf20Sopenharmony_ci swp_entry_t entry = pte_to_swp_entry(pte); 2518c2ecf20Sopenharmony_ci 2528c2ecf20Sopenharmony_ci /* 2538c2ecf20Sopenharmony_ci * Never fault in device private pages, but just report 2548c2ecf20Sopenharmony_ci * the PFN even if not present. 2558c2ecf20Sopenharmony_ci */ 2568c2ecf20Sopenharmony_ci if (hmm_is_device_private_entry(range, entry)) { 2578c2ecf20Sopenharmony_ci cpu_flags = HMM_PFN_VALID; 2588c2ecf20Sopenharmony_ci if (is_write_device_private_entry(entry)) 2598c2ecf20Sopenharmony_ci cpu_flags |= HMM_PFN_WRITE; 2608c2ecf20Sopenharmony_ci *hmm_pfn = device_private_entry_to_pfn(entry) | 2618c2ecf20Sopenharmony_ci cpu_flags; 2628c2ecf20Sopenharmony_ci return 0; 2638c2ecf20Sopenharmony_ci } 2648c2ecf20Sopenharmony_ci 2658c2ecf20Sopenharmony_ci required_fault = 2668c2ecf20Sopenharmony_ci hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); 2678c2ecf20Sopenharmony_ci if (!required_fault) { 2688c2ecf20Sopenharmony_ci *hmm_pfn = 0; 2698c2ecf20Sopenharmony_ci return 0; 2708c2ecf20Sopenharmony_ci } 2718c2ecf20Sopenharmony_ci 2728c2ecf20Sopenharmony_ci if (!non_swap_entry(entry)) 2738c2ecf20Sopenharmony_ci goto fault; 2748c2ecf20Sopenharmony_ci 2758c2ecf20Sopenharmony_ci if (is_migration_entry(entry)) { 2768c2ecf20Sopenharmony_ci pte_unmap(ptep); 2778c2ecf20Sopenharmony_ci hmm_vma_walk->last = addr; 2788c2ecf20Sopenharmony_ci migration_entry_wait(walk->mm, pmdp, addr); 2798c2ecf20Sopenharmony_ci return -EBUSY; 2808c2ecf20Sopenharmony_ci } 2818c2ecf20Sopenharmony_ci 2828c2ecf20Sopenharmony_ci /* Report error for everything else */ 2838c2ecf20Sopenharmony_ci pte_unmap(ptep); 2848c2ecf20Sopenharmony_ci return -EFAULT; 2858c2ecf20Sopenharmony_ci } 2868c2ecf20Sopenharmony_ci 2878c2ecf20Sopenharmony_ci cpu_flags = pte_to_hmm_pfn_flags(range, pte); 2888c2ecf20Sopenharmony_ci required_fault = 2898c2ecf20Sopenharmony_ci hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); 2908c2ecf20Sopenharmony_ci if (required_fault) 2918c2ecf20Sopenharmony_ci goto fault; 2928c2ecf20Sopenharmony_ci 2938c2ecf20Sopenharmony_ci /* 2948c2ecf20Sopenharmony_ci * Bypass devmap pte such as DAX page when all pfn requested 2958c2ecf20Sopenharmony_ci * flags(pfn_req_flags) are fulfilled. 2968c2ecf20Sopenharmony_ci * Since each architecture defines a struct page for the zero page, just 2978c2ecf20Sopenharmony_ci * fall through and treat it like a normal page. 2988c2ecf20Sopenharmony_ci */ 2998c2ecf20Sopenharmony_ci if (!vm_normal_page(walk->vma, addr, pte) && 3008c2ecf20Sopenharmony_ci !pte_devmap(pte) && 3018c2ecf20Sopenharmony_ci !is_zero_pfn(pte_pfn(pte))) { 3028c2ecf20Sopenharmony_ci if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { 3038c2ecf20Sopenharmony_ci pte_unmap(ptep); 3048c2ecf20Sopenharmony_ci return -EFAULT; 3058c2ecf20Sopenharmony_ci } 3068c2ecf20Sopenharmony_ci *hmm_pfn = HMM_PFN_ERROR; 3078c2ecf20Sopenharmony_ci return 0; 3088c2ecf20Sopenharmony_ci } 3098c2ecf20Sopenharmony_ci 3108c2ecf20Sopenharmony_ci *hmm_pfn = pte_pfn(pte) | cpu_flags; 3118c2ecf20Sopenharmony_ci return 0; 3128c2ecf20Sopenharmony_ci 3138c2ecf20Sopenharmony_cifault: 3148c2ecf20Sopenharmony_ci pte_unmap(ptep); 3158c2ecf20Sopenharmony_ci /* Fault any virtual address we were asked to fault */ 3168c2ecf20Sopenharmony_ci return hmm_vma_fault(addr, end, required_fault, walk); 3178c2ecf20Sopenharmony_ci} 3188c2ecf20Sopenharmony_ci 3198c2ecf20Sopenharmony_cistatic int hmm_vma_walk_pmd(pmd_t *pmdp, 3208c2ecf20Sopenharmony_ci unsigned long start, 3218c2ecf20Sopenharmony_ci unsigned long end, 3228c2ecf20Sopenharmony_ci struct mm_walk *walk) 3238c2ecf20Sopenharmony_ci{ 3248c2ecf20Sopenharmony_ci struct hmm_vma_walk *hmm_vma_walk = walk->private; 3258c2ecf20Sopenharmony_ci struct hmm_range *range = hmm_vma_walk->range; 3268c2ecf20Sopenharmony_ci unsigned long *hmm_pfns = 3278c2ecf20Sopenharmony_ci &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT]; 3288c2ecf20Sopenharmony_ci unsigned long npages = (end - start) >> PAGE_SHIFT; 3298c2ecf20Sopenharmony_ci unsigned long addr = start; 3308c2ecf20Sopenharmony_ci pte_t *ptep; 3318c2ecf20Sopenharmony_ci pmd_t pmd; 3328c2ecf20Sopenharmony_ci 3338c2ecf20Sopenharmony_ciagain: 3348c2ecf20Sopenharmony_ci pmd = READ_ONCE(*pmdp); 3358c2ecf20Sopenharmony_ci if (pmd_none(pmd)) 3368c2ecf20Sopenharmony_ci return hmm_vma_walk_hole(start, end, -1, walk); 3378c2ecf20Sopenharmony_ci 3388c2ecf20Sopenharmony_ci if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { 3398c2ecf20Sopenharmony_ci if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) { 3408c2ecf20Sopenharmony_ci hmm_vma_walk->last = addr; 3418c2ecf20Sopenharmony_ci pmd_migration_entry_wait(walk->mm, pmdp); 3428c2ecf20Sopenharmony_ci return -EBUSY; 3438c2ecf20Sopenharmony_ci } 3448c2ecf20Sopenharmony_ci return hmm_pfns_fill(start, end, range, 0); 3458c2ecf20Sopenharmony_ci } 3468c2ecf20Sopenharmony_ci 3478c2ecf20Sopenharmony_ci if (!pmd_present(pmd)) { 3488c2ecf20Sopenharmony_ci if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) 3498c2ecf20Sopenharmony_ci return -EFAULT; 3508c2ecf20Sopenharmony_ci return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); 3518c2ecf20Sopenharmony_ci } 3528c2ecf20Sopenharmony_ci 3538c2ecf20Sopenharmony_ci if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { 3548c2ecf20Sopenharmony_ci /* 3558c2ecf20Sopenharmony_ci * No need to take pmd_lock here, even if some other thread 3568c2ecf20Sopenharmony_ci * is splitting the huge pmd we will get that event through 3578c2ecf20Sopenharmony_ci * mmu_notifier callback. 3588c2ecf20Sopenharmony_ci * 3598c2ecf20Sopenharmony_ci * So just read pmd value and check again it's a transparent 3608c2ecf20Sopenharmony_ci * huge or device mapping one and compute corresponding pfn 3618c2ecf20Sopenharmony_ci * values. 3628c2ecf20Sopenharmony_ci */ 3638c2ecf20Sopenharmony_ci pmd = pmd_read_atomic(pmdp); 3648c2ecf20Sopenharmony_ci barrier(); 3658c2ecf20Sopenharmony_ci if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) 3668c2ecf20Sopenharmony_ci goto again; 3678c2ecf20Sopenharmony_ci 3688c2ecf20Sopenharmony_ci return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd); 3698c2ecf20Sopenharmony_ci } 3708c2ecf20Sopenharmony_ci 3718c2ecf20Sopenharmony_ci /* 3728c2ecf20Sopenharmony_ci * We have handled all the valid cases above ie either none, migration, 3738c2ecf20Sopenharmony_ci * huge or transparent huge. At this point either it is a valid pmd 3748c2ecf20Sopenharmony_ci * entry pointing to pte directory or it is a bad pmd that will not 3758c2ecf20Sopenharmony_ci * recover. 3768c2ecf20Sopenharmony_ci */ 3778c2ecf20Sopenharmony_ci if (pmd_bad(pmd)) { 3788c2ecf20Sopenharmony_ci if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) 3798c2ecf20Sopenharmony_ci return -EFAULT; 3808c2ecf20Sopenharmony_ci return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); 3818c2ecf20Sopenharmony_ci } 3828c2ecf20Sopenharmony_ci 3838c2ecf20Sopenharmony_ci ptep = pte_offset_map(pmdp, addr); 3848c2ecf20Sopenharmony_ci for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) { 3858c2ecf20Sopenharmony_ci int r; 3868c2ecf20Sopenharmony_ci 3878c2ecf20Sopenharmony_ci r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns); 3888c2ecf20Sopenharmony_ci if (r) { 3898c2ecf20Sopenharmony_ci /* hmm_vma_handle_pte() did pte_unmap() */ 3908c2ecf20Sopenharmony_ci return r; 3918c2ecf20Sopenharmony_ci } 3928c2ecf20Sopenharmony_ci } 3938c2ecf20Sopenharmony_ci pte_unmap(ptep - 1); 3948c2ecf20Sopenharmony_ci return 0; 3958c2ecf20Sopenharmony_ci} 3968c2ecf20Sopenharmony_ci 3978c2ecf20Sopenharmony_ci#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \ 3988c2ecf20Sopenharmony_ci defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) 3998c2ecf20Sopenharmony_cistatic inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range, 4008c2ecf20Sopenharmony_ci pud_t pud) 4018c2ecf20Sopenharmony_ci{ 4028c2ecf20Sopenharmony_ci if (!pud_present(pud)) 4038c2ecf20Sopenharmony_ci return 0; 4048c2ecf20Sopenharmony_ci return (pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : 4058c2ecf20Sopenharmony_ci HMM_PFN_VALID) | 4068c2ecf20Sopenharmony_ci hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT); 4078c2ecf20Sopenharmony_ci} 4088c2ecf20Sopenharmony_ci 4098c2ecf20Sopenharmony_cistatic int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, 4108c2ecf20Sopenharmony_ci struct mm_walk *walk) 4118c2ecf20Sopenharmony_ci{ 4128c2ecf20Sopenharmony_ci struct hmm_vma_walk *hmm_vma_walk = walk->private; 4138c2ecf20Sopenharmony_ci struct hmm_range *range = hmm_vma_walk->range; 4148c2ecf20Sopenharmony_ci unsigned long addr = start; 4158c2ecf20Sopenharmony_ci pud_t pud; 4168c2ecf20Sopenharmony_ci int ret = 0; 4178c2ecf20Sopenharmony_ci spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma); 4188c2ecf20Sopenharmony_ci 4198c2ecf20Sopenharmony_ci if (!ptl) 4208c2ecf20Sopenharmony_ci return 0; 4218c2ecf20Sopenharmony_ci 4228c2ecf20Sopenharmony_ci /* Normally we don't want to split the huge page */ 4238c2ecf20Sopenharmony_ci walk->action = ACTION_CONTINUE; 4248c2ecf20Sopenharmony_ci 4258c2ecf20Sopenharmony_ci pud = READ_ONCE(*pudp); 4268c2ecf20Sopenharmony_ci if (pud_none(pud)) { 4278c2ecf20Sopenharmony_ci spin_unlock(ptl); 4288c2ecf20Sopenharmony_ci return hmm_vma_walk_hole(start, end, -1, walk); 4298c2ecf20Sopenharmony_ci } 4308c2ecf20Sopenharmony_ci 4318c2ecf20Sopenharmony_ci if (pud_huge(pud) && pud_devmap(pud)) { 4328c2ecf20Sopenharmony_ci unsigned long i, npages, pfn; 4338c2ecf20Sopenharmony_ci unsigned int required_fault; 4348c2ecf20Sopenharmony_ci unsigned long *hmm_pfns; 4358c2ecf20Sopenharmony_ci unsigned long cpu_flags; 4368c2ecf20Sopenharmony_ci 4378c2ecf20Sopenharmony_ci if (!pud_present(pud)) { 4388c2ecf20Sopenharmony_ci spin_unlock(ptl); 4398c2ecf20Sopenharmony_ci return hmm_vma_walk_hole(start, end, -1, walk); 4408c2ecf20Sopenharmony_ci } 4418c2ecf20Sopenharmony_ci 4428c2ecf20Sopenharmony_ci i = (addr - range->start) >> PAGE_SHIFT; 4438c2ecf20Sopenharmony_ci npages = (end - addr) >> PAGE_SHIFT; 4448c2ecf20Sopenharmony_ci hmm_pfns = &range->hmm_pfns[i]; 4458c2ecf20Sopenharmony_ci 4468c2ecf20Sopenharmony_ci cpu_flags = pud_to_hmm_pfn_flags(range, pud); 4478c2ecf20Sopenharmony_ci required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns, 4488c2ecf20Sopenharmony_ci npages, cpu_flags); 4498c2ecf20Sopenharmony_ci if (required_fault) { 4508c2ecf20Sopenharmony_ci spin_unlock(ptl); 4518c2ecf20Sopenharmony_ci return hmm_vma_fault(addr, end, required_fault, walk); 4528c2ecf20Sopenharmony_ci } 4538c2ecf20Sopenharmony_ci 4548c2ecf20Sopenharmony_ci pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 4558c2ecf20Sopenharmony_ci for (i = 0; i < npages; ++i, ++pfn) 4568c2ecf20Sopenharmony_ci hmm_pfns[i] = pfn | cpu_flags; 4578c2ecf20Sopenharmony_ci goto out_unlock; 4588c2ecf20Sopenharmony_ci } 4598c2ecf20Sopenharmony_ci 4608c2ecf20Sopenharmony_ci /* Ask for the PUD to be split */ 4618c2ecf20Sopenharmony_ci walk->action = ACTION_SUBTREE; 4628c2ecf20Sopenharmony_ci 4638c2ecf20Sopenharmony_ciout_unlock: 4648c2ecf20Sopenharmony_ci spin_unlock(ptl); 4658c2ecf20Sopenharmony_ci return ret; 4668c2ecf20Sopenharmony_ci} 4678c2ecf20Sopenharmony_ci#else 4688c2ecf20Sopenharmony_ci#define hmm_vma_walk_pud NULL 4698c2ecf20Sopenharmony_ci#endif 4708c2ecf20Sopenharmony_ci 4718c2ecf20Sopenharmony_ci#ifdef CONFIG_HUGETLB_PAGE 4728c2ecf20Sopenharmony_cistatic int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, 4738c2ecf20Sopenharmony_ci unsigned long start, unsigned long end, 4748c2ecf20Sopenharmony_ci struct mm_walk *walk) 4758c2ecf20Sopenharmony_ci{ 4768c2ecf20Sopenharmony_ci unsigned long addr = start, i, pfn; 4778c2ecf20Sopenharmony_ci struct hmm_vma_walk *hmm_vma_walk = walk->private; 4788c2ecf20Sopenharmony_ci struct hmm_range *range = hmm_vma_walk->range; 4798c2ecf20Sopenharmony_ci struct vm_area_struct *vma = walk->vma; 4808c2ecf20Sopenharmony_ci unsigned int required_fault; 4818c2ecf20Sopenharmony_ci unsigned long pfn_req_flags; 4828c2ecf20Sopenharmony_ci unsigned long cpu_flags; 4838c2ecf20Sopenharmony_ci spinlock_t *ptl; 4848c2ecf20Sopenharmony_ci pte_t entry; 4858c2ecf20Sopenharmony_ci 4868c2ecf20Sopenharmony_ci ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); 4878c2ecf20Sopenharmony_ci entry = huge_ptep_get(pte); 4888c2ecf20Sopenharmony_ci 4898c2ecf20Sopenharmony_ci i = (start - range->start) >> PAGE_SHIFT; 4908c2ecf20Sopenharmony_ci pfn_req_flags = range->hmm_pfns[i]; 4918c2ecf20Sopenharmony_ci cpu_flags = pte_to_hmm_pfn_flags(range, entry) | 4928c2ecf20Sopenharmony_ci hmm_pfn_flags_order(huge_page_order(hstate_vma(vma))); 4938c2ecf20Sopenharmony_ci required_fault = 4948c2ecf20Sopenharmony_ci hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); 4958c2ecf20Sopenharmony_ci if (required_fault) { 4968c2ecf20Sopenharmony_ci spin_unlock(ptl); 4978c2ecf20Sopenharmony_ci return hmm_vma_fault(addr, end, required_fault, walk); 4988c2ecf20Sopenharmony_ci } 4998c2ecf20Sopenharmony_ci 5008c2ecf20Sopenharmony_ci pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); 5018c2ecf20Sopenharmony_ci for (; addr < end; addr += PAGE_SIZE, i++, pfn++) 5028c2ecf20Sopenharmony_ci range->hmm_pfns[i] = pfn | cpu_flags; 5038c2ecf20Sopenharmony_ci 5048c2ecf20Sopenharmony_ci spin_unlock(ptl); 5058c2ecf20Sopenharmony_ci return 0; 5068c2ecf20Sopenharmony_ci} 5078c2ecf20Sopenharmony_ci#else 5088c2ecf20Sopenharmony_ci#define hmm_vma_walk_hugetlb_entry NULL 5098c2ecf20Sopenharmony_ci#endif /* CONFIG_HUGETLB_PAGE */ 5108c2ecf20Sopenharmony_ci 5118c2ecf20Sopenharmony_cistatic int hmm_vma_walk_test(unsigned long start, unsigned long end, 5128c2ecf20Sopenharmony_ci struct mm_walk *walk) 5138c2ecf20Sopenharmony_ci{ 5148c2ecf20Sopenharmony_ci struct hmm_vma_walk *hmm_vma_walk = walk->private; 5158c2ecf20Sopenharmony_ci struct hmm_range *range = hmm_vma_walk->range; 5168c2ecf20Sopenharmony_ci struct vm_area_struct *vma = walk->vma; 5178c2ecf20Sopenharmony_ci 5188c2ecf20Sopenharmony_ci if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) && 5198c2ecf20Sopenharmony_ci vma->vm_flags & VM_READ) 5208c2ecf20Sopenharmony_ci return 0; 5218c2ecf20Sopenharmony_ci 5228c2ecf20Sopenharmony_ci /* 5238c2ecf20Sopenharmony_ci * vma ranges that don't have struct page backing them or map I/O 5248c2ecf20Sopenharmony_ci * devices directly cannot be handled by hmm_range_fault(). 5258c2ecf20Sopenharmony_ci * 5268c2ecf20Sopenharmony_ci * If the vma does not allow read access, then assume that it does not 5278c2ecf20Sopenharmony_ci * allow write access either. HMM does not support architectures that 5288c2ecf20Sopenharmony_ci * allow write without read. 5298c2ecf20Sopenharmony_ci * 5308c2ecf20Sopenharmony_ci * If a fault is requested for an unsupported range then it is a hard 5318c2ecf20Sopenharmony_ci * failure. 5328c2ecf20Sopenharmony_ci */ 5338c2ecf20Sopenharmony_ci if (hmm_range_need_fault(hmm_vma_walk, 5348c2ecf20Sopenharmony_ci range->hmm_pfns + 5358c2ecf20Sopenharmony_ci ((start - range->start) >> PAGE_SHIFT), 5368c2ecf20Sopenharmony_ci (end - start) >> PAGE_SHIFT, 0)) 5378c2ecf20Sopenharmony_ci return -EFAULT; 5388c2ecf20Sopenharmony_ci 5398c2ecf20Sopenharmony_ci hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); 5408c2ecf20Sopenharmony_ci 5418c2ecf20Sopenharmony_ci /* Skip this vma and continue processing the next vma. */ 5428c2ecf20Sopenharmony_ci return 1; 5438c2ecf20Sopenharmony_ci} 5448c2ecf20Sopenharmony_ci 5458c2ecf20Sopenharmony_cistatic const struct mm_walk_ops hmm_walk_ops = { 5468c2ecf20Sopenharmony_ci .pud_entry = hmm_vma_walk_pud, 5478c2ecf20Sopenharmony_ci .pmd_entry = hmm_vma_walk_pmd, 5488c2ecf20Sopenharmony_ci .pte_hole = hmm_vma_walk_hole, 5498c2ecf20Sopenharmony_ci .hugetlb_entry = hmm_vma_walk_hugetlb_entry, 5508c2ecf20Sopenharmony_ci .test_walk = hmm_vma_walk_test, 5518c2ecf20Sopenharmony_ci}; 5528c2ecf20Sopenharmony_ci 5538c2ecf20Sopenharmony_ci/** 5548c2ecf20Sopenharmony_ci * hmm_range_fault - try to fault some address in a virtual address range 5558c2ecf20Sopenharmony_ci * @range: argument structure 5568c2ecf20Sopenharmony_ci * 5578c2ecf20Sopenharmony_ci * Returns 0 on success or one of the following error codes: 5588c2ecf20Sopenharmony_ci * 5598c2ecf20Sopenharmony_ci * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma 5608c2ecf20Sopenharmony_ci * (e.g., device file vma). 5618c2ecf20Sopenharmony_ci * -ENOMEM: Out of memory. 5628c2ecf20Sopenharmony_ci * -EPERM: Invalid permission (e.g., asking for write and range is read 5638c2ecf20Sopenharmony_ci * only). 5648c2ecf20Sopenharmony_ci * -EBUSY: The range has been invalidated and the caller needs to wait for 5658c2ecf20Sopenharmony_ci * the invalidation to finish. 5668c2ecf20Sopenharmony_ci * -EFAULT: A page was requested to be valid and could not be made valid 5678c2ecf20Sopenharmony_ci * ie it has no backing VMA or it is illegal to access 5688c2ecf20Sopenharmony_ci * 5698c2ecf20Sopenharmony_ci * This is similar to get_user_pages(), except that it can read the page tables 5708c2ecf20Sopenharmony_ci * without mutating them (ie causing faults). 5718c2ecf20Sopenharmony_ci */ 5728c2ecf20Sopenharmony_ciint hmm_range_fault(struct hmm_range *range) 5738c2ecf20Sopenharmony_ci{ 5748c2ecf20Sopenharmony_ci struct hmm_vma_walk hmm_vma_walk = { 5758c2ecf20Sopenharmony_ci .range = range, 5768c2ecf20Sopenharmony_ci .last = range->start, 5778c2ecf20Sopenharmony_ci }; 5788c2ecf20Sopenharmony_ci struct mm_struct *mm = range->notifier->mm; 5798c2ecf20Sopenharmony_ci int ret; 5808c2ecf20Sopenharmony_ci 5818c2ecf20Sopenharmony_ci mmap_assert_locked(mm); 5828c2ecf20Sopenharmony_ci 5838c2ecf20Sopenharmony_ci do { 5848c2ecf20Sopenharmony_ci /* If range is no longer valid force retry. */ 5858c2ecf20Sopenharmony_ci if (mmu_interval_check_retry(range->notifier, 5868c2ecf20Sopenharmony_ci range->notifier_seq)) 5878c2ecf20Sopenharmony_ci return -EBUSY; 5888c2ecf20Sopenharmony_ci ret = walk_page_range(mm, hmm_vma_walk.last, range->end, 5898c2ecf20Sopenharmony_ci &hmm_walk_ops, &hmm_vma_walk); 5908c2ecf20Sopenharmony_ci /* 5918c2ecf20Sopenharmony_ci * When -EBUSY is returned the loop restarts with 5928c2ecf20Sopenharmony_ci * hmm_vma_walk.last set to an address that has not been stored 5938c2ecf20Sopenharmony_ci * in pfns. All entries < last in the pfn array are set to their 5948c2ecf20Sopenharmony_ci * output, and all >= are still at their input values. 5958c2ecf20Sopenharmony_ci */ 5968c2ecf20Sopenharmony_ci } while (ret == -EBUSY); 5978c2ecf20Sopenharmony_ci return ret; 5988c2ecf20Sopenharmony_ci} 5998c2ecf20Sopenharmony_ciEXPORT_SYMBOL(hmm_range_fault); 600