// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management, or HMM for short.
 */
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

#include "internal.h"

struct hmm_vma_walk {
        struct hmm_range *range;
        unsigned long last;
};

enum {
        HMM_NEED_FAULT = 1 << 0,
        HMM_NEED_WRITE_FAULT = 1 << 1,
        HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};

static int hmm_pfns_fill(unsigned long addr, unsigned long end,
                         struct hmm_range *range, unsigned long cpu_flags)
{
        unsigned long i = (addr - range->start) >> PAGE_SHIFT;

        for (; addr < end; addr += PAGE_SIZE, i++)
                range->hmm_pfns[i] = cpu_flags;
        return 0;
}
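
/*
 * Each entry written to range->hmm_pfns[] packs a pfn together with
 * HMM_PFN_* flag bits. A minimal consumer-side sketch (illustrative only;
 * hmm_pfn_to_page() and the flag names come from include/linux/hmm.h):
 *
 *	if (range->hmm_pfns[i] & HMM_PFN_VALID) {
 *		struct page *page = hmm_pfn_to_page(range->hmm_pfns[i]);
 *		bool writable = range->hmm_pfns[i] & HMM_PFN_WRITE;
 *		// mirror 'page' into the device page table
 *	}
 */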

/*
 * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @required_fault: HMM_NEED_* flags
 * @walk: mm_walk structure
 * Return: -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_fault(unsigned long addr, unsigned long end,
                         unsigned int required_fault, struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct vm_area_struct *vma = walk->vma;
        unsigned int fault_flags = FAULT_FLAG_REMOTE;

        WARN_ON_ONCE(!required_fault);
        hmm_vma_walk->last = addr;

        if (required_fault & HMM_NEED_WRITE_FAULT) {
                if (!(vma->vm_flags & VM_WRITE))
                        return -EPERM;
                fault_flags |= FAULT_FLAG_WRITE;
        }

        for (; addr < end; addr += PAGE_SIZE)
                if (handle_mm_fault(vma, addr, fault_flags, NULL) &
                    VM_FAULT_ERROR)
                        return -EFAULT;
        return -EBUSY;
}
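
/*
 * An illustrative sketch (not new API; the fields and HMM_PFN_REQ_* flags
 * are from include/linux/hmm.h) of the two ways callers drive the
 * default_flags/pfn_flags_mask logic in hmm_pte_need_fault() below:
 *
 *	// 1) Pre-fault the whole range for write, ignoring per-pfn input:
 *	range.default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE;
 *	range.pfn_flags_mask = 0;
 *
 *	// 2) Fault selectively, honoring the flags set per pfn:
 *	range.default_flags = 0;
 *	range.pfn_flags_mask = ~0UL;
 *	range.hmm_pfns[i] = HMM_PFN_REQ_FAULT;	// only entry i faults
 */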

static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
                                       unsigned long pfn_req_flags,
                                       unsigned long cpu_flags)
{
        struct hmm_range *range = hmm_vma_walk->range;

        /*
         * Consider not only the individual per-page request but also the
         * default flags requested for the whole range. The API can be used
         * two ways: in the first, the HMM user coalesces multiple page
         * faults into one request and sets flags per pfn for those faults;
         * in the second, the HMM user wants to pre-fault a range with
         * specific flags. For the latter it would be a waste to have the
         * user pre-fill the pfn array with a default flags value.
         */
        pfn_req_flags &= range->pfn_flags_mask;
        pfn_req_flags |= range->default_flags;

        /* We aren't asked to do anything ... */
        if (!(pfn_req_flags & HMM_PFN_REQ_FAULT))
                return 0;

        /* Do we need a write fault? */
        if ((pfn_req_flags & HMM_PFN_REQ_WRITE) &&
            !(cpu_flags & HMM_PFN_WRITE))
                return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;

        /* If the CPU page table is not valid then we need to fault. */
        if (!(cpu_flags & HMM_PFN_VALID))
                return HMM_NEED_FAULT;
        return 0;
}

static unsigned int
hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
                     const unsigned long hmm_pfns[], unsigned long npages,
                     unsigned long cpu_flags)
{
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned int required_fault = 0;
        unsigned long i;

        /*
         * If the default flags do not request to fault pages, and the mask
         * does not allow for individual pages to be faulted, then
         * hmm_pte_need_fault() will always return 0.
         */
        if (!((range->default_flags | range->pfn_flags_mask) &
              HMM_PFN_REQ_FAULT))
                return 0;

        for (i = 0; i < npages; ++i) {
                required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i],
                                                     cpu_flags);
                if (required_fault == HMM_NEED_ALL_BITS)
                        return required_fault;
        }
        return required_fault;
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
                             __always_unused int depth, struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned int required_fault;
        unsigned long i, npages;
        unsigned long *hmm_pfns;

        i = (addr - range->start) >> PAGE_SHIFT;
        npages = (end - addr) >> PAGE_SHIFT;
        hmm_pfns = &range->hmm_pfns[i];
        required_fault =
                hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0);
        if (!walk->vma) {
                if (required_fault)
                        return -EFAULT;
                return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
        }
        if (required_fault)
                return hmm_vma_fault(addr, end, required_fault, walk);
        return hmm_pfns_fill(addr, end, range, 0);
}

static inline unsigned long hmm_pfn_flags_order(unsigned long order)
{
        return order << HMM_PFN_ORDER_SHIFT;
}
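
/*
 * A minimal decoding sketch for the order bits (hmm_pfn_to_map_order() is
 * the accessor in include/linux/hmm.h): a 2MiB THP on x86-64 is reported
 * with order 9, so a driver can map 512 pages from one entry:
 *
 *	unsigned int order = hmm_pfn_to_map_order(range->hmm_pfns[i]);
 *	unsigned long npages = 1UL << order;	// 512 for a 2MiB THP
 */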

static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
                                                 pmd_t pmd)
{
        if (pmd_protnone(pmd))
                return 0;
        return (pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
                                 HMM_PFN_VALID) |
               hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
                              unsigned long end, unsigned long hmm_pfns[],
                              pmd_t pmd)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned long pfn, npages, i;
        unsigned int required_fault;
        unsigned long cpu_flags;

        npages = (end - addr) >> PAGE_SHIFT;
        cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
        required_fault =
                hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags);
        if (required_fault)
                return hmm_vma_fault(addr, end, required_fault, walk);

        pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
                hmm_pfns[i] = pfn | cpu_flags;
        return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
                       unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
                                                 pte_t pte)
{
        if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
                return 0;
        return pte_write(pte) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
                                HMM_PFN_VALID;
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
                              unsigned long end, pmd_t *pmdp, pte_t *ptep,
                              unsigned long *hmm_pfn)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned int required_fault;
        unsigned long cpu_flags;
        pte_t pte = ptep_get(ptep);
        uint64_t pfn_req_flags = *hmm_pfn;

        if (pte_none_mostly(pte)) {
                required_fault =
                        hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
                if (required_fault)
                        goto fault;
                *hmm_pfn = 0;
                return 0;
        }

        if (!pte_present(pte)) {
                swp_entry_t entry = pte_to_swp_entry(pte);

                /*
                 * Don't fault in device private pages owned by the caller,
                 * just report the PFN.
                 */
                if (is_device_private_entry(entry) &&
                    pfn_swap_entry_to_page(entry)->pgmap->owner ==
                    range->dev_private_owner) {
                        cpu_flags = HMM_PFN_VALID;
                        if (is_writable_device_private_entry(entry))
                                cpu_flags |= HMM_PFN_WRITE;
                        *hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
                        return 0;
                }

                required_fault =
                        hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
                if (!required_fault) {
                        *hmm_pfn = 0;
                        return 0;
                }

                if (!non_swap_entry(entry))
                        goto fault;

                if (is_device_private_entry(entry))
                        goto fault;

                if (is_device_exclusive_entry(entry))
                        goto fault;

                if (is_migration_entry(entry)) {
                        pte_unmap(ptep);
                        hmm_vma_walk->last = addr;
                        migration_entry_wait(walk->mm, pmdp, addr);
                        return -EBUSY;
                }

                /* Report error for everything else */
                pte_unmap(ptep);
                return -EFAULT;
        }

        cpu_flags = pte_to_hmm_pfn_flags(range, pte);
        required_fault =
                hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
        if (required_fault)
                goto fault;

        /*
         * Bypass devmap ptes such as DAX pages when all the requested pfn
         * flags (pfn_req_flags) are fulfilled.
         * Since each architecture defines a struct page for the zero page,
         * just fall through and treat it like a normal page.
         */
        if (!vm_normal_page(walk->vma, addr, pte) &&
            !pte_devmap(pte) &&
            !is_zero_pfn(pte_pfn(pte))) {
                if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
                        pte_unmap(ptep);
                        return -EFAULT;
                }
                *hmm_pfn = HMM_PFN_ERROR;
                return 0;
        }

        *hmm_pfn = pte_pfn(pte) | cpu_flags;
        return 0;

fault:
        pte_unmap(ptep);
        /* Fault any virtual address we were asked to fault. */
        return hmm_vma_fault(addr, end, required_fault, walk);
}
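
/*
 * A hedged usage note for the device-private path above: a driver that
 * wants its own device-private pages reported by pfn (rather than faulted
 * back to system memory) sets the owner field before the walk, e.g.:
 *
 *	range.dev_private_owner = my_owner;	// must match the pages'
 *						// pgmap->owner
 *
 * "my_owner" is a hypothetical cookie; it is whatever pointer the driver
 * registered as pgmap->owner for its device memory.
 */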

static int hmm_vma_walk_pmd(pmd_t *pmdp,
                            unsigned long start,
                            unsigned long end,
                            struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned long *hmm_pfns =
                &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
        unsigned long npages = (end - start) >> PAGE_SHIFT;
        unsigned long addr = start;
        pte_t *ptep;
        pmd_t pmd;

again:
        pmd = pmdp_get_lockless(pmdp);
        if (pmd_none(pmd))
                return hmm_vma_walk_hole(start, end, -1, walk);

        if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
                if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
                        hmm_vma_walk->last = addr;
                        pmd_migration_entry_wait(walk->mm, pmdp);
                        return -EBUSY;
                }
                return hmm_pfns_fill(start, end, range, 0);
        }

        if (!pmd_present(pmd)) {
                if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
                        return -EFAULT;
                return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
        }

        if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
                /*
                 * No need to take the pmd lock here; even if some other
                 * thread is splitting the huge pmd we will get that event
                 * through the mmu_notifier callback.
                 *
                 * So just read the pmd value again, check that it is still
                 * a transparent huge or device mapping entry, and compute
                 * the corresponding pfn values.
                 */
                pmd = pmdp_get_lockless(pmdp);
                if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
                        goto again;

                return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
        }

        /*
         * We have handled all the valid cases above, i.e. none, migration,
         * huge or transparent huge. At this point it is either a valid pmd
         * entry pointing to a pte directory or a bad pmd that will not
         * recover.
         */
        if (pmd_bad(pmd)) {
                if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
                        return -EFAULT;
                return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
        }

        ptep = pte_offset_map(pmdp, addr);
        if (!ptep)
                goto again;
        for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
                int r;

                r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
                if (r) {
                        /* hmm_vma_handle_pte() did the pte_unmap() */
                        return r;
                }
        }
        pte_unmap(ptep - 1);
        return 0;
}

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
                                                 pud_t pud)
{
        if (!pud_present(pud))
                return 0;
        return (pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
                                 HMM_PFN_VALID) |
               hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT);
}

static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
                            struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned long addr = start;
        pud_t pud;
        spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);

        if (!ptl)
                return 0;

        /* Normally we don't want to split the huge page */
        walk->action = ACTION_CONTINUE;

        pud = READ_ONCE(*pudp);
        if (pud_none(pud)) {
                spin_unlock(ptl);
                return hmm_vma_walk_hole(start, end, -1, walk);
        }

        if (pud_huge(pud) && pud_devmap(pud)) {
                unsigned long i, npages, pfn;
                unsigned int required_fault;
                unsigned long *hmm_pfns;
                unsigned long cpu_flags;

                if (!pud_present(pud)) {
                        spin_unlock(ptl);
                        return hmm_vma_walk_hole(start, end, -1, walk);
                }

                i = (addr - range->start) >> PAGE_SHIFT;
                npages = (end - addr) >> PAGE_SHIFT;
                hmm_pfns = &range->hmm_pfns[i];

                cpu_flags = pud_to_hmm_pfn_flags(range, pud);
                required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
                                                      npages, cpu_flags);
                if (required_fault) {
                        spin_unlock(ptl);
                        return hmm_vma_fault(addr, end, required_fault, walk);
                }

                pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
                for (i = 0; i < npages; ++i, ++pfn)
                        hmm_pfns[i] = pfn | cpu_flags;
                goto out_unlock;
        }

        /* Ask for the PUD to be split */
        walk->action = ACTION_SUBTREE;

out_unlock:
        spin_unlock(ptl);
        return 0;
}
#else
#define hmm_vma_walk_pud	NULL
#endif

#ifdef CONFIG_HUGETLB_PAGE
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
                                      unsigned long start, unsigned long end,
                                      struct mm_walk *walk)
{
        unsigned long addr = start, i, pfn;
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        unsigned int required_fault;
        unsigned long pfn_req_flags;
        unsigned long cpu_flags;
        spinlock_t *ptl;
        pte_t entry;

        ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
        entry = huge_ptep_get(pte);

        i = (start - range->start) >> PAGE_SHIFT;
        pfn_req_flags = range->hmm_pfns[i];
        cpu_flags = pte_to_hmm_pfn_flags(range, entry) |
                    hmm_pfn_flags_order(huge_page_order(hstate_vma(vma)));
        required_fault =
                hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
        if (required_fault) {
                int ret;

                spin_unlock(ptl);
                hugetlb_vma_unlock_read(vma);
                /*
                 * Avoid deadlock: drop the vma lock before calling
                 * hmm_vma_fault(), which will itself potentially take and
                 * drop the vma lock. This is also correct from a
                 * protection point of view, because there is no further
                 * use here of either pte or ptl after dropping the vma
                 * lock.
                 */
                ret = hmm_vma_fault(addr, end, required_fault, walk);
                hugetlb_vma_lock_read(vma);
                return ret;
        }

        pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
        for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
                range->hmm_pfns[i] = pfn | cpu_flags;

        spin_unlock(ptl);
        return 0;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static int hmm_vma_walk_test(unsigned long start, unsigned long end,
                             struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;

        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
            vma->vm_flags & VM_READ)
                return 0;

        /*
         * vma ranges that don't have struct page backing them, or that map
         * I/O devices directly, cannot be handled by hmm_range_fault().
         *
         * If the vma does not allow read access, then assume that it does not
         * allow write access either. HMM does not support architectures that
         * allow write without read.
         *
         * If a fault is requested for an unsupported range then it is a hard
         * failure.
         */
        if (hmm_range_need_fault(hmm_vma_walk,
                                 range->hmm_pfns +
                                         ((start - range->start) >> PAGE_SHIFT),
                                 (end - start) >> PAGE_SHIFT, 0))
                return -EFAULT;

        hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);

        /* Skip this vma and continue processing the next vma. */
        return 1;
}

static const struct mm_walk_ops hmm_walk_ops = {
        .pud_entry	= hmm_vma_walk_pud,
        .pmd_entry	= hmm_vma_walk_pmd,
        .pte_hole	= hmm_vma_walk_hole,
        .hugetlb_entry	= hmm_vma_walk_hugetlb_entry,
        .test_walk	= hmm_vma_walk_test,
        .walk_lock	= PGWALK_RDLOCK,
};

/**
 * hmm_range_fault - try to fault some address in a virtual address range
 * @range:	argument structure
 *
 * Returns 0 on success or one of the following error codes:
 *
 * -EINVAL:	Invalid arguments or mm or virtual address is in an
 *		invalid vma (e.g., device file vma).
 * -ENOMEM:	Out of memory.
 * -EPERM:	Invalid permission (e.g., asking for write and range is
 *		read only).
 * -EBUSY:	The range has been invalidated and the caller needs to wait
 *		for the invalidation to finish.
 * -EFAULT:	A page was requested to be valid and could not be made valid,
 *		i.e. it has no backing VMA or it is illegal to access.
 *
 * This is similar to get_user_pages(), except that it can read the page
 * tables without mutating them (i.e. without causing faults).
 */
int hmm_range_fault(struct hmm_range *range)
{
        struct hmm_vma_walk hmm_vma_walk = {
                .range = range,
                .last = range->start,
        };
        struct mm_struct *mm = range->notifier->mm;
        int ret;

        mmap_assert_locked(mm);

        do {
                /* If the range is no longer valid, force a retry. */
                if (mmu_interval_check_retry(range->notifier,
                                             range->notifier_seq))
                        return -EBUSY;
                ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
                                      &hmm_walk_ops, &hmm_vma_walk);
                /*
                 * When -EBUSY is returned the loop restarts with
                 * hmm_vma_walk.last set to an address that has not been
                 * stored in pfns. All entries < last in the pfn array are
                 * set to their output, and all >= are still at their input
                 * values.
                 */
        } while (ret == -EBUSY);
        return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
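
/*
 * A minimal caller sketch, adapted from Documentation/mm/hmm.rst
 * (take_lock(), release_lock() and driver->update are hypothetical
 * driver-side names):
 *
 *	int driver_populate_range(struct hmm_range *range)
 *	{
 *		int ret;
 *
 *	again:
 *		range->notifier_seq = mmu_interval_read_begin(range->notifier);
 *		mmap_read_lock(mm);
 *		ret = hmm_range_fault(range);
 *		mmap_read_unlock(mm);
 *		if (ret) {
 *			if (ret == -EBUSY)
 *				goto again;
 *			return ret;
 *		}
 *
 *		take_lock(driver->update);
 *		if (mmu_interval_read_retry(range->notifier,
 *					    range->notifier_seq)) {
 *			release_lock(driver->update);
 *			goto again;
 *		}
 *
 *		// Use range->hmm_pfns to update the device page tables,
 *		// holding driver->update against concurrent invalidation.
 *
 *		release_lock(driver->update);
 *		return 0;
 *	}
 */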