// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

#include "internal.h"

struct hmm_vma_walk {
	struct hmm_range	*range;
	unsigned long		last;
};

enum {
	HMM_NEED_FAULT = 1 << 0,
	HMM_NEED_WRITE_FAULT = 1 << 1,
	HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};

static int hmm_pfns_fill(unsigned long addr, unsigned long end,
			 struct hmm_range *range, unsigned long cpu_flags)
{
	unsigned long i = (addr - range->start) >> PAGE_SHIFT;

	for (; addr < end; addr += PAGE_SIZE, i++)
		range->hmm_pfns[i] = cpu_flags;
	return 0;
}
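
/*
 * Worked example of the index arithmetic above (a sketch, assuming 4K
 * pages, i.e. PAGE_SHIFT == 12): with range->start == 0x100000 and
 * addr == 0x103000, the first slot written is
 * i == (0x103000 - 0x100000) >> 12 == 3, i.e. the fourth entry of
 * range->hmm_pfns[].
 */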

/*
 * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @required_fault: HMM_NEED_* flags
 * @walk: mm_walk structure
 * Return: -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_fault(unsigned long addr, unsigned long end,
			 unsigned int required_fault, struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct vm_area_struct *vma = walk->vma;
	unsigned int fault_flags = FAULT_FLAG_REMOTE;

	WARN_ON_ONCE(!required_fault);
	hmm_vma_walk->last = addr;

	if (required_fault & HMM_NEED_WRITE_FAULT) {
		if (!(vma->vm_flags & VM_WRITE))
			return -EPERM;
		fault_flags |= FAULT_FLAG_WRITE;
	}

	for (; addr < end; addr += PAGE_SIZE)
		if (handle_mm_fault(vma, addr, fault_flags, NULL) &
		    VM_FAULT_ERROR)
			return -EFAULT;
	return -EBUSY;
}

static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				       unsigned long pfn_req_flags,
				       unsigned long cpu_flags)
{
	struct hmm_range *range = hmm_vma_walk->range;

	/*
	 * Consider not only the individual per-page request but also the
	 * default flags requested for the range. The API can be used in two
	 * ways: the HMM user either coalesces multiple page faults into one
	 * request and sets flags per pfn for those faults, or pre-faults an
	 * entire range with specific flags. For the latter it would be a
	 * waste to have the user pre-fill the pfn array with a default
	 * flags value.
	 */
	pfn_req_flags &= range->pfn_flags_mask;
	pfn_req_flags |= range->default_flags;

	/* We aren't asked to do anything ... */
	if (!(pfn_req_flags & HMM_PFN_REQ_FAULT))
		return 0;

	/* Need to write fault? */
	if ((pfn_req_flags & HMM_PFN_REQ_WRITE) &&
	    !(cpu_flags & HMM_PFN_WRITE))
		return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;

	/* If the CPU page table is not valid then we need to fault */
	if (!(cpu_flags & HMM_PFN_VALID))
		return HMM_NEED_FAULT;
	return 0;
}
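
/*
 * Illustrative sketch (not part of this file) of the two usage modes
 * described in hmm_pte_need_fault(). To pre-fault a whole range with
 * uniform permissions, rely on the default flags and ignore the per-pfn
 * input values:
 *
 *	range->default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE;
 *	range->pfn_flags_mask = 0;
 *
 * To coalesce individual device faults instead, set the request flags
 * per pfn and let them through the mask (here only some entries ask for
 * write):
 *
 *	range->default_flags = HMM_PFN_REQ_FAULT;
 *	range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
 *	range->hmm_pfns[i] |= HMM_PFN_REQ_WRITE;
 */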

static unsigned int
hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
		     const unsigned long hmm_pfns[], unsigned long npages,
		     unsigned long cpu_flags)
{
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault = 0;
	unsigned long i;

	/*
	 * If the default flags do not request to fault pages, and the mask does
	 * not allow for individual pages to be faulted, then
	 * hmm_pte_need_fault() will always return 0.
	 */
	if (!((range->default_flags | range->pfn_flags_mask) &
	      HMM_PFN_REQ_FAULT))
		return 0;

	for (i = 0; i < npages; ++i) {
		required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i],
						     cpu_flags);
		if (required_fault == HMM_NEED_ALL_BITS)
			return required_fault;
	}
	return required_fault;
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     __always_unused int depth, struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault;
	unsigned long i, npages;
	unsigned long *hmm_pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	hmm_pfns = &range->hmm_pfns[i];
	required_fault =
		hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0);
	if (!walk->vma) {
		if (required_fault)
			return -EFAULT;
		return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
	}
	if (required_fault)
		return hmm_vma_fault(addr, end, required_fault, walk);
	return hmm_pfns_fill(addr, end, range, 0);
}

static inline unsigned long hmm_pfn_flags_order(unsigned long order)
{
	return order << HMM_PFN_ORDER_SHIFT;
}
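
/*
 * Example of the encoding above (a sketch, assuming x86-64 with 4K base
 * pages): a 2MB transparent huge page has order PMD_SHIFT - PAGE_SHIFT
 * == 21 - 12 == 9, so the walker reports hmm_pfn_flags_order(9) in the
 * output pfn. A consumer can recover the order with
 * hmm_pfn_to_map_order() from include/linux/hmm.h.
 */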

static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
						 pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return (pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
				 HMM_PFN_VALID) |
	       hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, unsigned long hmm_pfns[],
			      pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	unsigned int required_fault;
	unsigned long cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	required_fault =
		hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags);
	if (required_fault)
		return hmm_vma_fault(addr, end, required_fault, walk);

	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
		hmm_pfns[i] = pfn | cpu_flags;
	return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
						 pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
		return 0;
	return pte_write(pte) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      unsigned long *hmm_pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault;
	unsigned long cpu_flags;
	pte_t pte = ptep_get(ptep);
	uint64_t pfn_req_flags = *hmm_pfn;

	if (pte_none_mostly(pte)) {
		required_fault =
			hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
		if (required_fault)
			goto fault;
		*hmm_pfn = 0;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		/*
		 * Don't fault in device private pages owned by the caller,
		 * just report the PFN.
		 */
		if (is_device_private_entry(entry) &&
		    pfn_swap_entry_to_page(entry)->pgmap->owner ==
		    range->dev_private_owner) {
			cpu_flags = HMM_PFN_VALID;
			if (is_writable_device_private_entry(entry))
				cpu_flags |= HMM_PFN_WRITE;
			*hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
			return 0;
		}

		required_fault =
			hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
		if (!required_fault) {
			*hmm_pfn = 0;
			return 0;
		}

		if (!non_swap_entry(entry))
			goto fault;

		if (is_device_private_entry(entry))
			goto fault;

		if (is_device_exclusive_entry(entry))
			goto fault;

		if (is_migration_entry(entry)) {
			pte_unmap(ptep);
			hmm_vma_walk->last = addr;
			migration_entry_wait(walk->mm, pmdp, addr);
			return -EBUSY;
		}

		/* Report error for everything else */
		pte_unmap(ptep);
		return -EFAULT;
	}

	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	required_fault =
		hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
	if (required_fault)
		goto fault;
	/*
	 * Bypass devmap ptes such as DAX pages when all the requested pfn
	 * flags (pfn_req_flags) are fulfilled.
	 * Since each architecture defines a struct page for the zero page,
	 * just fall through and treat it like a normal page.
	 */
	if (!vm_normal_page(walk->vma, addr, pte) &&
	    !pte_devmap(pte) &&
	    !is_zero_pfn(pte_pfn(pte))) {
		if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
			pte_unmap(ptep);
			return -EFAULT;
		}
		*hmm_pfn = HMM_PFN_ERROR;
		return 0;
	}

	*hmm_pfn = pte_pfn(pte) | cpu_flags;
	return 0;

fault:
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_fault(addr, end, required_fault, walk);
}
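
/*
 * A minimal caller-side sketch (not from this file) of the
 * dev_private_owner handling in hmm_vma_handle_pte(): a driver that
 * wants its own device-private pages reported as pfns, rather than
 * faulted back to system memory, tags the range with the same owner
 * pointer it used as pgmap->owner when creating its device memory:
 *
 *	range->dev_private_owner = drv;
 *
 * where "drv" is a hypothetical driver-private cookie. Device-private
 * entries owned by anyone else still take the fault path.
 */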

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long *hmm_pfns =
		&range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	unsigned long addr = start;
	pte_t *ptep;
	pmd_t pmd;

again:
	pmd = pmdp_get_lockless(pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, -1, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(walk->mm, pmdp);
			return -EBUSY;
		}
		return hmm_pfns_fill(start, end, range, 0);
	}

	if (!pmd_present(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take the pmd lock here: even if some other
		 * thread is splitting the huge pmd, we will get that event
		 * through the mmu_notifier callback.
		 *
		 * So just read the pmd value again, check that it is still a
		 * transparent huge or device mapping, and compute the
		 * corresponding pfn values.
		 */
		pmd = pmdp_get_lockless(pmdp);
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
	}

	/*
	 * We have handled all the valid cases above, i.e. none, migration,
	 * huge or transparent huge. At this point the pmd is either a valid
	 * entry pointing to a pte directory or a bad pmd that will not
	 * recover.
	 */
	if (pmd_bad(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	ptep = pte_offset_map(pmdp, addr);
	if (!ptep)
		goto again;
	for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
		if (r) {
			/* hmm_vma_handle_pte() did pte_unmap() */
			return r;
		}
	}
	pte_unmap(ptep - 1);
	return 0;
}

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
						 pud_t pud)
{
	if (!pud_present(pud))
		return 0;
	return (pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
				 HMM_PFN_VALID) |
	       hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT);
}

static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
		struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long addr = start;
	pud_t pud;
	spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);

	if (!ptl)
		return 0;

	/* Normally we don't want to split the huge page */
	walk->action = ACTION_CONTINUE;

	pud = READ_ONCE(*pudp);
	if (pud_none(pud)) {
		spin_unlock(ptl);
		return hmm_vma_walk_hole(start, end, -1, walk);
	}

	if (pud_huge(pud) && pud_devmap(pud)) {
		unsigned long i, npages, pfn;
		unsigned int required_fault;
		unsigned long *hmm_pfns;
		unsigned long cpu_flags;

		if (!pud_present(pud)) {
			spin_unlock(ptl);
			return hmm_vma_walk_hole(start, end, -1, walk);
		}

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		hmm_pfns = &range->hmm_pfns[i];

		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
		required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
						      npages, cpu_flags);
		if (required_fault) {
			spin_unlock(ptl);
			return hmm_vma_fault(addr, end, required_fault, walk);
		}

		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn)
			hmm_pfns[i] = pfn | cpu_flags;
		goto out_unlock;
	}

	/* Ask for the PUD to be split */
	walk->action = ACTION_SUBTREE;

out_unlock:
	spin_unlock(ptl);
	return 0;
}
#else
#define hmm_vma_walk_pud	NULL
#endif

#ifdef CONFIG_HUGETLB_PAGE
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	unsigned long addr = start, i, pfn;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	unsigned int required_fault;
	unsigned long pfn_req_flags;
	unsigned long cpu_flags;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
	entry = huge_ptep_get(pte);

	i = (start - range->start) >> PAGE_SHIFT;
	pfn_req_flags = range->hmm_pfns[i];
	cpu_flags = pte_to_hmm_pfn_flags(range, entry) |
		    hmm_pfn_flags_order(huge_page_order(hstate_vma(vma)));
	required_fault =
		hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
	if (required_fault) {
		int ret;

		spin_unlock(ptl);
		hugetlb_vma_unlock_read(vma);
		/*
		 * Avoid deadlock: drop the vma lock before calling
		 * hmm_vma_fault(), which will itself potentially take and
		 * drop the vma lock. This is also correct from a
		 * protection point of view, because there is no further
		 * use here of either pte or ptl after dropping the vma
		 * lock.
		 */
		ret = hmm_vma_fault(addr, end, required_fault, walk);
		hugetlb_vma_lock_read(vma);
		return ret;
	}

	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
	for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
		range->hmm_pfns[i] = pfn | cpu_flags;

	spin_unlock(ptl);
	return 0;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static int hmm_vma_walk_test(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
	    vma->vm_flags & VM_READ)
		return 0;

	/*
	 * vma ranges that don't have struct page backing them or map I/O
	 * devices directly cannot be handled by hmm_range_fault().
	 *
	 * If the vma does not allow read access, then assume that it does not
	 * allow write access either. HMM does not support architectures that
	 * allow write without read.
	 *
	 * If a fault is requested for an unsupported range then it is a hard
	 * failure.
	 */
	if (hmm_range_need_fault(hmm_vma_walk,
				 range->hmm_pfns +
					 ((start - range->start) >> PAGE_SHIFT),
				 (end - start) >> PAGE_SHIFT, 0))
		return -EFAULT;

	hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);

	/* Skip this vma and continue processing the next vma. */
	return 1;
}

static const struct mm_walk_ops hmm_walk_ops = {
	.pud_entry	= hmm_vma_walk_pud,
	.pmd_entry	= hmm_vma_walk_pmd,
	.pte_hole	= hmm_vma_walk_hole,
	.hugetlb_entry	= hmm_vma_walk_hugetlb_entry,
	.test_walk	= hmm_vma_walk_test,
	.walk_lock	= PGWALK_RDLOCK,
};

/**
 * hmm_range_fault - try to fault some address in a virtual address range
 * @range:	argument structure
 *
 * Returns 0 on success or one of the following error codes:
 *
 * -EINVAL:	Invalid arguments or mm or virtual address is in an invalid vma
 *		(e.g., device file vma).
 * -ENOMEM:	Out of memory.
 * -EPERM:	Invalid permission (e.g., asking for write and range is read
 *		only).
 * -EBUSY:	The range has been invalidated and the caller needs to wait for
 *		the invalidation to finish.
 * -EFAULT:	A page was requested to be valid and could not be made valid,
 *		i.e. it has no backing VMA or it is illegal to access.
 *
 * This is similar to get_user_pages(), except that it can read the page tables
 * without mutating them (i.e. causing faults).
 */
int hmm_range_fault(struct hmm_range *range)
{
	struct hmm_vma_walk hmm_vma_walk = {
		.range = range,
		.last = range->start,
	};
	struct mm_struct *mm = range->notifier->mm;
	int ret;

	mmap_assert_locked(mm);

	do {
		/* If the range is no longer valid, force retry. */
		if (mmu_interval_check_retry(range->notifier,
					     range->notifier_seq))
			return -EBUSY;
		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
				      &hmm_walk_ops, &hmm_vma_walk);
		/*
		 * When -EBUSY is returned the loop restarts with
		 * hmm_vma_walk.last set to an address that has not been stored
		 * in pfns. All entries < last in the pfn array are set to their
		 * output, and all >= are still at their input values.
		 */
	} while (ret == -EBUSY);
	return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
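
/*
 * A minimal usage sketch of hmm_range_fault(), adapted from the pattern
 * in Documentation/mm/hmm.rst; the "driver_" names and the interval_sub,
 * mm and pfns variables are hypothetical caller state. The caller must
 * retry until the notifier sequence is still valid after the fault:
 *
 *	struct hmm_range range = {
 *		.notifier = &interval_sub,
 *		.start = start,
 *		.end = end,
 *		.hmm_pfns = pfns,
 *		.default_flags = HMM_PFN_REQ_FAULT,
 *	};
 *
 * again:
 *	range.notifier_seq = mmu_interval_read_begin(&interval_sub);
 *	mmap_read_lock(mm);
 *	ret = hmm_range_fault(&range);
 *	mmap_read_unlock(mm);
 *	if (ret == -EBUSY)
 *		goto again;
 *	if (ret)
 *		return ret;
 *	driver_lock_device_pagetable();
 *	if (mmu_interval_read_retry(&interval_sub, range.notifier_seq)) {
 *		driver_unlock_device_pagetable();
 *		goto again;
 *	}
 *	... program the device from pfns[] while holding the lock ...
 *	driver_unlock_device_pagetable();
 */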