18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
28c2ecf20Sopenharmony_ci#include <linux/pagewalk.h>
38c2ecf20Sopenharmony_ci#include <linux/highmem.h>
48c2ecf20Sopenharmony_ci#include <linux/sched.h>
58c2ecf20Sopenharmony_ci#include <linux/hugetlb.h>
68c2ecf20Sopenharmony_ci
78c2ecf20Sopenharmony_ci/*
88c2ecf20Sopenharmony_ci * We want to know the real level where a entry is located ignoring any
98c2ecf20Sopenharmony_ci * folding of levels which may be happening. For example if p4d is folded then
108c2ecf20Sopenharmony_ci * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
118c2ecf20Sopenharmony_ci */
128c2ecf20Sopenharmony_cistatic int real_depth(int depth)
138c2ecf20Sopenharmony_ci{
148c2ecf20Sopenharmony_ci	if (depth == 3 && PTRS_PER_PMD == 1)
158c2ecf20Sopenharmony_ci		depth = 2;
168c2ecf20Sopenharmony_ci	if (depth == 2 && PTRS_PER_PUD == 1)
178c2ecf20Sopenharmony_ci		depth = 1;
188c2ecf20Sopenharmony_ci	if (depth == 1 && PTRS_PER_P4D == 1)
198c2ecf20Sopenharmony_ci		depth = 0;
208c2ecf20Sopenharmony_ci	return depth;
218c2ecf20Sopenharmony_ci}
228c2ecf20Sopenharmony_ci
238c2ecf20Sopenharmony_cistatic int walk_pte_range_inner(pte_t *pte, unsigned long addr,
248c2ecf20Sopenharmony_ci				unsigned long end, struct mm_walk *walk)
258c2ecf20Sopenharmony_ci{
268c2ecf20Sopenharmony_ci	const struct mm_walk_ops *ops = walk->ops;
278c2ecf20Sopenharmony_ci	int err = 0;
288c2ecf20Sopenharmony_ci
298c2ecf20Sopenharmony_ci	for (;;) {
308c2ecf20Sopenharmony_ci		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
318c2ecf20Sopenharmony_ci		if (err)
328c2ecf20Sopenharmony_ci		       break;
338c2ecf20Sopenharmony_ci		if (addr >= end - PAGE_SIZE)
348c2ecf20Sopenharmony_ci			break;
358c2ecf20Sopenharmony_ci		addr += PAGE_SIZE;
368c2ecf20Sopenharmony_ci		pte++;
378c2ecf20Sopenharmony_ci	}
388c2ecf20Sopenharmony_ci	return err;
398c2ecf20Sopenharmony_ci}
408c2ecf20Sopenharmony_ci
418c2ecf20Sopenharmony_cistatic int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
428c2ecf20Sopenharmony_ci			  struct mm_walk *walk)
438c2ecf20Sopenharmony_ci{
448c2ecf20Sopenharmony_ci	pte_t *pte;
458c2ecf20Sopenharmony_ci	int err = 0;
468c2ecf20Sopenharmony_ci	spinlock_t *ptl;
478c2ecf20Sopenharmony_ci
488c2ecf20Sopenharmony_ci	if (walk->no_vma) {
498c2ecf20Sopenharmony_ci		pte = pte_offset_map(pmd, addr);
508c2ecf20Sopenharmony_ci		err = walk_pte_range_inner(pte, addr, end, walk);
518c2ecf20Sopenharmony_ci		pte_unmap(pte);
528c2ecf20Sopenharmony_ci	} else {
538c2ecf20Sopenharmony_ci		pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
548c2ecf20Sopenharmony_ci		err = walk_pte_range_inner(pte, addr, end, walk);
558c2ecf20Sopenharmony_ci		pte_unmap_unlock(pte, ptl);
568c2ecf20Sopenharmony_ci	}
578c2ecf20Sopenharmony_ci
588c2ecf20Sopenharmony_ci	return err;
598c2ecf20Sopenharmony_ci}
608c2ecf20Sopenharmony_ci
618c2ecf20Sopenharmony_cistatic int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
628c2ecf20Sopenharmony_ci			  struct mm_walk *walk)
638c2ecf20Sopenharmony_ci{
648c2ecf20Sopenharmony_ci	pmd_t *pmd;
658c2ecf20Sopenharmony_ci	unsigned long next;
668c2ecf20Sopenharmony_ci	const struct mm_walk_ops *ops = walk->ops;
678c2ecf20Sopenharmony_ci	int err = 0;
688c2ecf20Sopenharmony_ci	int depth = real_depth(3);
698c2ecf20Sopenharmony_ci
708c2ecf20Sopenharmony_ci	pmd = pmd_offset(pud, addr);
718c2ecf20Sopenharmony_ci	do {
728c2ecf20Sopenharmony_ciagain:
738c2ecf20Sopenharmony_ci		next = pmd_addr_end(addr, end);
748c2ecf20Sopenharmony_ci		if (pmd_none(*pmd)) {
758c2ecf20Sopenharmony_ci			if (ops->pte_hole)
768c2ecf20Sopenharmony_ci				err = ops->pte_hole(addr, next, depth, walk);
778c2ecf20Sopenharmony_ci			if (err)
788c2ecf20Sopenharmony_ci				break;
798c2ecf20Sopenharmony_ci			continue;
808c2ecf20Sopenharmony_ci		}
818c2ecf20Sopenharmony_ci
828c2ecf20Sopenharmony_ci		walk->action = ACTION_SUBTREE;
838c2ecf20Sopenharmony_ci
848c2ecf20Sopenharmony_ci		/*
858c2ecf20Sopenharmony_ci		 * This implies that each ->pmd_entry() handler
868c2ecf20Sopenharmony_ci		 * needs to know about pmd_trans_huge() pmds
878c2ecf20Sopenharmony_ci		 */
888c2ecf20Sopenharmony_ci		if (ops->pmd_entry)
898c2ecf20Sopenharmony_ci			err = ops->pmd_entry(pmd, addr, next, walk);
908c2ecf20Sopenharmony_ci		if (err)
918c2ecf20Sopenharmony_ci			break;
928c2ecf20Sopenharmony_ci
938c2ecf20Sopenharmony_ci		if (walk->action == ACTION_AGAIN)
948c2ecf20Sopenharmony_ci			goto again;
958c2ecf20Sopenharmony_ci
968c2ecf20Sopenharmony_ci		/*
978c2ecf20Sopenharmony_ci		 * Check this here so we only break down trans_huge
988c2ecf20Sopenharmony_ci		 * pages when we _need_ to
998c2ecf20Sopenharmony_ci		 */
1008c2ecf20Sopenharmony_ci		if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
1018c2ecf20Sopenharmony_ci		    walk->action == ACTION_CONTINUE ||
1028c2ecf20Sopenharmony_ci		    !(ops->pte_entry))
1038c2ecf20Sopenharmony_ci			continue;
1048c2ecf20Sopenharmony_ci
1058c2ecf20Sopenharmony_ci		if (walk->vma) {
1068c2ecf20Sopenharmony_ci			split_huge_pmd(walk->vma, pmd, addr);
1078c2ecf20Sopenharmony_ci			if (pmd_trans_unstable(pmd))
1088c2ecf20Sopenharmony_ci				goto again;
1098c2ecf20Sopenharmony_ci		}
1108c2ecf20Sopenharmony_ci
1118c2ecf20Sopenharmony_ci		err = walk_pte_range(pmd, addr, next, walk);
1128c2ecf20Sopenharmony_ci		if (err)
1138c2ecf20Sopenharmony_ci			break;
1148c2ecf20Sopenharmony_ci	} while (pmd++, addr = next, addr != end);
1158c2ecf20Sopenharmony_ci
1168c2ecf20Sopenharmony_ci	return err;
1178c2ecf20Sopenharmony_ci}
1188c2ecf20Sopenharmony_ci
1198c2ecf20Sopenharmony_cistatic int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
1208c2ecf20Sopenharmony_ci			  struct mm_walk *walk)
1218c2ecf20Sopenharmony_ci{
1228c2ecf20Sopenharmony_ci	pud_t *pud;
1238c2ecf20Sopenharmony_ci	unsigned long next;
1248c2ecf20Sopenharmony_ci	const struct mm_walk_ops *ops = walk->ops;
1258c2ecf20Sopenharmony_ci	int err = 0;
1268c2ecf20Sopenharmony_ci	int depth = real_depth(2);
1278c2ecf20Sopenharmony_ci
1288c2ecf20Sopenharmony_ci	pud = pud_offset(p4d, addr);
1298c2ecf20Sopenharmony_ci	do {
1308c2ecf20Sopenharmony_ci again:
1318c2ecf20Sopenharmony_ci		next = pud_addr_end(addr, end);
1328c2ecf20Sopenharmony_ci		if (pud_none(*pud)) {
1338c2ecf20Sopenharmony_ci			if (ops->pte_hole)
1348c2ecf20Sopenharmony_ci				err = ops->pte_hole(addr, next, depth, walk);
1358c2ecf20Sopenharmony_ci			if (err)
1368c2ecf20Sopenharmony_ci				break;
1378c2ecf20Sopenharmony_ci			continue;
1388c2ecf20Sopenharmony_ci		}
1398c2ecf20Sopenharmony_ci
1408c2ecf20Sopenharmony_ci		walk->action = ACTION_SUBTREE;
1418c2ecf20Sopenharmony_ci
1428c2ecf20Sopenharmony_ci		if (ops->pud_entry)
1438c2ecf20Sopenharmony_ci			err = ops->pud_entry(pud, addr, next, walk);
1448c2ecf20Sopenharmony_ci		if (err)
1458c2ecf20Sopenharmony_ci			break;
1468c2ecf20Sopenharmony_ci
1478c2ecf20Sopenharmony_ci		if (walk->action == ACTION_AGAIN)
1488c2ecf20Sopenharmony_ci			goto again;
1498c2ecf20Sopenharmony_ci
1508c2ecf20Sopenharmony_ci		if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
1518c2ecf20Sopenharmony_ci		    walk->action == ACTION_CONTINUE ||
1528c2ecf20Sopenharmony_ci		    !(ops->pmd_entry || ops->pte_entry))
1538c2ecf20Sopenharmony_ci			continue;
1548c2ecf20Sopenharmony_ci
1558c2ecf20Sopenharmony_ci		if (walk->vma)
1568c2ecf20Sopenharmony_ci			split_huge_pud(walk->vma, pud, addr);
1578c2ecf20Sopenharmony_ci		if (pud_none(*pud))
1588c2ecf20Sopenharmony_ci			goto again;
1598c2ecf20Sopenharmony_ci
1608c2ecf20Sopenharmony_ci		err = walk_pmd_range(pud, addr, next, walk);
1618c2ecf20Sopenharmony_ci		if (err)
1628c2ecf20Sopenharmony_ci			break;
1638c2ecf20Sopenharmony_ci	} while (pud++, addr = next, addr != end);
1648c2ecf20Sopenharmony_ci
1658c2ecf20Sopenharmony_ci	return err;
1668c2ecf20Sopenharmony_ci}
1678c2ecf20Sopenharmony_ci
1688c2ecf20Sopenharmony_cistatic int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
1698c2ecf20Sopenharmony_ci			  struct mm_walk *walk)
1708c2ecf20Sopenharmony_ci{
1718c2ecf20Sopenharmony_ci	p4d_t *p4d;
1728c2ecf20Sopenharmony_ci	unsigned long next;
1738c2ecf20Sopenharmony_ci	const struct mm_walk_ops *ops = walk->ops;
1748c2ecf20Sopenharmony_ci	int err = 0;
1758c2ecf20Sopenharmony_ci	int depth = real_depth(1);
1768c2ecf20Sopenharmony_ci
1778c2ecf20Sopenharmony_ci	p4d = p4d_offset(pgd, addr);
1788c2ecf20Sopenharmony_ci	do {
1798c2ecf20Sopenharmony_ci		next = p4d_addr_end(addr, end);
1808c2ecf20Sopenharmony_ci		if (p4d_none_or_clear_bad(p4d)) {
1818c2ecf20Sopenharmony_ci			if (ops->pte_hole)
1828c2ecf20Sopenharmony_ci				err = ops->pte_hole(addr, next, depth, walk);
1838c2ecf20Sopenharmony_ci			if (err)
1848c2ecf20Sopenharmony_ci				break;
1858c2ecf20Sopenharmony_ci			continue;
1868c2ecf20Sopenharmony_ci		}
1878c2ecf20Sopenharmony_ci		if (ops->p4d_entry) {
1888c2ecf20Sopenharmony_ci			err = ops->p4d_entry(p4d, addr, next, walk);
1898c2ecf20Sopenharmony_ci			if (err)
1908c2ecf20Sopenharmony_ci				break;
1918c2ecf20Sopenharmony_ci		}
1928c2ecf20Sopenharmony_ci		if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
1938c2ecf20Sopenharmony_ci			err = walk_pud_range(p4d, addr, next, walk);
1948c2ecf20Sopenharmony_ci		if (err)
1958c2ecf20Sopenharmony_ci			break;
1968c2ecf20Sopenharmony_ci	} while (p4d++, addr = next, addr != end);
1978c2ecf20Sopenharmony_ci
1988c2ecf20Sopenharmony_ci	return err;
1998c2ecf20Sopenharmony_ci}
2008c2ecf20Sopenharmony_ci
2018c2ecf20Sopenharmony_cistatic int walk_pgd_range(unsigned long addr, unsigned long end,
2028c2ecf20Sopenharmony_ci			  struct mm_walk *walk)
2038c2ecf20Sopenharmony_ci{
2048c2ecf20Sopenharmony_ci	pgd_t *pgd;
2058c2ecf20Sopenharmony_ci	unsigned long next;
2068c2ecf20Sopenharmony_ci	const struct mm_walk_ops *ops = walk->ops;
2078c2ecf20Sopenharmony_ci	int err = 0;
2088c2ecf20Sopenharmony_ci
2098c2ecf20Sopenharmony_ci	if (walk->pgd)
2108c2ecf20Sopenharmony_ci		pgd = walk->pgd + pgd_index(addr);
2118c2ecf20Sopenharmony_ci	else
2128c2ecf20Sopenharmony_ci		pgd = pgd_offset(walk->mm, addr);
2138c2ecf20Sopenharmony_ci	do {
2148c2ecf20Sopenharmony_ci		next = pgd_addr_end(addr, end);
2158c2ecf20Sopenharmony_ci		if (pgd_none_or_clear_bad(pgd)) {
2168c2ecf20Sopenharmony_ci			if (ops->pte_hole)
2178c2ecf20Sopenharmony_ci				err = ops->pte_hole(addr, next, 0, walk);
2188c2ecf20Sopenharmony_ci			if (err)
2198c2ecf20Sopenharmony_ci				break;
2208c2ecf20Sopenharmony_ci			continue;
2218c2ecf20Sopenharmony_ci		}
2228c2ecf20Sopenharmony_ci		if (ops->pgd_entry) {
2238c2ecf20Sopenharmony_ci			err = ops->pgd_entry(pgd, addr, next, walk);
2248c2ecf20Sopenharmony_ci			if (err)
2258c2ecf20Sopenharmony_ci				break;
2268c2ecf20Sopenharmony_ci		}
2278c2ecf20Sopenharmony_ci		if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
2288c2ecf20Sopenharmony_ci		    ops->pte_entry)
2298c2ecf20Sopenharmony_ci			err = walk_p4d_range(pgd, addr, next, walk);
2308c2ecf20Sopenharmony_ci		if (err)
2318c2ecf20Sopenharmony_ci			break;
2328c2ecf20Sopenharmony_ci	} while (pgd++, addr = next, addr != end);
2338c2ecf20Sopenharmony_ci
2348c2ecf20Sopenharmony_ci	return err;
2358c2ecf20Sopenharmony_ci}
2368c2ecf20Sopenharmony_ci
2378c2ecf20Sopenharmony_ci#ifdef CONFIG_HUGETLB_PAGE
/*
 * Return the end of the current step of a hugetlb walk: the start of the
 * huge page following the one that contains @addr, clamped to @end.
 */
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long next_hpage = (addr & huge_page_mask(h)) + huge_page_size(h);

	if (next_hpage < end)
		return next_hpage;
	return end;
}
2448c2ecf20Sopenharmony_ci
2458c2ecf20Sopenharmony_cistatic int walk_hugetlb_range(unsigned long addr, unsigned long end,
2468c2ecf20Sopenharmony_ci			      struct mm_walk *walk)
2478c2ecf20Sopenharmony_ci{
2488c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = walk->vma;
2498c2ecf20Sopenharmony_ci	struct hstate *h = hstate_vma(vma);
2508c2ecf20Sopenharmony_ci	unsigned long next;
2518c2ecf20Sopenharmony_ci	unsigned long hmask = huge_page_mask(h);
2528c2ecf20Sopenharmony_ci	unsigned long sz = huge_page_size(h);
2538c2ecf20Sopenharmony_ci	pte_t *pte;
2548c2ecf20Sopenharmony_ci	const struct mm_walk_ops *ops = walk->ops;
2558c2ecf20Sopenharmony_ci	int err = 0;
2568c2ecf20Sopenharmony_ci
2578c2ecf20Sopenharmony_ci	do {
2588c2ecf20Sopenharmony_ci		next = hugetlb_entry_end(h, addr, end);
2598c2ecf20Sopenharmony_ci		pte = huge_pte_offset(walk->mm, addr & hmask, sz);
2608c2ecf20Sopenharmony_ci
2618c2ecf20Sopenharmony_ci		if (pte)
2628c2ecf20Sopenharmony_ci			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
2638c2ecf20Sopenharmony_ci		else if (ops->pte_hole)
2648c2ecf20Sopenharmony_ci			err = ops->pte_hole(addr, next, -1, walk);
2658c2ecf20Sopenharmony_ci
2668c2ecf20Sopenharmony_ci		if (err)
2678c2ecf20Sopenharmony_ci			break;
2688c2ecf20Sopenharmony_ci	} while (addr = next, addr != end);
2698c2ecf20Sopenharmony_ci
2708c2ecf20Sopenharmony_ci	return err;
2718c2ecf20Sopenharmony_ci}
2728c2ecf20Sopenharmony_ci
2738c2ecf20Sopenharmony_ci#else /* CONFIG_HUGETLB_PAGE */
/* Stub used when CONFIG_HUGETLB_PAGE is not set: nothing to walk. */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}
2798c2ecf20Sopenharmony_ci
2808c2ecf20Sopenharmony_ci#endif /* CONFIG_HUGETLB_PAGE */
2818c2ecf20Sopenharmony_ci
2828c2ecf20Sopenharmony_ci/*
2838c2ecf20Sopenharmony_ci * Decide whether we really walk over the current vma on [@start, @end)
2848c2ecf20Sopenharmony_ci * or skip it via the returned value. Return 0 if we do walk over the
2858c2ecf20Sopenharmony_ci * current vma, and return 1 if we skip the vma. Negative values means
2868c2ecf20Sopenharmony_ci * error, where we abort the current walk.
2878c2ecf20Sopenharmony_ci */
2888c2ecf20Sopenharmony_cistatic int walk_page_test(unsigned long start, unsigned long end,
2898c2ecf20Sopenharmony_ci			struct mm_walk *walk)
2908c2ecf20Sopenharmony_ci{
2918c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = walk->vma;
2928c2ecf20Sopenharmony_ci	const struct mm_walk_ops *ops = walk->ops;
2938c2ecf20Sopenharmony_ci
2948c2ecf20Sopenharmony_ci	if (ops->test_walk)
2958c2ecf20Sopenharmony_ci		return ops->test_walk(start, end, walk);
2968c2ecf20Sopenharmony_ci
2978c2ecf20Sopenharmony_ci	/*
2988c2ecf20Sopenharmony_ci	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
2998c2ecf20Sopenharmony_ci	 * range, so we don't walk over it as we do for normal vmas. However,
3008c2ecf20Sopenharmony_ci	 * Some callers are interested in handling hole range and they don't
3018c2ecf20Sopenharmony_ci	 * want to just ignore any single address range. Such users certainly
3028c2ecf20Sopenharmony_ci	 * define their ->pte_hole() callbacks, so let's delegate them to handle
3038c2ecf20Sopenharmony_ci	 * vma(VM_PFNMAP).
3048c2ecf20Sopenharmony_ci	 */
3058c2ecf20Sopenharmony_ci	if (vma->vm_flags & VM_PFNMAP) {
3068c2ecf20Sopenharmony_ci		int err = 1;
3078c2ecf20Sopenharmony_ci		if (ops->pte_hole)
3088c2ecf20Sopenharmony_ci			err = ops->pte_hole(start, end, -1, walk);
3098c2ecf20Sopenharmony_ci		return err ? err : 1;
3108c2ecf20Sopenharmony_ci	}
3118c2ecf20Sopenharmony_ci	return 0;
3128c2ecf20Sopenharmony_ci}
3138c2ecf20Sopenharmony_ci
3148c2ecf20Sopenharmony_cistatic int __walk_page_range(unsigned long start, unsigned long end,
3158c2ecf20Sopenharmony_ci			struct mm_walk *walk)
3168c2ecf20Sopenharmony_ci{
3178c2ecf20Sopenharmony_ci	int err = 0;
3188c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = walk->vma;
3198c2ecf20Sopenharmony_ci	const struct mm_walk_ops *ops = walk->ops;
3208c2ecf20Sopenharmony_ci
3218c2ecf20Sopenharmony_ci	if (ops->pre_vma) {
3228c2ecf20Sopenharmony_ci		err = ops->pre_vma(start, end, walk);
3238c2ecf20Sopenharmony_ci		if (err)
3248c2ecf20Sopenharmony_ci			return err;
3258c2ecf20Sopenharmony_ci	}
3268c2ecf20Sopenharmony_ci
3278c2ecf20Sopenharmony_ci	if (is_vm_hugetlb_page(vma)) {
3288c2ecf20Sopenharmony_ci		if (ops->hugetlb_entry)
3298c2ecf20Sopenharmony_ci			err = walk_hugetlb_range(start, end, walk);
3308c2ecf20Sopenharmony_ci	} else
3318c2ecf20Sopenharmony_ci		err = walk_pgd_range(start, end, walk);
3328c2ecf20Sopenharmony_ci
3338c2ecf20Sopenharmony_ci	if (ops->post_vma)
3348c2ecf20Sopenharmony_ci		ops->post_vma(walk);
3358c2ecf20Sopenharmony_ci
3368c2ecf20Sopenharmony_ci	return err;
3378c2ecf20Sopenharmony_ci}
3388c2ecf20Sopenharmony_ci
3398c2ecf20Sopenharmony_ci/**
3408c2ecf20Sopenharmony_ci * walk_page_range - walk page table with caller specific callbacks
3418c2ecf20Sopenharmony_ci * @mm:		mm_struct representing the target process of page table walk
3428c2ecf20Sopenharmony_ci * @start:	start address of the virtual address range
3438c2ecf20Sopenharmony_ci * @end:	end address of the virtual address range
3448c2ecf20Sopenharmony_ci * @ops:	operation to call during the walk
3458c2ecf20Sopenharmony_ci * @private:	private data for callbacks' usage
3468c2ecf20Sopenharmony_ci *
3478c2ecf20Sopenharmony_ci * Recursively walk the page table tree of the process represented by @mm
3488c2ecf20Sopenharmony_ci * within the virtual address range [@start, @end). During walking, we can do
3498c2ecf20Sopenharmony_ci * some caller-specific works for each entry, by setting up pmd_entry(),
3508c2ecf20Sopenharmony_ci * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
3518c2ecf20Sopenharmony_ci * callbacks, the associated entries/pages are just ignored.
3528c2ecf20Sopenharmony_ci * The return values of these callbacks are commonly defined like below:
3538c2ecf20Sopenharmony_ci *
3548c2ecf20Sopenharmony_ci *  - 0  : succeeded to handle the current entry, and if you don't reach the
3558c2ecf20Sopenharmony_ci *         end address yet, continue to walk.
3568c2ecf20Sopenharmony_ci *  - >0 : succeeded to handle the current entry, and return to the caller
3578c2ecf20Sopenharmony_ci *         with caller specific value.
3588c2ecf20Sopenharmony_ci *  - <0 : failed to handle the current entry, and return to the caller
3598c2ecf20Sopenharmony_ci *         with error code.
3608c2ecf20Sopenharmony_ci *
3618c2ecf20Sopenharmony_ci * Before starting to walk page table, some callers want to check whether
3628c2ecf20Sopenharmony_ci * they really want to walk over the current vma, typically by checking
3638c2ecf20Sopenharmony_ci * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
3648c2ecf20Sopenharmony_ci * purpose.
3658c2ecf20Sopenharmony_ci *
3668c2ecf20Sopenharmony_ci * If operations need to be staged before and committed after a vma is walked,
3678c2ecf20Sopenharmony_ci * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
3688c2ecf20Sopenharmony_ci * since it is intended to handle commit-type operations, can't return any
3698c2ecf20Sopenharmony_ci * errors.
3708c2ecf20Sopenharmony_ci *
3718c2ecf20Sopenharmony_ci * struct mm_walk keeps current values of some common data like vma and pmd,
3728c2ecf20Sopenharmony_ci * which are useful for the access from callbacks. If you want to pass some
3738c2ecf20Sopenharmony_ci * caller-specific data to callbacks, @private should be helpful.
3748c2ecf20Sopenharmony_ci *
3758c2ecf20Sopenharmony_ci * Locking:
3768c2ecf20Sopenharmony_ci *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access vma data.
3788c2ecf20Sopenharmony_ci */
3798c2ecf20Sopenharmony_ciint walk_page_range(struct mm_struct *mm, unsigned long start,
3808c2ecf20Sopenharmony_ci		unsigned long end, const struct mm_walk_ops *ops,
3818c2ecf20Sopenharmony_ci		void *private)
3828c2ecf20Sopenharmony_ci{
3838c2ecf20Sopenharmony_ci	int err = 0;
3848c2ecf20Sopenharmony_ci	unsigned long next;
3858c2ecf20Sopenharmony_ci	struct vm_area_struct *vma;
3868c2ecf20Sopenharmony_ci	struct mm_walk walk = {
3878c2ecf20Sopenharmony_ci		.ops		= ops,
3888c2ecf20Sopenharmony_ci		.mm		= mm,
3898c2ecf20Sopenharmony_ci		.private	= private,
3908c2ecf20Sopenharmony_ci	};
3918c2ecf20Sopenharmony_ci
3928c2ecf20Sopenharmony_ci	if (start >= end)
3938c2ecf20Sopenharmony_ci		return -EINVAL;
3948c2ecf20Sopenharmony_ci
3958c2ecf20Sopenharmony_ci	if (!walk.mm)
3968c2ecf20Sopenharmony_ci		return -EINVAL;
3978c2ecf20Sopenharmony_ci
3988c2ecf20Sopenharmony_ci	mmap_assert_locked(walk.mm);
3998c2ecf20Sopenharmony_ci
4008c2ecf20Sopenharmony_ci	vma = find_vma(walk.mm, start);
4018c2ecf20Sopenharmony_ci	do {
4028c2ecf20Sopenharmony_ci		if (!vma) { /* after the last vma */
4038c2ecf20Sopenharmony_ci			walk.vma = NULL;
4048c2ecf20Sopenharmony_ci			next = end;
4058c2ecf20Sopenharmony_ci			if (ops->pte_hole)
4068c2ecf20Sopenharmony_ci				err = ops->pte_hole(start, next, -1, &walk);
4078c2ecf20Sopenharmony_ci		} else if (start < vma->vm_start) { /* outside vma */
4088c2ecf20Sopenharmony_ci			walk.vma = NULL;
4098c2ecf20Sopenharmony_ci			next = min(end, vma->vm_start);
4108c2ecf20Sopenharmony_ci			if (ops->pte_hole)
4118c2ecf20Sopenharmony_ci				err = ops->pte_hole(start, next, -1, &walk);
4128c2ecf20Sopenharmony_ci		} else { /* inside vma */
4138c2ecf20Sopenharmony_ci			walk.vma = vma;
4148c2ecf20Sopenharmony_ci			next = min(end, vma->vm_end);
4158c2ecf20Sopenharmony_ci			vma = vma->vm_next;
4168c2ecf20Sopenharmony_ci
4178c2ecf20Sopenharmony_ci			err = walk_page_test(start, next, &walk);
4188c2ecf20Sopenharmony_ci			if (err > 0) {
4198c2ecf20Sopenharmony_ci				/*
4208c2ecf20Sopenharmony_ci				 * positive return values are purely for
4218c2ecf20Sopenharmony_ci				 * controlling the pagewalk, so should never
4228c2ecf20Sopenharmony_ci				 * be passed to the callers.
4238c2ecf20Sopenharmony_ci				 */
4248c2ecf20Sopenharmony_ci				err = 0;
4258c2ecf20Sopenharmony_ci				continue;
4268c2ecf20Sopenharmony_ci			}
4278c2ecf20Sopenharmony_ci			if (err < 0)
4288c2ecf20Sopenharmony_ci				break;
4298c2ecf20Sopenharmony_ci			err = __walk_page_range(start, next, &walk);
4308c2ecf20Sopenharmony_ci		}
4318c2ecf20Sopenharmony_ci		if (err)
4328c2ecf20Sopenharmony_ci			break;
4338c2ecf20Sopenharmony_ci	} while (start = next, start < end);
4348c2ecf20Sopenharmony_ci	return err;
4358c2ecf20Sopenharmony_ci}
4368c2ecf20Sopenharmony_ci
4378c2ecf20Sopenharmony_ci/*
4388c2ecf20Sopenharmony_ci * Similar to walk_page_range() but can walk any page tables even if they are
4398c2ecf20Sopenharmony_ci * not backed by VMAs. Because 'unusual' entries may be walked this function
4408c2ecf20Sopenharmony_ci * will also not lock the PTEs for the pte_entry() callback. This is useful for
4418c2ecf20Sopenharmony_ci * walking the kernel pages tables or page tables for firmware.
4428c2ecf20Sopenharmony_ci */
4438c2ecf20Sopenharmony_ciint walk_page_range_novma(struct mm_struct *mm, unsigned long start,
4448c2ecf20Sopenharmony_ci			  unsigned long end, const struct mm_walk_ops *ops,
4458c2ecf20Sopenharmony_ci			  pgd_t *pgd,
4468c2ecf20Sopenharmony_ci			  void *private)
4478c2ecf20Sopenharmony_ci{
4488c2ecf20Sopenharmony_ci	struct mm_walk walk = {
4498c2ecf20Sopenharmony_ci		.ops		= ops,
4508c2ecf20Sopenharmony_ci		.mm		= mm,
4518c2ecf20Sopenharmony_ci		.pgd		= pgd,
4528c2ecf20Sopenharmony_ci		.private	= private,
4538c2ecf20Sopenharmony_ci		.no_vma		= true
4548c2ecf20Sopenharmony_ci	};
4558c2ecf20Sopenharmony_ci
4568c2ecf20Sopenharmony_ci	if (start >= end || !walk.mm)
4578c2ecf20Sopenharmony_ci		return -EINVAL;
4588c2ecf20Sopenharmony_ci
4598c2ecf20Sopenharmony_ci	mmap_assert_write_locked(walk.mm);
4608c2ecf20Sopenharmony_ci
4618c2ecf20Sopenharmony_ci	return walk_pgd_range(start, end, &walk);
4628c2ecf20Sopenharmony_ci}
4638c2ecf20Sopenharmony_ci
4648c2ecf20Sopenharmony_ciint walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
4658c2ecf20Sopenharmony_ci		void *private)
4668c2ecf20Sopenharmony_ci{
4678c2ecf20Sopenharmony_ci	struct mm_walk walk = {
4688c2ecf20Sopenharmony_ci		.ops		= ops,
4698c2ecf20Sopenharmony_ci		.mm		= vma->vm_mm,
4708c2ecf20Sopenharmony_ci		.vma		= vma,
4718c2ecf20Sopenharmony_ci		.private	= private,
4728c2ecf20Sopenharmony_ci	};
4738c2ecf20Sopenharmony_ci	int err;
4748c2ecf20Sopenharmony_ci
4758c2ecf20Sopenharmony_ci	if (!walk.mm)
4768c2ecf20Sopenharmony_ci		return -EINVAL;
4778c2ecf20Sopenharmony_ci
4788c2ecf20Sopenharmony_ci	mmap_assert_locked(walk.mm);
4798c2ecf20Sopenharmony_ci
4808c2ecf20Sopenharmony_ci	err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
4818c2ecf20Sopenharmony_ci	if (err > 0)
4828c2ecf20Sopenharmony_ci		return 0;
4838c2ecf20Sopenharmony_ci	if (err < 0)
4848c2ecf20Sopenharmony_ci		return err;
4858c2ecf20Sopenharmony_ci	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
4868c2ecf20Sopenharmony_ci}
4878c2ecf20Sopenharmony_ci
4888c2ecf20Sopenharmony_ci/**
4898c2ecf20Sopenharmony_ci * walk_page_mapping - walk all memory areas mapped into a struct address_space.
4908c2ecf20Sopenharmony_ci * @mapping: Pointer to the struct address_space
4918c2ecf20Sopenharmony_ci * @first_index: First page offset in the address_space
4928c2ecf20Sopenharmony_ci * @nr: Number of incremental page offsets to cover
4938c2ecf20Sopenharmony_ci * @ops:	operation to call during the walk
4948c2ecf20Sopenharmony_ci * @private:	private data for callbacks' usage
4958c2ecf20Sopenharmony_ci *
4968c2ecf20Sopenharmony_ci * This function walks all memory areas mapped into a struct address_space.
4978c2ecf20Sopenharmony_ci * The walk is limited to only the given page-size index range, but if
4988c2ecf20Sopenharmony_ci * the index boundaries cross a huge page-table entry, that entry will be
4998c2ecf20Sopenharmony_ci * included.
5008c2ecf20Sopenharmony_ci *
5018c2ecf20Sopenharmony_ci * Also see walk_page_range() for additional information.
5028c2ecf20Sopenharmony_ci *
5038c2ecf20Sopenharmony_ci * Locking:
5048c2ecf20Sopenharmony_ci *   This function can't require that the struct mm_struct::mmap_lock is held,
5058c2ecf20Sopenharmony_ci *   since @mapping may be mapped by multiple processes. Instead
5068c2ecf20Sopenharmony_ci *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
5088c2ecf20Sopenharmony_ci *   struct mm_struct::mmap_lock is not needed.
5098c2ecf20Sopenharmony_ci *
5108c2ecf20Sopenharmony_ci *   Also this means that a caller can't rely on the struct
5118c2ecf20Sopenharmony_ci *   vm_area_struct::vm_flags to be constant across a call,
5128c2ecf20Sopenharmony_ci *   except for immutable flags. Callers requiring this shouldn't use
5138c2ecf20Sopenharmony_ci *   this function.
5148c2ecf20Sopenharmony_ci *
5158c2ecf20Sopenharmony_ci * Return: 0 on success, negative error code on failure, positive number on
5168c2ecf20Sopenharmony_ci * caller defined premature termination.
5178c2ecf20Sopenharmony_ci */
5188c2ecf20Sopenharmony_ciint walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
5198c2ecf20Sopenharmony_ci		      pgoff_t nr, const struct mm_walk_ops *ops,
5208c2ecf20Sopenharmony_ci		      void *private)
5218c2ecf20Sopenharmony_ci{
5228c2ecf20Sopenharmony_ci	struct mm_walk walk = {
5238c2ecf20Sopenharmony_ci		.ops		= ops,
5248c2ecf20Sopenharmony_ci		.private	= private,
5258c2ecf20Sopenharmony_ci	};
5268c2ecf20Sopenharmony_ci	struct vm_area_struct *vma;
5278c2ecf20Sopenharmony_ci	pgoff_t vba, vea, cba, cea;
5288c2ecf20Sopenharmony_ci	unsigned long start_addr, end_addr;
5298c2ecf20Sopenharmony_ci	int err = 0;
5308c2ecf20Sopenharmony_ci
5318c2ecf20Sopenharmony_ci	lockdep_assert_held(&mapping->i_mmap_rwsem);
5328c2ecf20Sopenharmony_ci	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
5338c2ecf20Sopenharmony_ci				  first_index + nr - 1) {
5348c2ecf20Sopenharmony_ci		/* Clip to the vma */
5358c2ecf20Sopenharmony_ci		vba = vma->vm_pgoff;
5368c2ecf20Sopenharmony_ci		vea = vba + vma_pages(vma);
5378c2ecf20Sopenharmony_ci		cba = first_index;
5388c2ecf20Sopenharmony_ci		cba = max(cba, vba);
5398c2ecf20Sopenharmony_ci		cea = first_index + nr;
5408c2ecf20Sopenharmony_ci		cea = min(cea, vea);
5418c2ecf20Sopenharmony_ci
5428c2ecf20Sopenharmony_ci		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
5438c2ecf20Sopenharmony_ci		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
5448c2ecf20Sopenharmony_ci		if (start_addr >= end_addr)
5458c2ecf20Sopenharmony_ci			continue;
5468c2ecf20Sopenharmony_ci
5478c2ecf20Sopenharmony_ci		walk.vma = vma;
5488c2ecf20Sopenharmony_ci		walk.mm = vma->vm_mm;
5498c2ecf20Sopenharmony_ci
5508c2ecf20Sopenharmony_ci		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
5518c2ecf20Sopenharmony_ci		if (err > 0) {
5528c2ecf20Sopenharmony_ci			err = 0;
5538c2ecf20Sopenharmony_ci			break;
5548c2ecf20Sopenharmony_ci		} else if (err < 0)
5558c2ecf20Sopenharmony_ci			break;
5568c2ecf20Sopenharmony_ci
5578c2ecf20Sopenharmony_ci		err = __walk_page_range(start_addr, end_addr, &walk);
5588c2ecf20Sopenharmony_ci		if (err)
5598c2ecf20Sopenharmony_ci			break;
5608c2ecf20Sopenharmony_ci	}
5618c2ecf20Sopenharmony_ci
5628c2ecf20Sopenharmony_ci	return err;
5638c2ecf20Sopenharmony_ci}
564