// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

/*
 * We want to know the real level where an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded then
 * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{
	if (depth == 3 && PTRS_PER_PMD == 1)
		depth = 2;
	if (depth == 2 && PTRS_PER_PUD == 1)
		depth = 1;
	if (depth == 1 && PTRS_PER_P4D == 1)
		depth = 0;
	return depth;
}

static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	for (;;) {
		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		if (addr >= end - PAGE_SIZE)
			break;
		addr += PAGE_SIZE;
		pte++;
	}
	return err;
}

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;
	spinlock_t *ptl;

	if (walk->no_vma) {
		pte = pte_offset_map(pmd, addr);
		err = walk_pte_range_inner(pte, addr, end, walk);
		pte_unmap(pte);
	} else {
		pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
		err = walk_pte_range_inner(pte, addr, end, walk);
		pte_unmap_unlock(pte, ptl);
	}

	return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(3);

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to
		 */
		if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pte_entry))
			continue;

		if (walk->vma) {
			split_huge_pmd(walk->vma, pmd, addr);
			if (pmd_trans_unstable(pmd))
				goto again;
		}

		err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(2);

	pud = pud_offset(p4d, addr);
	do {
 again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		if (ops->pud_entry)
			err = ops->pud_entry(pud, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pmd_entry || ops->pte_entry))
			continue;
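		/*
		 * Below: the walk is about to descend past this PUD, so any
		 * huge PUD must be split first (when a vma is available); the
		 * entry is then re-checked, since the split may have left it
		 * none.
		 */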

		if (walk->vma)
			split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(1);

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}
		if (ops->p4d_entry) {
			err = ops->p4d_entry(p4d, addr, next, walk);
			if (err)
				break;
		}
		if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	if (walk->pgd)
		pgd = walk->pgd + pgd_index(addr);
	else
		pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, 0, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pgd_entry) {
			err = ops->pgd_entry(pgd, addr, next, walk);
			if (err)
				break;
		}
		if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
		    ops->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask, sz);

		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, -1, walk);

		if (err)
			break;
	} while (addr = next, addr != end);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
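 *
 * A minimal ->test_walk() sketch with these semantics (the callback name and
 * the flag it tests are illustrative only, not taken from this file):
 *
 *	static int skip_mlocked_test_walk(unsigned long start, unsigned long end,
 *					  struct mm_walk *walk)
 *	{
 *		// Skip mlocked vmas (return 1), walk everything else (return 0).
 *		if (walk->vma->vm_flags & VM_LOCKED)
 *			return 1;
 *		return 0;
 *	}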
 */
static int walk_page_test(unsigned long start, unsigned long end,
			  struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->test_walk)
		return ops->test_walk(start, end, walk);

	/*
	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
	 * range, so we don't walk over it as we do for normal vmas. However,
	 * some callers are interested in handling hole ranges and they don't
	 * want to just ignore any single address range. Such users certainly
	 * define their ->pte_hole() callbacks, so let's delegate them to handle
	 * vma(VM_PFNMAP).
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;
		if (ops->pte_hole)
			err = ops->pte_hole(start, end, -1, walk);
		return err ? err : 1;
	}
	return 0;
}

static int __walk_page_range(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->pre_vma) {
		err = ops->pre_vma(start, end, walk);
		if (err)
			return err;
	}

	if (is_vm_hugetlb_page(vma)) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	if (ops->post_vma)
		ops->post_vma(walk);

	return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). While walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry().
 * If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined like below:
 *
 *  - 0  : the current entry was handled successfully; if the end address has
 *         not been reached yet, the walk continues.
 *  - >0 : the current entry was handled successfully, and the walk returns to
 *         the caller with this caller-specific value.
 *  - <0 : handling the current entry failed, and the walk returns to the
 *         caller with this error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from callbacks. If you want to pass some
 * caller-specific data to callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access vma data.
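 *
 * A minimal usage sketch (the ops structure, callback and variable names
 * below are illustrative only and not part of this file):
 *
 *	static int count_pte_entry(pte_t *pte, unsigned long addr,
 *				   unsigned long next, struct mm_walk *walk)
 *	{
 *		unsigned long *nr_present = walk->private;
 *
 *		if (pte_present(*pte))
 *			(*nr_present)++;
 *		return 0;
 *	}
 *
 *	static const struct mm_walk_ops count_walk_ops = {
 *		.pte_entry = count_pte_entry,
 *	};
 *
 *	unsigned long nr_present = 0;
 *
 *	mmap_read_lock(mm);
 *	err = walk_page_range(mm, start, end, &count_walk_ops, &nr_present);
 *	mmap_read_unlock(mm);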
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.private	= private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
			if (ops->pte_hole)
				err = ops->pte_hole(start, next, -1, &walk);
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
			if (ops->pte_hole)
				err = ops->pte_hole(start, next, -1, &walk);
		} else { /* inside vma */
			walk.vma = vma;
			next = min(end, vma->vm_end);
			vma = vma->vm_next;

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
			err = __walk_page_range(start, next, &walk);
		}
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}

/*
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking the kernel page tables or page tables for firmware.
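 *
 * Note that, as enforced by mmap_assert_write_locked() below, @mm->mmap_lock
 * must be held for writing. A minimal call sketch (ops and private names are
 * illustrative only; a NULL @pgd means the walk starts from @mm's own pgd):
 *
 *	mmap_write_lock(mm);
 *	err = walk_page_range_novma(mm, start, end, &my_walk_ops, NULL, priv);
 *	mmap_write_unlock(mm);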
 */
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
			  pgd_t *pgd,
			  void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.pgd		= pgd,
		.private	= private,
		.no_vma		= true
	};

	if (start >= end || !walk.mm)
		return -EINVAL;

	mmap_assert_write_locked(walk.mm);

	return walk_pgd_range(start, end, &walk);
}

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= vma->vm_mm,
		.vma		= vma,
		.private	= private,
	};
	int err;

	if (!walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);

	err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
	if (err > 0)
		return 0;
	if (err < 0)
		return err;
	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
 *
 * Also this means that a caller can't rely on the struct
 * vm_area_struct::vm_flags to be constant across a call,
 * except for immutable flags. Callers requiring this shouldn't use
 * this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.private	= private,
	};
	struct vm_area_struct *vma;
	pgoff_t vba, vea, cba, cea;
	unsigned long start_addr, end_addr;
	int err = 0;

	lockdep_assert_held(&mapping->i_mmap_rwsem);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
				  first_index + nr - 1) {
		/* Clip to the vma */
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma);
		cba = first_index;
		cba = max(cba, vba);
		cea = first_index + nr;
		cea = min(cea, vea);

		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
		if (start_addr >= end_addr)
			continue;

		walk.vma = vma;
		walk.mm = vma->vm_mm;

		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
		if (err > 0) {
			err = 0;
			break;
		} else if (err < 0)
			break;

		err = __walk_page_range(start_addr, end_addr, &walk);
		if (err)
			break;
	}

	return err;
}
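
/*
 * Example (not part of the original file): a rough sketch of calling
 * walk_page_mapping() over a file range; "my_walk_ops", "first_index",
 * "nr_pages" and "my_private" are illustrative names only. The caller must
 * hold @mapping->i_mmap_rwsem as documented above; whether a read or write
 * hold is sufficient depends on what the callbacks do.
 *
 *	i_mmap_lock_read(mapping);
 *	err = walk_page_mapping(mapping, first_index, nr_pages,
 *				&my_walk_ops, my_private);
 *	i_mmap_unlock_read(mapping);
 */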