18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * linux/mm/mincore.c 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Copyright (C) 1994-2006 Linus Torvalds 68c2ecf20Sopenharmony_ci */ 78c2ecf20Sopenharmony_ci 88c2ecf20Sopenharmony_ci/* 98c2ecf20Sopenharmony_ci * The mincore() system call. 108c2ecf20Sopenharmony_ci */ 118c2ecf20Sopenharmony_ci#include <linux/pagemap.h> 128c2ecf20Sopenharmony_ci#include <linux/gfp.h> 138c2ecf20Sopenharmony_ci#include <linux/pagewalk.h> 148c2ecf20Sopenharmony_ci#include <linux/mman.h> 158c2ecf20Sopenharmony_ci#include <linux/syscalls.h> 168c2ecf20Sopenharmony_ci#include <linux/swap.h> 178c2ecf20Sopenharmony_ci#include <linux/swapops.h> 188c2ecf20Sopenharmony_ci#include <linux/shmem_fs.h> 198c2ecf20Sopenharmony_ci#include <linux/hugetlb.h> 208c2ecf20Sopenharmony_ci#include <linux/pgtable.h> 218c2ecf20Sopenharmony_ci 228c2ecf20Sopenharmony_ci#include <linux/uaccess.h> 238c2ecf20Sopenharmony_ci 248c2ecf20Sopenharmony_cistatic int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, 258c2ecf20Sopenharmony_ci unsigned long end, struct mm_walk *walk) 268c2ecf20Sopenharmony_ci{ 278c2ecf20Sopenharmony_ci#ifdef CONFIG_HUGETLB_PAGE 288c2ecf20Sopenharmony_ci unsigned char present; 298c2ecf20Sopenharmony_ci unsigned char *vec = walk->private; 308c2ecf20Sopenharmony_ci 318c2ecf20Sopenharmony_ci /* 328c2ecf20Sopenharmony_ci * Hugepages under user process are always in RAM and never 338c2ecf20Sopenharmony_ci * swapped out, but theoretically it needs to be checked. 348c2ecf20Sopenharmony_ci */ 358c2ecf20Sopenharmony_ci present = pte && !huge_pte_none(huge_ptep_get(pte)); 368c2ecf20Sopenharmony_ci for (; addr != end; vec++, addr += PAGE_SIZE) 378c2ecf20Sopenharmony_ci *vec = present; 388c2ecf20Sopenharmony_ci walk->private = vec; 398c2ecf20Sopenharmony_ci#else 408c2ecf20Sopenharmony_ci BUG(); 418c2ecf20Sopenharmony_ci#endif 428c2ecf20Sopenharmony_ci return 0; 438c2ecf20Sopenharmony_ci} 448c2ecf20Sopenharmony_ci 458c2ecf20Sopenharmony_ci/* 468c2ecf20Sopenharmony_ci * Later we can get more picky about what "in core" means precisely. 478c2ecf20Sopenharmony_ci * For now, simply check to see if the page is in the page cache, 488c2ecf20Sopenharmony_ci * and is up to date; i.e. that no page-in operation would be required 498c2ecf20Sopenharmony_ci * at this time if an application were to map and access this page. 508c2ecf20Sopenharmony_ci */ 518c2ecf20Sopenharmony_cistatic unsigned char mincore_page(struct address_space *mapping, pgoff_t index) 528c2ecf20Sopenharmony_ci{ 538c2ecf20Sopenharmony_ci unsigned char present = 0; 548c2ecf20Sopenharmony_ci struct page *page; 558c2ecf20Sopenharmony_ci 568c2ecf20Sopenharmony_ci /* 578c2ecf20Sopenharmony_ci * When tmpfs swaps out a page from a file, any process mapping that 588c2ecf20Sopenharmony_ci * file will not get a swp_entry_t in its pte, but rather it is like 598c2ecf20Sopenharmony_ci * any other file mapping (ie. marked !present and faulted in with 608c2ecf20Sopenharmony_ci * tmpfs's .fault). So swapped out tmpfs mappings are tested here. 618c2ecf20Sopenharmony_ci */ 628c2ecf20Sopenharmony_ci page = find_get_incore_page(mapping, index); 638c2ecf20Sopenharmony_ci if (page) { 648c2ecf20Sopenharmony_ci present = PageUptodate(page); 658c2ecf20Sopenharmony_ci put_page(page); 668c2ecf20Sopenharmony_ci } 678c2ecf20Sopenharmony_ci 688c2ecf20Sopenharmony_ci return present; 698c2ecf20Sopenharmony_ci} 708c2ecf20Sopenharmony_ci 718c2ecf20Sopenharmony_cistatic int __mincore_unmapped_range(unsigned long addr, unsigned long end, 728c2ecf20Sopenharmony_ci struct vm_area_struct *vma, unsigned char *vec) 738c2ecf20Sopenharmony_ci{ 748c2ecf20Sopenharmony_ci unsigned long nr = (end - addr) >> PAGE_SHIFT; 758c2ecf20Sopenharmony_ci int i; 768c2ecf20Sopenharmony_ci 778c2ecf20Sopenharmony_ci if (vma->vm_file) { 788c2ecf20Sopenharmony_ci pgoff_t pgoff; 798c2ecf20Sopenharmony_ci 808c2ecf20Sopenharmony_ci pgoff = linear_page_index(vma, addr); 818c2ecf20Sopenharmony_ci for (i = 0; i < nr; i++, pgoff++) 828c2ecf20Sopenharmony_ci vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); 838c2ecf20Sopenharmony_ci } else { 848c2ecf20Sopenharmony_ci for (i = 0; i < nr; i++) 858c2ecf20Sopenharmony_ci vec[i] = 0; 868c2ecf20Sopenharmony_ci } 878c2ecf20Sopenharmony_ci return nr; 888c2ecf20Sopenharmony_ci} 898c2ecf20Sopenharmony_ci 908c2ecf20Sopenharmony_cistatic int mincore_unmapped_range(unsigned long addr, unsigned long end, 918c2ecf20Sopenharmony_ci __always_unused int depth, 928c2ecf20Sopenharmony_ci struct mm_walk *walk) 938c2ecf20Sopenharmony_ci{ 948c2ecf20Sopenharmony_ci walk->private += __mincore_unmapped_range(addr, end, 958c2ecf20Sopenharmony_ci walk->vma, walk->private); 968c2ecf20Sopenharmony_ci return 0; 978c2ecf20Sopenharmony_ci} 988c2ecf20Sopenharmony_ci 998c2ecf20Sopenharmony_cistatic int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 1008c2ecf20Sopenharmony_ci struct mm_walk *walk) 1018c2ecf20Sopenharmony_ci{ 1028c2ecf20Sopenharmony_ci spinlock_t *ptl; 1038c2ecf20Sopenharmony_ci struct vm_area_struct *vma = walk->vma; 1048c2ecf20Sopenharmony_ci pte_t *ptep; 1058c2ecf20Sopenharmony_ci unsigned char *vec = walk->private; 1068c2ecf20Sopenharmony_ci int nr = (end - addr) >> PAGE_SHIFT; 1078c2ecf20Sopenharmony_ci 1088c2ecf20Sopenharmony_ci ptl = pmd_trans_huge_lock(pmd, vma); 1098c2ecf20Sopenharmony_ci if (ptl) { 1108c2ecf20Sopenharmony_ci memset(vec, 1, nr); 1118c2ecf20Sopenharmony_ci spin_unlock(ptl); 1128c2ecf20Sopenharmony_ci goto out; 1138c2ecf20Sopenharmony_ci } 1148c2ecf20Sopenharmony_ci 1158c2ecf20Sopenharmony_ci if (pmd_trans_unstable(pmd)) { 1168c2ecf20Sopenharmony_ci __mincore_unmapped_range(addr, end, vma, vec); 1178c2ecf20Sopenharmony_ci goto out; 1188c2ecf20Sopenharmony_ci } 1198c2ecf20Sopenharmony_ci 1208c2ecf20Sopenharmony_ci ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 1218c2ecf20Sopenharmony_ci for (; addr != end; ptep++, addr += PAGE_SIZE) { 1228c2ecf20Sopenharmony_ci pte_t pte = *ptep; 1238c2ecf20Sopenharmony_ci 1248c2ecf20Sopenharmony_ci if (pte_none(pte)) 1258c2ecf20Sopenharmony_ci __mincore_unmapped_range(addr, addr + PAGE_SIZE, 1268c2ecf20Sopenharmony_ci vma, vec); 1278c2ecf20Sopenharmony_ci else if (pte_present(pte)) 1288c2ecf20Sopenharmony_ci *vec = 1; 1298c2ecf20Sopenharmony_ci else { /* pte is a swap entry */ 1308c2ecf20Sopenharmony_ci swp_entry_t entry = pte_to_swp_entry(pte); 1318c2ecf20Sopenharmony_ci 1328c2ecf20Sopenharmony_ci if (non_swap_entry(entry)) { 1338c2ecf20Sopenharmony_ci /* 1348c2ecf20Sopenharmony_ci * migration or hwpoison entries are always 1358c2ecf20Sopenharmony_ci * uptodate 1368c2ecf20Sopenharmony_ci */ 1378c2ecf20Sopenharmony_ci *vec = 1; 1388c2ecf20Sopenharmony_ci } else { 1398c2ecf20Sopenharmony_ci#ifdef CONFIG_SWAP 1408c2ecf20Sopenharmony_ci *vec = mincore_page(swap_address_space(entry), 1418c2ecf20Sopenharmony_ci swp_offset(entry)); 1428c2ecf20Sopenharmony_ci#else 1438c2ecf20Sopenharmony_ci WARN_ON(1); 1448c2ecf20Sopenharmony_ci *vec = 1; 1458c2ecf20Sopenharmony_ci#endif 1468c2ecf20Sopenharmony_ci } 1478c2ecf20Sopenharmony_ci } 1488c2ecf20Sopenharmony_ci vec++; 1498c2ecf20Sopenharmony_ci } 1508c2ecf20Sopenharmony_ci pte_unmap_unlock(ptep - 1, ptl); 1518c2ecf20Sopenharmony_ciout: 1528c2ecf20Sopenharmony_ci walk->private += nr; 1538c2ecf20Sopenharmony_ci cond_resched(); 1548c2ecf20Sopenharmony_ci return 0; 1558c2ecf20Sopenharmony_ci} 1568c2ecf20Sopenharmony_ci 1578c2ecf20Sopenharmony_cistatic inline bool can_do_mincore(struct vm_area_struct *vma) 1588c2ecf20Sopenharmony_ci{ 1598c2ecf20Sopenharmony_ci if (vma_is_anonymous(vma)) 1608c2ecf20Sopenharmony_ci return true; 1618c2ecf20Sopenharmony_ci if (!vma->vm_file) 1628c2ecf20Sopenharmony_ci return false; 1638c2ecf20Sopenharmony_ci /* 1648c2ecf20Sopenharmony_ci * Reveal pagecache information only for non-anonymous mappings that 1658c2ecf20Sopenharmony_ci * correspond to the files the calling process could (if tried) open 1668c2ecf20Sopenharmony_ci * for writing; otherwise we'd be including shared non-exclusive 1678c2ecf20Sopenharmony_ci * mappings, which opens a side channel. 1688c2ecf20Sopenharmony_ci */ 1698c2ecf20Sopenharmony_ci return inode_owner_or_capable(file_inode(vma->vm_file)) || 1708c2ecf20Sopenharmony_ci inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; 1718c2ecf20Sopenharmony_ci} 1728c2ecf20Sopenharmony_ci 1738c2ecf20Sopenharmony_cistatic const struct mm_walk_ops mincore_walk_ops = { 1748c2ecf20Sopenharmony_ci .pmd_entry = mincore_pte_range, 1758c2ecf20Sopenharmony_ci .pte_hole = mincore_unmapped_range, 1768c2ecf20Sopenharmony_ci .hugetlb_entry = mincore_hugetlb, 1778c2ecf20Sopenharmony_ci}; 1788c2ecf20Sopenharmony_ci 1798c2ecf20Sopenharmony_ci/* 1808c2ecf20Sopenharmony_ci * Do a chunk of "sys_mincore()". We've already checked 1818c2ecf20Sopenharmony_ci * all the arguments, we hold the mmap semaphore: we should 1828c2ecf20Sopenharmony_ci * just return the amount of info we're asked for. 1838c2ecf20Sopenharmony_ci */ 1848c2ecf20Sopenharmony_cistatic long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec) 1858c2ecf20Sopenharmony_ci{ 1868c2ecf20Sopenharmony_ci struct vm_area_struct *vma; 1878c2ecf20Sopenharmony_ci unsigned long end; 1888c2ecf20Sopenharmony_ci int err; 1898c2ecf20Sopenharmony_ci 1908c2ecf20Sopenharmony_ci vma = find_vma(current->mm, addr); 1918c2ecf20Sopenharmony_ci if (!vma || addr < vma->vm_start) 1928c2ecf20Sopenharmony_ci return -ENOMEM; 1938c2ecf20Sopenharmony_ci end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); 1948c2ecf20Sopenharmony_ci if (!can_do_mincore(vma)) { 1958c2ecf20Sopenharmony_ci unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE); 1968c2ecf20Sopenharmony_ci memset(vec, 1, pages); 1978c2ecf20Sopenharmony_ci return pages; 1988c2ecf20Sopenharmony_ci } 1998c2ecf20Sopenharmony_ci err = walk_page_range(vma->vm_mm, addr, end, &mincore_walk_ops, vec); 2008c2ecf20Sopenharmony_ci if (err < 0) 2018c2ecf20Sopenharmony_ci return err; 2028c2ecf20Sopenharmony_ci return (end - addr) >> PAGE_SHIFT; 2038c2ecf20Sopenharmony_ci} 2048c2ecf20Sopenharmony_ci 2058c2ecf20Sopenharmony_ci/* 2068c2ecf20Sopenharmony_ci * The mincore(2) system call. 2078c2ecf20Sopenharmony_ci * 2088c2ecf20Sopenharmony_ci * mincore() returns the memory residency status of the pages in the 2098c2ecf20Sopenharmony_ci * current process's address space specified by [addr, addr + len). 2108c2ecf20Sopenharmony_ci * The status is returned in a vector of bytes. The least significant 2118c2ecf20Sopenharmony_ci * bit of each byte is 1 if the referenced page is in memory, otherwise 2128c2ecf20Sopenharmony_ci * it is zero. 2138c2ecf20Sopenharmony_ci * 2148c2ecf20Sopenharmony_ci * Because the status of a page can change after mincore() checks it 2158c2ecf20Sopenharmony_ci * but before it returns to the application, the returned vector may 2168c2ecf20Sopenharmony_ci * contain stale information. Only locked pages are guaranteed to 2178c2ecf20Sopenharmony_ci * remain in memory. 2188c2ecf20Sopenharmony_ci * 2198c2ecf20Sopenharmony_ci * return values: 2208c2ecf20Sopenharmony_ci * zero - success 2218c2ecf20Sopenharmony_ci * -EFAULT - vec points to an illegal address 2228c2ecf20Sopenharmony_ci * -EINVAL - addr is not a multiple of PAGE_SIZE 2238c2ecf20Sopenharmony_ci * -ENOMEM - Addresses in the range [addr, addr + len] are 2248c2ecf20Sopenharmony_ci * invalid for the address space of this process, or 2258c2ecf20Sopenharmony_ci * specify one or more pages which are not currently 2268c2ecf20Sopenharmony_ci * mapped 2278c2ecf20Sopenharmony_ci * -EAGAIN - A kernel resource was temporarily unavailable. 2288c2ecf20Sopenharmony_ci */ 2298c2ecf20Sopenharmony_ciSYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, 2308c2ecf20Sopenharmony_ci unsigned char __user *, vec) 2318c2ecf20Sopenharmony_ci{ 2328c2ecf20Sopenharmony_ci long retval; 2338c2ecf20Sopenharmony_ci unsigned long pages; 2348c2ecf20Sopenharmony_ci unsigned char *tmp; 2358c2ecf20Sopenharmony_ci 2368c2ecf20Sopenharmony_ci start = untagged_addr(start); 2378c2ecf20Sopenharmony_ci 2388c2ecf20Sopenharmony_ci /* Check the start address: needs to be page-aligned.. */ 2398c2ecf20Sopenharmony_ci if (start & ~PAGE_MASK) 2408c2ecf20Sopenharmony_ci return -EINVAL; 2418c2ecf20Sopenharmony_ci 2428c2ecf20Sopenharmony_ci /* ..and we need to be passed a valid user-space range */ 2438c2ecf20Sopenharmony_ci if (!access_ok((void __user *) start, len)) 2448c2ecf20Sopenharmony_ci return -ENOMEM; 2458c2ecf20Sopenharmony_ci 2468c2ecf20Sopenharmony_ci /* This also avoids any overflows on PAGE_ALIGN */ 2478c2ecf20Sopenharmony_ci pages = len >> PAGE_SHIFT; 2488c2ecf20Sopenharmony_ci pages += (offset_in_page(len)) != 0; 2498c2ecf20Sopenharmony_ci 2508c2ecf20Sopenharmony_ci if (!access_ok(vec, pages)) 2518c2ecf20Sopenharmony_ci return -EFAULT; 2528c2ecf20Sopenharmony_ci 2538c2ecf20Sopenharmony_ci tmp = (void *) __get_free_page(GFP_USER); 2548c2ecf20Sopenharmony_ci if (!tmp) 2558c2ecf20Sopenharmony_ci return -EAGAIN; 2568c2ecf20Sopenharmony_ci 2578c2ecf20Sopenharmony_ci retval = 0; 2588c2ecf20Sopenharmony_ci while (pages) { 2598c2ecf20Sopenharmony_ci /* 2608c2ecf20Sopenharmony_ci * Do at most PAGE_SIZE entries per iteration, due to 2618c2ecf20Sopenharmony_ci * the temporary buffer size. 2628c2ecf20Sopenharmony_ci */ 2638c2ecf20Sopenharmony_ci mmap_read_lock(current->mm); 2648c2ecf20Sopenharmony_ci retval = do_mincore(start, min(pages, PAGE_SIZE), tmp); 2658c2ecf20Sopenharmony_ci mmap_read_unlock(current->mm); 2668c2ecf20Sopenharmony_ci 2678c2ecf20Sopenharmony_ci if (retval <= 0) 2688c2ecf20Sopenharmony_ci break; 2698c2ecf20Sopenharmony_ci if (copy_to_user(vec, tmp, retval)) { 2708c2ecf20Sopenharmony_ci retval = -EFAULT; 2718c2ecf20Sopenharmony_ci break; 2728c2ecf20Sopenharmony_ci } 2738c2ecf20Sopenharmony_ci pages -= retval; 2748c2ecf20Sopenharmony_ci vec += retval; 2758c2ecf20Sopenharmony_ci start += retval << PAGE_SHIFT; 2768c2ecf20Sopenharmony_ci retval = 0; 2778c2ecf20Sopenharmony_ci } 2788c2ecf20Sopenharmony_ci free_page((unsigned long) tmp); 2798c2ecf20Sopenharmony_ci return retval; 2808c2ecf20Sopenharmony_ci} 281