// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *		Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/mm_purgeable.h>

#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"
#include <linux/xpm.h>

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

#ifndef arch_faults_on_old_pte
static inline bool arch_faults_on_old_pte(void)
{
	/*
	 * Architectures that lack a hardware access-flag feature need to
	 * implement their own helper. By default, "true" means a page
	 * fault will be taken on an old pte.
	 */
	return true;
}
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
early_initcall(init_zero_pfn);

void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
{
	trace_rss_stat(mm, member, count);
}

#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */
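/*
 * Illustrative note (editor's addition, not from the original source):
 * with SPLIT_RSS_COUNTING, rss deltas accumulate in current->rss_stat
 * and are only folded into the shared mm counters by sync_mm_rss(), at
 * the latest after TASK_RSS_EVENTS_THRESH (64) faults. Roughly 64
 * per-fault atomic updates thus collapse into one batch of atomic adds,
 * at the price of readers seeing slightly stale per-mm counters.
 */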
/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= P4D_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= P4D_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(p4d, start);
	p4d_clear(p4d);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long start;

	start = addr;
	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
	} while (p4d++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	p4d = p4d_offset(pgd, start);
	pgd_clear(pgd);
	p4d_free_tlb(tlb, p4d, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below?  No, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;
	/*
	 * We add page table cache pages with PAGE_SIZE,
	 * (see pte_free_tlb()), flush the tlb if we need to
	 */
	tlb_change_page_size(tlb, PAGE_SIZE);
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}
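/*
 * Illustrative walk-through (editor's addition, values hypothetical,
 * assuming 4K pages and a 2M PMD_SIZE as on x86-64): freeing
 * addr = 0x201000 .. end = 0x3ff000 with floor = 0x200000 and
 * ceiling = 0x400000 first rounds addr down to 0x200000 via PMD_MASK;
 * addr is not below floor, ceiling masks to itself and stays nonzero,
 * and "end - 1 > ceiling - 1" is false, so the whole span of pte
 * tables under [0x200000, 0x3ff000) is torn down in a single pass.
 */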
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		}
		vma = next;
	}
}

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl;
	pgtable_t new = pte_alloc_one(mm);
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) is
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_rmb() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	ptl = pmd_lock(mm, pmd);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm_inc_nr_ptes(mm);
		pmd_populate(mm, pmd, new);
		new = NULL;
	}
	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	return 0;
}
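/*
 * Editor's sketch of the ordering contract above: pte_alloc_one() fully
 * initialises the new page table, smp_wmb() publishes those stores, and
 * only then does pmd_populate() make the table reachable. A lockless
 * walker doing pmd -> pte chases data-dependent loads, so (alpha aside)
 * it can never observe the pmd entry before the zeroed table behind it.
 */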
int __pte_alloc_kernel(pmd_t *pmd)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	p4d_t *p4d = p4d_offset(pgd, addr);
	pud_t *pud = pud_offset(p4d, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
				 nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	pr_alert("BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
		 current->comm,
		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 mapping ? mapping->a_ops->readpage : NULL);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
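/*
 * Illustrative note (editor's addition): the ratelimit above allows a
 * burst of 60 reports, then stays quiet until "resume" (set to
 * jiffies + 60 * HZ, i.e. one minute after the burst started) passes,
 * at which point any suppressed reports are summarised. A steady drip
 * of one bad pte per second therefore never gets throttled.
 */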
/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
			    pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (is_zero_pfn(pfn))
			return NULL;
		if (pte_devmap(pte))
			return NULL;

		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;

check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
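/*
 * Worked example (editor's addition, values hypothetical): for a
 * remap_pfn_range() vma with vm_start = 0x7f0000000000 and
 * vm_pgoff = 0x80000, the pte at addr = vm_start + 0x3000 must map
 * pfn 0x80003. Any pte honouring that linear rule is "special" and
 * yields NULL above, while a COWed replacement page breaks the rule
 * and so is treated as a normal page.
 */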
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
				pmd_t pmd)
{
	unsigned long pfn = pmd_pfn(pmd);

	/*
	 * There is no pmd_special() but there may be special pmds, e.g.
	 * in a direct-access (dax) mapping, so let's just replicate the
	 * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
	 */
	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (pmd_devmap(pmd))
		return NULL;
	if (is_huge_zero_pmd(pmd))
		return NULL;
	if (unlikely(pfn > highest_memmap_pfn))
		return NULL;

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
#endif

/*
 * Copy one vm_area from one task to the other. Assumes that the page
 * tables already present in the new task are cleared over the whole
 * range covered by this vma.
 */

static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
	unsigned long vm_flags = dst_vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;
	swp_entry_t entry = pte_to_swp_entry(pte);

	if (likely(!non_swap_entry(entry))) {
		if (swap_duplicate(entry) < 0)
			return entry.val;

		/* make sure dst_mm is on swapoff's mmlist. */
		if (unlikely(list_empty(&dst_mm->mmlist))) {
			spin_lock(&mmlist_lock);
			if (list_empty(&dst_mm->mmlist))
				list_add(&dst_mm->mmlist,
						&src_mm->mmlist);
			spin_unlock(&mmlist_lock);
		}
		rss[MM_SWAPENTS]++;
	} else if (is_migration_entry(entry)) {
		page = migration_entry_to_page(entry);

		rss[mm_counter(page)]++;

		if (is_write_migration_entry(entry) &&
				is_cow_mapping(vm_flags)) {
			/*
			 * COW mappings require pages in both
			 * parent and child to be set to read.
			 */
			make_migration_entry_read(&entry);
			pte = swp_entry_to_pte(entry);
			if (pte_swp_soft_dirty(*src_pte))
				pte = pte_swp_mksoft_dirty(pte);
			if (pte_swp_uffd_wp(*src_pte))
				pte = pte_swp_mkuffd_wp(pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
	} else if (is_device_private_entry(entry)) {
		page = device_private_entry_to_page(entry);

		/*
		 * Update rss count even for unaddressable pages, as
		 * they should be treated just like normal pages in this
		 * respect.
		 *
		 * We will likely want to have some new rss counters
		 * for unaddressable pages, at some point. But for now
		 * keep things as they are.
		 */
		get_page(page);
		rss[mm_counter(page)]++;
		page_dup_rmap(page, false);

		/*
		 * We do not preserve soft-dirty information, because so
		 * far, checkpoint/restore is the only feature that
		 * requires that. And checkpoint/restore does not work
		 * when a device driver is involved (you cannot easily
		 * save and restore device driver state).
		 */
		if (is_write_device_private_entry(entry) &&
		    is_cow_mapping(vm_flags)) {
			make_device_private_entry_read(&entry);
			pte = swp_entry_to_pte(entry);
			if (pte_swp_uffd_wp(*src_pte))
				pte = pte_swp_mkuffd_wp(pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
	}
	if (!userfaultfd_wp(dst_vma))
		pte = pte_swp_clear_uffd_wp(pte);
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}
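/*
 * Editor's summary of the non-present cases above: real swap entries
 * get their swap-map count bumped via swap_duplicate(); write migration
 * and write device-private entries are downgraded to read in the
 * *source* pte for COW mappings, so neither side can scribble on the
 * page while the migration or device access is still in flight.
 */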
/*
 * Copy a present and normal page if necessary.
 *
 * NOTE! The usual case is that this doesn't need to do
 * anything, and can just return a positive value. That
 * will let the caller know that it can just increase
 * the page refcount and re-use the pte the traditional
 * way.
 *
 * But _if_ we need to copy it because it needs to be
 * pinned in the parent (and the child should get its own
 * copy rather than just a reference to the same page),
 * we'll do that here and return zero to let the caller
 * know we're done.
 *
 * And if we need a pre-allocated page but don't yet have
 * one, return a negative error to let the preallocation
 * code know so that it can do so outside the page table
 * lock.
 */
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
		  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
		  struct page **prealloc, pte_t pte, struct page *page)
{
	struct mm_struct *src_mm = src_vma->vm_mm;
	struct page *new_page;

	if (!is_cow_mapping(src_vma->vm_flags))
		return 1;

	/*
	 * What we want to do is to check whether this page may
	 * have been pinned by the parent process.  If so,
	 * instead of wrprotect the pte on both sides, we copy
	 * the page immediately so that we'll always guarantee
	 * the pinned page won't be randomly replaced in the
	 * future.
	 *
	 * The page pinning checks are just "has this mm ever
	 * seen pinning", along with the (inexact) check of
	 * the page count. That might give false positives for
	 * pinning, but it will work correctly.
	 */
	if (likely(!atomic_read(&src_mm->has_pinned)))
		return 1;
	if (likely(!page_maybe_dma_pinned(page)))
		return 1;

	/*
	 * The vma->anon_vma of the child process may be NULL
	 * because the entire vma does not contain anonymous pages.
	 * A BUG will occur when copy_present_page() passes
	 * a copy of a non-anonymous page of that vma to
	 * page_add_new_anon_rmap() to set up a new anonymous rmap.
	 * Return 1 if the page is not an anonymous page.
	 */
	if (!PageAnon(page))
		return 1;

	new_page = *prealloc;
	if (!new_page)
		return -EAGAIN;

	/*
	 * We have a prealloc page, all good!  Take it
	 * over and copy the page & arm it.
	 */
	*prealloc = NULL;
	copy_user_highpage(new_page, page, addr, src_vma);
	__SetPageUptodate(new_page);
	page_add_new_anon_rmap(new_page, dst_vma, addr, false);
	lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
	rss[mm_counter(new_page)]++;

	/* All done, just insert the new page copy in the child */
	pte = mk_pte(new_page, dst_vma->vm_page_prot);
	pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
	if (userfaultfd_pte_wp(dst_vma, *src_pte))
		/* Uffd-wp needs to be delivered to dest pte as well */
		pte = pte_wrprotect(pte_mkuffd_wp(pte));
	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
	return 0;
}
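/*
 * Editor's note on the pinned-page policy above: the check is two cheap
 * filters (mm->has_pinned, then page_maybe_dma_pinned()), so a fork()
 * of a process that never pinned anything pays nothing, while a false
 * positive only costs one extra page copy, never a correctness failure.
 */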
/*
 * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
 * is required to copy this pte.
 */
static inline int
copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
		 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
		 struct page **prealloc)
{
	struct mm_struct *src_mm = src_vma->vm_mm;
	unsigned long vm_flags = src_vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	page = vm_normal_page(src_vma, addr, pte);
	if (page) {
		int retval;

		retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
					   addr, rss, prealloc, pte, page);
		if (retval <= 0)
			return retval;

		get_page(page);
		page_dup_rmap(page, false);
		rss[mm_counter(page)]++;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags) && pte_write(pte)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	if (!userfaultfd_wp(dst_vma))
		pte = pte_clear_uffd_wp(pte);

	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
	return 0;
}
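/*
 * Editor's note on the pte bits above: for a private COW mapping, both
 * parent and child ptes end up write-protected, so the first write on
 * either side takes a fault and breaks COW there; for a shared mapping
 * only the child's pte starts out clean and old, which hardware or the
 * fault path simply sets again on next use.
 */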
static inline struct page *
page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
		   unsigned long addr)
{
	struct page *new_page;

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
	if (!new_page)
		return NULL;

	if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
		put_page(new_page);
		return NULL;
	}
	cgroup_throttle_swaprate(new_page, GFP_KERNEL);

	return new_page;
}
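/*
 * Editor's note: page_copy_prealloc() runs with no page-table locks
 * held, so the allocation may sleep; the page is charged to src_mm's
 * memcg up front, and copy_pte_range() below retries the failed pte
 * with it once the locks are re-taken.
 */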
static int
copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress, ret = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};
	struct page *prealloc = NULL;

again:
	progress = 0;
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte) {
		ret = -ENOMEM;
		goto out;
	}
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		if (unlikely(!pte_present(*src_pte))) {
			entry.val = copy_nonpresent_pte(dst_mm, src_mm,
							dst_pte, src_pte,
							dst_vma, src_vma,
							addr, rss);
			if (entry.val)
				break;
			progress += 8;
			continue;
		}
		/* copy_present_pte() will clear `*prealloc' if consumed */
		ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
				       addr, rss, &prealloc);
		/*
		 * If we need a pre-allocated page for this pte, drop the
		 * locks, allocate, and try again.
		 */
		if (unlikely(ret == -EAGAIN))
			break;
		if (unlikely(prealloc)) {
			/*
			 * The preallocated page cannot be reused for the
			 * next pte, because mempolicy must be followed
			 * strictly (e.g., alloc_page_vma() allocates
			 * according to address). This can only happen if
			 * a pinned pte changed under us.
			 */
			put_page(prealloc);
			prealloc = NULL;
		}
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
			ret = -ENOMEM;
			goto out;
		}
		entry.val = 0;
	} else if (ret) {
		WARN_ON_ONCE(ret != -EAGAIN);
		prealloc = page_copy_prealloc(src_mm, src_vma, addr);
		if (!prealloc)
			return -ENOMEM;
		/* We've captured and resolved the error. Reset, try again. */
		ret = 0;
	}
	if (addr != end)
		goto again;
out:
	if (unlikely(prealloc))
		put_page(prealloc);
	return ret;
}
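/*
 * Editor's sketch of the retry protocol above: the pte loop only ever
 * bails out with both page-table locks dropped, then either extends a
 * swap count continuation (entry.val != 0) or preallocates a page for
 * -EAGAIN, and jumps back to "again" with addr still pointing at the
 * pte that could not be copied.
 */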
static inline int
copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
			|| pmd_devmap(*src_pmd)) {
			int err;
			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
			err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
					    addr, dst_vma, src_vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
				   addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int
copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
			int err;

			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
			err = copy_huge_pud(dst_mm, src_mm,
					    dst_pud, src_pud, addr, src_vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
				   addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

static inline int
copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	p4d_t *src_p4d, *dst_p4d;
	unsigned long next;

	dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
	if (!dst_p4d)
		return -ENOMEM;
	src_p4d = p4d_offset(src_pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(src_p4d))
			continue;
		if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
				   addr, next))
			return -ENOMEM;
	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
	return 0;
}

int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = src_vma->vm_start;
	unsigned long end = src_vma->vm_end;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	struct mmu_notifier_range range;
	bool is_cow;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
	    !src_vma->anon_vma)
		return 0;

	if (is_vm_hugetlb_page(src_vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);

	if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from a higher-level routine.
		 */
		ret = track_pfn_copy(src_vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(src_vma->vm_flags);

	if (is_cow) {
		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
					0, src_vma, src_mm, addr, end);
		mmu_notifier_invalidate_range_start(&range);
		/*
		 * Disabling preemption is not needed for the write side, as
		 * the read side doesn't spin, but goes to the mmap_lock.
		 *
		 * Use the raw variant of the seqcount_t write API to avoid
		 * lockdep complaining about preemptibility.
		 */
		mmap_assert_write_locked(src_mm);
		raw_write_seqcount_begin(&src_mm->write_protect_seq);
	}

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
					    addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow) {
		raw_write_seqcount_end(&src_mm->write_protect_seq);
		mmu_notifier_invalidate_range_end(&range);
	}
	return ret;
}
11948c2ecf20Sopenharmony_ci */ 11958c2ecf20Sopenharmony_ci mmap_assert_write_locked(src_mm); 11968c2ecf20Sopenharmony_ci raw_write_seqcount_begin(&src_mm->write_protect_seq); 11978c2ecf20Sopenharmony_ci } 11988c2ecf20Sopenharmony_ci 11998c2ecf20Sopenharmony_ci ret = 0; 12008c2ecf20Sopenharmony_ci dst_pgd = pgd_offset(dst_mm, addr); 12018c2ecf20Sopenharmony_ci src_pgd = pgd_offset(src_mm, addr); 12028c2ecf20Sopenharmony_ci do { 12038c2ecf20Sopenharmony_ci next = pgd_addr_end(addr, end); 12048c2ecf20Sopenharmony_ci if (pgd_none_or_clear_bad(src_pgd)) 12058c2ecf20Sopenharmony_ci continue; 12068c2ecf20Sopenharmony_ci if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, 12078c2ecf20Sopenharmony_ci addr, next))) { 12088c2ecf20Sopenharmony_ci ret = -ENOMEM; 12098c2ecf20Sopenharmony_ci break; 12108c2ecf20Sopenharmony_ci } 12118c2ecf20Sopenharmony_ci } while (dst_pgd++, src_pgd++, addr = next, addr != end); 12128c2ecf20Sopenharmony_ci 12138c2ecf20Sopenharmony_ci if (is_cow) { 12148c2ecf20Sopenharmony_ci raw_write_seqcount_end(&src_mm->write_protect_seq); 12158c2ecf20Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 12168c2ecf20Sopenharmony_ci } 12178c2ecf20Sopenharmony_ci return ret; 12188c2ecf20Sopenharmony_ci} 12198c2ecf20Sopenharmony_ci 12208c2ecf20Sopenharmony_ci/* Whether we should zap all COWed (private) pages too */ 12218c2ecf20Sopenharmony_cistatic inline bool should_zap_cows(struct zap_details *details) 12228c2ecf20Sopenharmony_ci{ 12238c2ecf20Sopenharmony_ci /* By default, zap all pages */ 12248c2ecf20Sopenharmony_ci if (!details) 12258c2ecf20Sopenharmony_ci return true; 12268c2ecf20Sopenharmony_ci 12278c2ecf20Sopenharmony_ci /* Or, we zap COWed pages only if the caller wants to */ 12288c2ecf20Sopenharmony_ci return !details->check_mapping; 12298c2ecf20Sopenharmony_ci} 12308c2ecf20Sopenharmony_ci 12318c2ecf20Sopenharmony_cistatic unsigned long zap_pte_range(struct mmu_gather *tlb, 12328c2ecf20Sopenharmony_ci struct vm_area_struct *vma, pmd_t *pmd, 12338c2ecf20Sopenharmony_ci unsigned long addr, unsigned long end, 12348c2ecf20Sopenharmony_ci struct zap_details *details) 12358c2ecf20Sopenharmony_ci{ 12368c2ecf20Sopenharmony_ci struct mm_struct *mm = tlb->mm; 12378c2ecf20Sopenharmony_ci int force_flush = 0; 12388c2ecf20Sopenharmony_ci int rss[NR_MM_COUNTERS]; 12398c2ecf20Sopenharmony_ci spinlock_t *ptl; 12408c2ecf20Sopenharmony_ci pte_t *start_pte; 12418c2ecf20Sopenharmony_ci pte_t *pte; 12428c2ecf20Sopenharmony_ci swp_entry_t entry; 12438c2ecf20Sopenharmony_ci 12448c2ecf20Sopenharmony_ci tlb_change_page_size(tlb, PAGE_SIZE); 12458c2ecf20Sopenharmony_ciagain: 12468c2ecf20Sopenharmony_ci init_rss_vec(rss); 12478c2ecf20Sopenharmony_ci start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 12488c2ecf20Sopenharmony_ci pte = start_pte; 12498c2ecf20Sopenharmony_ci flush_tlb_batched_pending(mm); 12508c2ecf20Sopenharmony_ci arch_enter_lazy_mmu_mode(); 12518c2ecf20Sopenharmony_ci do { 12528c2ecf20Sopenharmony_ci pte_t ptent = *pte; 12538c2ecf20Sopenharmony_ci if (pte_none(ptent)) 12548c2ecf20Sopenharmony_ci continue; 12558c2ecf20Sopenharmony_ci 12568c2ecf20Sopenharmony_ci if (need_resched()) 12578c2ecf20Sopenharmony_ci break; 12588c2ecf20Sopenharmony_ci 12598c2ecf20Sopenharmony_ci if (pte_present(ptent)) { 12608c2ecf20Sopenharmony_ci struct page *page; 12618c2ecf20Sopenharmony_ci 12628c2ecf20Sopenharmony_ci page = vm_normal_page(vma, addr, ptent); 12638c2ecf20Sopenharmony_ci if (vma->vm_flags & VM_USEREXPTE) 12648c2ecf20Sopenharmony_ci page = NULL; 12658c2ecf20Sopenharmony_ci if 
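/*
 * Illustrative sketch, not part of this file's logic: copy_page_range()
 * is driven from the fork() path, in the style of dup_mmap(). The loop
 * below is heavily simplified and the surrounding locking and error
 * handling are elided:
 *
 *	for each (mpnt, tmp) pair of parent/child vmas {
 *		if (!(tmp->vm_flags & VM_WIPEONFORK))
 *			retval = copy_page_range(tmp, mpnt);
 *		if (retval)
 *			goto fail;	// fork is aborted
 *	}
 */
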
/* Whether we should zap all COWed (private) pages too */
static inline bool should_zap_cows(struct zap_details *details)
{
	/* By default, zap all pages */
	if (!details)
		return true;

	/* Or, we zap COWed pages only if the caller wants to */
	return !details->check_mapping;
}

static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	swp_entry_t entry;

	tlb_change_page_size(tlb, PAGE_SIZE);
again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent))
			continue;

		if (need_resched())
			break;

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (vma->vm_flags & VM_USEREXPTE)
				page = NULL;
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page_rmapping(page))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;
			if (vma->vm_flags & VM_PURGEABLE)
				uxpte_clear_present(vma, addr);
			if (!PageAnon(page)) {
				if (pte_dirty(ptent)) {
					force_flush = 1;
					set_page_dirty(page);
				}
				if (pte_young(ptent) &&
				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
			}
			rss[mm_counter(page)]--;
			page_remove_rmap(page, false);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			if (unlikely(__tlb_remove_page(tlb, page))) {
				force_flush = 1;
				addr += PAGE_SIZE;
				break;
			}
			continue;
		}

		entry = pte_to_swp_entry(ptent);
		if (is_device_private_entry(entry)) {
			struct page *page = device_private_entry_to_page(entry);

			if (unlikely(details && details->check_mapping)) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping !=
				    page_rmapping(page))
					continue;
			}

			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			rss[mm_counter(page)]--;
			page_remove_rmap(page, false);
			put_page(page);
			continue;
		}

		if (!non_swap_entry(entry)) {
			/* Genuine swap entry, hence a private anon page */
			if (!should_zap_cows(details))
				continue;
			rss[MM_SWAPENTS]--;
		} else if (is_migration_entry(entry)) {
			struct page *page;

			page = migration_entry_to_page(entry);
			if (details && details->check_mapping &&
			    details->check_mapping != page_rmapping(page))
				continue;
			rss[mm_counter(page)]--;
		}
		if (unlikely(!free_swap_and_cache(entry)))
			print_bad_pte(vma, addr, ptent, NULL);
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();

	/* Do the actual TLB flush before dropping ptl */
	if (force_flush)
		tlb_flush_mmu_tlbonly(tlb);
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Restart if we didn't do everything.
	 */
	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu(tlb);
	}

	if (addr != end) {
		cond_resched();
		goto again;
	}

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE)
				__split_huge_pmd(vma, pmd, addr, false, NULL);
			else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
		} else if (details && details->single_page &&
			   PageTransCompound(details->single_page) &&
			   next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
			spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
			/*
			 * Take and drop THP pmd lock so that we cannot return
			 * prematurely, while zap_huge_pmd() has cleared *pmd,
			 * but not yet decremented compound_mapcount().
			 */
			spin_unlock(ptl);
		}

		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is
		 * none or trans huge it can change under us. This is
		 * because MADV_DONTNEED holds the mmap_lock in read
		 * mode.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
			if (next - addr != HPAGE_PUD_SIZE) {
				mmap_assert_locked(tlb->mm);
				split_huge_pud(vma, pud, addr);
			} else if (zap_huge_pud(tlb, vma, pud, addr))
				goto next;
			/* fall through */
		}
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
		cond_resched();
	} while (pud++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		next = zap_pud_range(tlb, vma, p4d, addr, next, details);
	} while (p4d++, addr = next, addr != end);

	return addr;
}

void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
	tlb_end_vma(tlb, vma);
}


static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long start = max(vma->vm_start, start_addr);
	unsigned long end;

	if (start >= vma->vm_end)
		return;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return;

	if (vma->vm_file)
		uprobe_munmap(vma, start, end);

	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn(vma, 0, 0);

	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
			/*
			 * It is undesirable to test vma->vm_file as it
			 * should be non-null for valid hugetlb area.
			 * However, vm_file will be NULL in the error
			 * cleanup path of mmap_region. When
			 * hugetlbfs ->mmap method fails,
			 * mmap_region() nullifies vma->vm_file
			 * before calling this function to clean up.
			 * Since no pte has actually been setup, it is
			 * safe to do nothing in this case.
			 */
			if (vma->vm_file) {
				i_mmap_lock_write(vma->vm_file->f_mapping);
				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
				i_mmap_unlock_write(vma->vm_file->f_mapping);
			}
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns. So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr)
{
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
				start_addr, end_addr);
	mmu_notifier_invalidate_range_start(&range);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
	mmu_notifier_invalidate_range_end(&range);
}

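/*
 * Illustrative sketch, not part of this file's logic: the typical caller
 * pairs unmap_vmas() with an mmu_gather, in the style of unmap_region()
 * or exit_mmap(). The floor/ceiling arguments to free_pgtables() are
 * simplified away here:
 *
 *	struct mmu_gather tlb;
 *
 *	lru_add_drain();
 *	tlb_gather_mmu(&tlb, mm, start, end);
 *	update_hiwater_rss(mm);
 *	unmap_vmas(&tlb, vma, start, end);
 *	free_pgtables(&tlb, vma, floor, ceiling);
 *	tlb_finish_mmu(&tlb, start, end);
 */
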
/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Caller must protect the VMA list
 */
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
		unsigned long size)
{
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	lru_add_drain();
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				start, start + size);
	tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
	update_hiwater_rss(vma->vm_mm);
	mmu_notifier_invalidate_range_start(&range);
	for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
		unmap_single_vma(&tlb, vma, start, range.end, NULL);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, start, range.end);
}

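/*
 * Illustrative sketch, not part of this file's logic: MADV_DONTNEED-style
 * zapping of one vma's pages, assuming mmap_lock is already held and the
 * [start, end) range has been clamped to the vma:
 *
 *	zap_page_range(vma, start, end - start);
 */
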
/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	lru_add_drain();
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				address, address + size);
	tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
	update_hiwater_rss(vma->vm_mm);
	mmu_notifier_invalidate_range_start(&range);
	unmap_single_vma(&tlb, vma, address, range.end, details);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, address, range.end);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 */
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (address < vma->vm_start || address + size > vma->vm_end ||
			!(vma->vm_flags & VM_PFNMAP))
		return;

	zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

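/*
 * Illustrative sketch, not part of this file's logic: a driver revoking
 * user mappings it previously established over a VM_PFNMAP vma, e.g. on
 * device hot-unplug. The vma it passes in is one it tracked itself:
 *
 *	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
 *
 * Note the guards above: a range outside the vma, or a vma without
 * VM_PFNMAP, makes the call a silent no-op.
 */
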
static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return NULL;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));
	return pmd;
}

pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
			spinlock_t **ptl)
{
	pmd_t *pmd = walk_to_pmd(mm, addr);

	if (!pmd)
		return NULL;
	return pte_alloc_map_lock(mm, pmd, addr, ptl);
}

static int validate_page_before_insert(struct page *page)
{
	if (PageAnon(page) || PageSlab(page) || page_has_type(page))
		return -EINVAL;
	flush_dcache_page(page);
	return 0;
}

static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
			unsigned long addr, struct page *page, pgprot_t prot)
{
	if (!pte_none(*pte))
		return -EBUSY;
	/* Ok, finally just insert the thing.. */
	get_page(page);
	inc_mm_counter_fast(mm, mm_counter_file(page));
	page_add_file_rmap(page, false);
	set_pte_at(mm, addr, pte, mk_pte(page, prot));
	return 0;
}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte;
	spinlock_t *ptl;

	retval = validate_page_before_insert(page);
	if (retval)
		goto out;
	retval = -ENOMEM;
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

#ifdef pte_index
static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
			unsigned long addr, struct page *page, pgprot_t prot)
{
	int err;

	if (!page_count(page))
		return -EINVAL;
	err = validate_page_before_insert(page);
	if (err)
		return err;
	return insert_page_into_pte_locked(mm, pte, addr, page, prot);
}

/* insert_pages() amortizes the cost of spinlock operations
 * when inserting pages in a loop. Arch *must* define pte_index.
 */
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
			struct page **pages, unsigned long *num, pgprot_t prot)
{
	pmd_t *pmd = NULL;
	pte_t *start_pte, *pte;
	spinlock_t *pte_lock;
	struct mm_struct *const mm = vma->vm_mm;
	unsigned long curr_page_idx = 0;
	unsigned long remaining_pages_total = *num;
	unsigned long pages_to_write_in_pmd;
	int ret;
more:
	ret = -EFAULT;
	pmd = walk_to_pmd(mm, addr);
	if (!pmd)
		goto out;

	pages_to_write_in_pmd = min_t(unsigned long,
		remaining_pages_total, PTRS_PER_PTE - pte_index(addr));

	/* Allocate the PTE if necessary; takes PMD lock once only. */
	ret = -ENOMEM;
	if (pte_alloc(mm, pmd))
		goto out;

	while (pages_to_write_in_pmd) {
		int pte_idx = 0;
		const int batch_size = min_t(int, pages_to_write_in_pmd, 8);

		start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
		for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
			int err = insert_page_in_batch_locked(mm, pte,
				addr, pages[curr_page_idx], prot);
			if (unlikely(err)) {
				pte_unmap_unlock(start_pte, pte_lock);
				ret = err;
				remaining_pages_total -= pte_idx;
				goto out;
			}
			addr += PAGE_SIZE;
			++curr_page_idx;
		}
		pte_unmap_unlock(start_pte, pte_lock);
		pages_to_write_in_pmd -= batch_size;
		remaining_pages_total -= batch_size;
	}
	if (remaining_pages_total)
		goto more;
	ret = 0;
out:
	*num = remaining_pages_total;
	return ret;
}
#endif /* ifdef pte_index */

/**
 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
 * @vma: user vma to map to
 * @addr: target start user address of these pages
 * @pages: source kernel pages
 * @num: in: number of pages to map. out: number of pages that were *not*
 * mapped. (0 means all pages were successfully mapped).
 *
 * Preferred over vm_insert_page() when inserting multiple pages.
 *
 * In case of error, we may have mapped a subset of the provided
 * pages. It is the caller's responsibility to account for this case.
 *
 * The same restrictions apply as in vm_insert_page().
 */
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
			struct page **pages, unsigned long *num)
{
#ifdef pte_index
	const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;

	if (addr < vma->vm_start || end_addr >= vma->vm_end)
		return -EFAULT;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(mmap_read_trylock(vma->vm_mm));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	/* Defer page refcount checking till we're about to map that page. */
	return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
#else
	unsigned long idx = 0, pgcount = *num;
	int err = -EINVAL;

	for (; idx < pgcount; ++idx) {
		err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
		if (err)
			break;
	}
	*num = pgcount - idx;
	return err;
#endif /* ifdef pte_index */
}
EXPORT_SYMBOL(vm_insert_pages);

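/*
 * Illustrative sketch, not part of this file's logic: batch-mapping a
 * driver-owned page array from an mmap handler, under mmap_lock as
 * usual. 'my_buf' and its fields are hypothetical:
 *
 *	unsigned long num = my_buf->nr_pages;
 *	int err;
 *
 *	err = vm_insert_pages(vma, vma->vm_start, my_buf->pages, &num);
 *	if (err)
 *		return err;	// on return, 'num' = pages left unmapped
 *	return 0;
 */
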
/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(mmap_read_trylock(vma->vm_mm));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);

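/*
 * Illustrative sketch, not part of this file's logic: the classic
 * pattern in a driver's f_op->mmap() handler, mapping one kernel page
 * per user page. 'my_pages' is a hypothetical driver-owned array:
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long i, count = vma_pages(vma);
 *		int err;
 *
 *		for (i = 0; i < count; i++) {
 *			err = vm_insert_page(vma,
 *					vma->vm_start + i * PAGE_SIZE,
 *					my_pages[i]);
 *			if (err)
 *				return err;
 *		}
 *		return 0;
 *	}
 */
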
/*
 * __vm_map_pages - maps a range of kernel pages into user vma
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 * @offset: user's requested vm_pgoff
 *
 * This allows drivers to map a range of kernel pages into a user vma.
 *
 * Return: 0 on success and error code otherwise.
 */
static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num, unsigned long offset)
{
	unsigned long count = vma_pages(vma);
	unsigned long uaddr = vma->vm_start;
	int ret, i;

	/* Fail if the user requested offset is beyond the end of the object */
	if (offset >= num)
		return -ENXIO;

	/* Fail if the user requested size exceeds available object size */
	if (count > num - offset)
		return -ENXIO;

	for (i = 0; i < count; i++) {
		ret = vm_insert_page(vma, uaddr, pages[offset + i]);
		if (ret < 0)
			return ret;
		uaddr += PAGE_SIZE;
	}

	return 0;
}

/**
 * vm_map_pages - map a range of kernel pages, starting at a non-zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object consisting of @num pages, catering for the user's
 * requested vm_pgoff
 *
 * If we fail to insert any page into the vma, the function will return
 * immediately leaving any previously inserted pages present. Callers
 * from the mmap handler may immediately return the error as their caller
 * will destroy the vma, removing any successfully inserted pages. Other
 * callers should make their own arrangements for calling unmap_region().
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
}
EXPORT_SYMBOL(vm_map_pages);

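/*
 * Illustrative sketch, not part of this file's logic: an mmap handler
 * that lets userspace map a window of a pre-allocated buffer via the
 * mmap offset (vm_pgoff). 'my_buf' is hypothetical:
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return vm_map_pages(vma, my_buf->pages, my_buf->nr_pages);
 *	}
 */
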
/**
 * vm_map_pages_zero - map a range of kernel pages starting at offset zero
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that it explicitly sets the offset
 * to 0. This function is intended for drivers that do not consider
 * vm_pgoff.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	return __vm_map_pages(vma, pages, num, 0);
}
EXPORT_SYMBOL(vm_map_pages_zero);

static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn, pgprot_t prot, bool mkwrite)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte, entry;
	spinlock_t *ptl;

	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		return VM_FAULT_OOM;
	if (!pte_none(*pte)) {
		if (mkwrite) {
			/*
			 * For read faults on private mappings the PFN passed
			 * in may not match the PFN we have mapped if the
			 * mapped PFN is a writeable COW page. In the mkwrite
			 * case we are creating a writable PTE for a shared
			 * mapping and we expect the PFNs to match. If they
			 * don't match, we are likely racing with block
			 * allocation and mapping invalidation so just skip the
			 * update.
			 */
			if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
				goto out_unlock;
			}
			entry = pte_mkyoung(*pte);
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
			if (ptep_set_access_flags(vma, addr, pte, entry, 1))
				update_mmu_cache(vma, addr, pte);
		}
		goto out_unlock;
	}

	/* Ok, finally just insert the thing.. */
	if (pfn_t_devmap(pfn))
		entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
	else
		entry = pte_mkspecial(pfn_t_pte(pfn, prot));

	if (mkwrite) {
		entry = pte_mkyoung(entry);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	}

	set_pte_at(mm, addr, pte, entry);
	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */

out_unlock:
	pte_unmap_unlock(pte, ptl);
	return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_pfn(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * This only makes sense for IO mappings, and it makes no sense for
 * COW mappings. In general, using multiple vmas is preferable;
 * vmf_insert_pfn_prot should only be used if using multiple VMAs is
 * impractical.
 *
 * See vmf_insert_mixed_prot() for a discussion of the implication of using
 * a value of @pgprot different from that of @vma->vm_page_prot.
 *
 * Context: Process context. May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t pgprot)
{
	/*
	 * Technically, architectures with pte_special can avoid all these
	 * restrictions (same for remap_pfn_range). However we would like
	 * consistency in testing and feature parity among all, so we should
	 * try to keep these invariants in place for everybody.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (!pfn_modify_allowed(pfn, pgprot))
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));

	return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
			false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);

/**
 * vmf_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return the result of this function.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * Context: Process context. May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vmf_insert_pfn);

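/*
 * Illustrative sketch, not part of this file's logic: a vm_ops->fault
 * handler for a VM_PFNMAP mapping backed by device memory. 'struct
 * my_dev' and the pfn arithmetic are hypothetical:
 *
 *	static vm_fault_t my_fault(struct vm_fault *vmf)
 *	{
 *		struct my_dev *dev = vmf->vma->vm_private_data;
 *		unsigned long pfn = dev->base_pfn + vmf->pgoff;
 *
 *		return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
 *	}
 */
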
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
	/* these checks mirror the abort conditions in vm_normal_page */
	if (vma->vm_flags & VM_MIXEDMAP)
		return true;
	if (pfn_t_devmap(pfn))
		return true;
	if (pfn_t_special(pfn))
		return true;
	if (is_zero_pfn(pfn_t_to_pfn(pfn)))
		return true;
	return false;
}

static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
		unsigned long addr, pfn_t pfn, pgprot_t pgprot,
		bool mkwrite)
{
	int err;

	BUG_ON(!vm_mixed_ok(vma, pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, pfn);

	if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
		return VM_FAULT_SIGBUS;

	/*
	 * If we don't have pte special, then we have to use the pfn_valid()
	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
	 * refcount the page if pfn_valid is true (hence insert_page rather
	 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
	 * without pte special, it would then be refcounted as a normal page.
	 */
	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
	    !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
		struct page *page;

		/*
		 * At this point we are committed to insert_page()
		 * regardless of whether the caller specified flags that
		 * result in pfn_t_has_page() == false.
		 */
		page = pfn_to_page(pfn_t_to_pfn(pfn));
		err = insert_page(vma, addr, page, pgprot);
	} else {
		return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
	}

	if (err == -ENOMEM)
		return VM_FAULT_OOM;
	if (err < 0 && err != -EBUSY)
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_mixed(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * Typically this function should be used by drivers to set caching- and
 * encryption bits different than those of @vma->vm_page_prot, because
 * the caching- or encryption mode may not be known at mmap() time.
 * This is ok as long as @vma->vm_page_prot is not used by the core vm
 * to set caching and encryption bits for those vmas (except for COW pages).
 * This is ensured by core vm only modifying these page table entries using
 * functions that don't touch caching- or encryption bits, using pte_modify()
 * if needed. (See for example mprotect()).
 * Also when new page-table entries are created, this is only done using the
 * fault() callback, and never using the value of vma->vm_page_prot,
 * except for page-table entries that point to anonymous pages as the result
 * of COW.
 *
 * Context: Process context. May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
				 pfn_t pfn, pgprot_t pgprot)
{
	return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed_prot);

vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
		pfn_t pfn)
{
	return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed);

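/*
 * Illustrative sketch, not part of this file's logic: a DAX-style fault
 * handler inserting a pfn_t into a VM_MIXEDMAP vma. 'my_phys()' and the
 * flag choice are hypothetical:
 *
 *	static vm_fault_t my_fault(struct vm_fault *vmf)
 *	{
 *		pfn_t pfn = phys_to_pfn_t(my_phys(vmf->pgoff),
 *					  PFN_DEV | PFN_MAP);
 *
 *		return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
 *	}
 */
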
 */
vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
				 pfn_t pfn, pgprot_t pgprot)
{
	return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed_prot);

vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
		pfn_t pfn)
{
	return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed);

/*
 * If the insertion of the PTE failed because someone else already added a
 * different entry in the meantime, we treat that as success as we assume
 * the same entry was actually inserted.
 */
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
		unsigned long addr, pfn_t pfn)
{
	return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
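/*
 * A minimal sketch (not part of this file) of a ->fault handler using
 * vmf_insert_mixed_prot() to get a write-combined mapping for one page,
 * as the kernel-doc above describes.  "example_dev" and its "phys_base"
 * field are hypothetical.
 *
 *	static vm_fault_t example_wc_fault(struct vm_fault *vmf)
 *	{
 *		struct example_dev *dev = vmf->vma->vm_private_data;
 *		pfn_t pfn = phys_to_pfn_t(dev->phys_base +
 *				((u64)vmf->pgoff << PAGE_SHIFT), PFN_DEV);
 *
 *		return vmf_insert_mixed_prot(vmf->vma, vmf->address, pfn,
 *				pgprot_writecombine(vmf->vma->vm_page_prot));
 *	}
 */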
/*
 * Maps a range of physical memory into the requested pages.  The old
 * mappings are removed.  Any references to nonexistent pages result
 * in null mappings (currently treated as "copy-on-access").
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pte_t *pte, *mapped_pte;
	spinlock_t *ptl;
	int err = 0;

	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	arch_enter_lazy_mmu_mode();
	do {
		BUG_ON(!pte_none(*pte));
		if (!pfn_modify_allowed(pfn, prot)) {
			err = -EACCES;
			break;
		}
		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(mapped_pte, ptl);
	return err;
}

static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pmd_t *pmd;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	do {
		next = pmd_addr_end(addr, end);
		err = remap_pte_range(mm, pmd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pud_t *pud;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		err = remap_pmd_range(mm, pud, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	p4d_t *p4d;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		err = remap_pud_range(mm, p4d, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page aligned user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of mapping area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		    unsigned long pfn, unsigned long size, pgprot_t prot)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + PAGE_ALIGN(size);
	struct mm_struct *mm = vma->vm_mm;
	unsigned long remap_pfn = pfn;
	int err;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
		return -EINVAL;

	/*
	 * Physically remapped pages are special. Tell the
	 * rest of the world about it:
	 *   VM_IO tells people not to look at these pages
	 *	(accesses can have side effects).
	 *   VM_PFNMAP tells the core MM that the base pages are just
	 *	raw PFN mappings, and do not have a "struct page" associated
	 *	with them.
	 *   VM_DONTEXPAND
	 *	Disable vma merging and expanding with mremap().
	 *   VM_DONTDUMP
	 *	Omit vma from core dump, even when VM_IO is turned off.
	 *
	 * There's a horrible special case to handle copy-on-write
	 * behaviour that some programs depend on. We mark the "original"
	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
	 * See vm_normal_page() for details.
	 */
	if (is_cow_mapping(vma->vm_flags)) {
		if (addr != vma->vm_start || end != vma->vm_end)
			return -EINVAL;
		vma->vm_pgoff = pfn;
	}

	err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
	if (err)
		return -EINVAL;

	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;

	BUG_ON(addr >= end);
	pfn -= addr >> PAGE_SHIFT;
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		err = remap_p4d_range(mm, pgd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (err)
		untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));

	return err;
}
EXPORT_SYMBOL(remap_pfn_range);
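/*
 * A minimal sketch (not part of this file) of the classic use of
 * remap_pfn_range() from a character driver's ->mmap method.  The
 * "example_dev" structure and its "phys_base" field are hypothetical.
 *
 *	static int example_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct example_dev *dev = file->private_data;
 *		unsigned long size = vma->vm_end - vma->vm_start;
 *
 *		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 *		return remap_pfn_range(vma, vma->vm_start,
 *				(dev->phys_base >> PAGE_SHIFT) + vma->vm_pgoff,
 *				size, vma->vm_page_prot);
 *	}
 */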
/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
	unsigned long vm_len, pfn, pages;

	/* Check that the physical memory area passed in looks valid */
	if (start + len < start)
		return -EINVAL;
	/*
	 * You *really* shouldn't map things that aren't page-aligned,
	 * but we've historically allowed it because IO memory might
	 * just have smaller alignment.
	 */
	len += start & ~PAGE_MASK;
	pfn = start >> PAGE_SHIFT;
	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
	if (pfn + pages < pfn)
		return -EINVAL;

	/* We start the mapping 'vm_pgoff' pages into the area */
	if (vma->vm_pgoff > pages)
		return -EINVAL;
	pfn += vma->vm_pgoff;
	pages -= vma->vm_pgoff;

	/* Can we fit all of the mapping? */
	vm_len = vma->vm_end - vma->vm_start;
	if (vm_len >> PAGE_SHIFT > pages)
		return -EINVAL;

	/* Ok, let it rip */
	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);
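/*
 * With vm_iomap_memory() the ->mmap sketch above shrinks further, since
 * the offset and length checking move into the helper (again with a
 * hypothetical device structure):
 *
 *	static int example_iomap_mmap(struct file *file,
 *				      struct vm_area_struct *vma)
 *	{
 *		struct example_dev *dev = file->private_data;
 *
 *		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 *		return vm_iomap_memory(vma, dev->phys_base, dev->mem_len);
 *	}
 */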
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	pte_t *pte;
	int err = 0;
	spinlock_t *ptl;

	if (create) {
		pte = (mm == &init_mm) ?
			pte_alloc_kernel_track(pmd, addr, mask) :
			pte_alloc_map_lock(mm, pmd, addr, &ptl);
		if (!pte)
			return -ENOMEM;
	} else {
		pte = (mm == &init_mm) ?
			pte_offset_kernel(pmd, addr) :
			pte_offset_map_lock(mm, pmd, addr, &ptl);
	}

	BUG_ON(pmd_huge(*pmd));

	arch_enter_lazy_mmu_mode();

	if (fn) {
		do {
			if (create || !pte_none(*pte)) {
				err = fn(pte++, addr, data);
				if (err)
					break;
			}
		} while (addr += PAGE_SIZE, addr != end);
	}
	*mask |= PGTBL_PTE_MODIFIED;

	arch_leave_lazy_mmu_mode();

	if (mm != &init_mm)
		pte_unmap_unlock(pte-1, ptl);
	return err;
}

static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;
	int err = 0;

	BUG_ON(pud_huge(*pud));

	if (create) {
		pmd = pmd_alloc_track(mm, pud, addr, mask);
		if (!pmd)
			return -ENOMEM;
	} else {
		pmd = pmd_offset(pud, addr);
	}
	do {
		next = pmd_addr_end(addr, end);
		if (create || !pmd_none_or_clear_bad(pmd)) {
			err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
						 create, mask);
			if (err)
				break;
		}
	} while (pmd++, addr = next, addr != end);
	return err;
}
static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;
	int err = 0;

	if (create) {
		pud = pud_alloc_track(mm, p4d, addr, mask);
		if (!pud)
			return -ENOMEM;
	} else {
		pud = pud_offset(p4d, addr);
	}
	do {
		next = pud_addr_end(addr, end);
		if (create || !pud_none_or_clear_bad(pud)) {
			err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
						 create, mask);
			if (err)
				break;
		}
	} while (pud++, addr = next, addr != end);
	return err;
}

static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;
	int err = 0;

	if (create) {
		p4d = p4d_alloc_track(mm, pgd, addr, mask);
		if (!p4d)
			return -ENOMEM;
	} else {
		p4d = p4d_offset(pgd, addr);
	}
	do {
		next = p4d_addr_end(addr, end);
		if (create || !p4d_none_or_clear_bad(p4d)) {
			err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
						 create, mask);
			if (err)
				break;
		}
	} while (p4d++, addr = next, addr != end);
	return err;
}

static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
				 unsigned long size, pte_fn_t fn,
				 void *data, bool create)
{
	pgd_t *pgd;
	unsigned long start = addr, next;
	unsigned long end = addr + size;
	pgtbl_mod_mask mask = 0;
	int err = 0;

	if (WARN_ON(addr >= end))
		return -EINVAL;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (!create && pgd_none_or_clear_bad(pgd))
			continue;
		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, start + size);

	return err;
}

/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
			unsigned long size, pte_fn_t fn, void *data)
{
	return __apply_to_page_range(mm, addr, size, fn, data, true);
}
EXPORT_SYMBOL_GPL(apply_to_page_range);

/*
 * Scan a region of virtual memory, calling a provided function on
 * each leaf page table where it exists.
 *
 * Unlike apply_to_page_range, this does _not_ fill in page tables
 * where they are absent.
 */
int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
				 unsigned long size, pte_fn_t fn, void *data)
{
	return __apply_to_page_range(mm, addr, size, fn, data, false);
}
EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
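/*
 * A minimal sketch (not part of this file) of driving the walk above with
 * a pte_fn_t callback: count the populated PTEs in a range without
 * allocating page tables.  The callback name is hypothetical.
 *
 *	static int example_count_pte(pte_t *pte, unsigned long addr, void *data)
 *	{
 *		unsigned long *count = data;
 *
 *		(*count)++;
 *		return 0;
 *	}
 *
 *	unsigned long nr_present = 0;
 *	apply_to_existing_page_range(mm, addr, size, example_count_pte,
 *				     &nr_present);
 *
 * With create == false the callback only runs for PTEs that are already
 * populated, so the count is the number of present leaf entries.
 */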
/*
 * handle_pte_fault chooses page fault handler according to an entry which was
 * read non-atomically.  Before making any commitment, on those architectures
 * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
 * parts, do_swap_page must check under lock before unmapping the pte and
 * proceeding (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page can safely check later on).
 */
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
				pte_t *page_table, pte_t orig_pte)
{
	int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
	if (sizeof(pte_t) > sizeof(unsigned long)) {
		spinlock_t *ptl = pte_lockptr(mm, pmd);
		spin_lock(ptl);
		same = pte_same(*page_table, orig_pte);
		spin_unlock(ptl);
	}
#endif
	pte_unmap(page_table);
	return same;
}

static inline bool cow_user_page(struct page *dst, struct page *src,
				 struct vm_fault *vmf)
{
	bool ret;
	void *kaddr;
	void __user *uaddr;
	bool locked = false;
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr = vmf->address;

	if (likely(src)) {
		copy_user_highpage(dst, src, addr, vma);
		return true;
	}

	/*
	 * If the source page was a PFN mapping, we don't have
	 * a "struct page" for it. We do a best-effort copy by
	 * just copying from the original user address. If that
	 * fails, we just zero-fill it. Live with it.
	 */
	kaddr = kmap_atomic(dst);
	uaddr = (void __user *)(addr & PAGE_MASK);

	/*
	 * On architectures with software "accessed" bits, we would
	 * take a double page fault, so mark it accessed here.
	 */
	if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
		pte_t entry;

		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
		locked = true;
		if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
			/*
			 * Another thread has already handled the fault;
			 * just update the local TLB.
			 */
			update_mmu_tlb(vma, addr, vmf->pte);
			ret = false;
			goto pte_unlock;
		}

		entry = pte_mkyoung(vmf->orig_pte);
		if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
			update_mmu_cache(vma, addr, vmf->pte);
	}

	/*
	 * This really shouldn't fail, because the page is there
	 * in the page tables. But it might just be unreadable,
	 * in which case we just give up and fill the result with
	 * zeroes.
	 */
	if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
		if (locked)
			goto warn;

		/* Re-validate under PTL if the page is still mapped */
		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
		locked = true;
		if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
			/* The PTE changed under us, update local tlb */
			update_mmu_tlb(vma, addr, vmf->pte);
			ret = false;
			goto pte_unlock;
		}

		/*
		 * The same page may have been mapped back in since the last
		 * copy attempt.  Try to copy again under the PTL.
		 */
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
			/*
			 * Warn in case there is some obscure use-case
			 * relying on this succeeding.
			 */
warn:
			WARN_ON_ONCE(1);
			clear_page(kaddr);
		}
	}

	ret = true;

pte_unlock:
	if (locked)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	kunmap_atomic(kaddr);
	flush_dcache_page(dst);

	return ret;
}

static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
	struct file *vm_file = vma->vm_file;

	if (vm_file)
		return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;

	/*
	 * Special mappings (e.g. VDSO) do not have any file so fake
	 * a default GFP_KERNEL for them.
	 */
	return GFP_KERNEL;
}

/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 */
static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
{
	vm_fault_t ret;
	struct page *page = vmf->page;
	unsigned int old_flags = vmf->flags;

	vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;

	if (vmf->vma->vm_file &&
	    IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
		return VM_FAULT_SIGBUS;

	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
	/* Restore original flags so that caller is not surprised */
	vmf->flags = old_flags;
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
		return ret;
	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
		lock_page(page);
		if (!page->mapping) {
			unlock_page(page);
			return 0; /* retry */
		}
		ret |= VM_FAULT_LOCKED;
	} else
		VM_BUG_ON_PAGE(!PageLocked(page), page);
	return ret;
}
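/*
 * A minimal sketch (not part of this file) of the ->page_mkwrite callback
 * this function invokes, as a filesystem or driver might implement it.
 * Returning VM_FAULT_LOCKED with the page locked satisfies the contract
 * checked above; the names are hypothetical.
 *
 *	static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
 *	{
 *		struct page *page = vmf->page;
 *		struct inode *inode = file_inode(vmf->vma->vm_file);
 *
 *		lock_page(page);
 *		if (page->mapping != inode->i_mapping) {
 *			unlock_page(page);
 *			return 0;
 *		}
 *		wait_for_stable_page(page);
 *		return VM_FAULT_LOCKED;
 *	}
 */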
/*
 * Handle dirtying of a page in shared file mapping on a write fault.
 *
 * The function expects the page to be locked and unlocks it.
 */
static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct address_space *mapping;
	struct page *page = vmf->page;
	bool dirtied;
	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;

	dirtied = set_page_dirty(page);
	VM_BUG_ON_PAGE(PageAnon(page), page);
	/*
	 * Take a local copy of the address_space - page.mapping may be zeroed
	 * by truncate after unlock_page().  The address_space itself remains
	 * pinned by vma->vm_file's reference. We rely on unlock_page()'s
	 * release semantics to prevent the compiler from undoing this copying.
	 */
	mapping = page_rmapping(page);
	unlock_page(page);

	if (!page_mkwrite)
		file_update_time(vma->vm_file);

	/*
	 * Throttle page dirtying rate down to writeback speed.
	 *
	 * mapping may be NULL here because some device drivers do not
	 * set page.mapping but still dirty their pages.
	 *
	 * Drop the mmap_lock before waiting on IO, if we can. The file
	 * is pinning the mapping, as per above.
	 */
	if ((dirtied || page_mkwrite) && mapping) {
		struct file *fpin;

		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
		balance_dirty_pages_ratelimited(mapping);
		if (fpin) {
			fput(fpin);
			return VM_FAULT_RETRY;
		}
	}

	return 0;
}

/*
 * Handle write page faults for pages that can be reused in the current vma
 *
 * This can happen either due to the mapping having the VM_SHARED flag set,
 * or due to us being the last reference standing to the page. In either
 * case, all we need to do here is to mark the page as writable and update
 * any related book-keeping.
 */
static inline void wp_page_reuse(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = vmf->page;
	pte_t entry;
	/*
	 * Clear the page's cpupid information as the existing
	 * information potentially belongs to a now completely
	 * unrelated process.
	 */
	if (page)
		page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);

	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
	entry = pte_mkyoung(vmf->orig_pte);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
		update_mmu_cache(vma, vmf->address, vmf->pte);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	count_vm_event(PGREUSE);
}

/*
 * Handle the case of a page which we actually need to copy to a new page.
 *
 * Called with mmap_lock locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 */
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct page *old_page = vmf->page;
	struct page *new_page = NULL;
	pte_t entry;
	int page_copied = 0;
	struct mmu_notifier_range range;

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
		new_page = alloc_zeroed_user_highpage_movable(vma,
							      vmf->address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
				vmf->address);
		if (!new_page)
			goto oom;

		if (!cow_user_page(new_page, old_page, vmf)) {
			/*
			 * COW failed; if the fault was resolved by another
			 * thread, that's fine.  If not, userspace will
			 * re-fault at the same address and we will handle
			 * the fault on the second attempt.
			 */
			put_page(new_page);
			if (old_page)
				put_page(old_page);
			return 0;
		}
	}

	if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
		goto oom_free_new;
	cgroup_throttle_swaprate(new_page, GFP_KERNEL);

	__SetPageUptodate(new_page);

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				vmf->address & PAGE_MASK,
				(vmf->address & PAGE_MASK) + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	/*
	 * Re-check the pte - we dropped the lock
	 */
	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm,
						mm_counter_file(old_page));
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else {
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		}
		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = pte_sw_mkyoung(entry);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/*
		 * Clear the pte entry and flush it first, before updating the
		 * pte with the new entry. This will avoid a race condition
		 * seen in the presence of one thread doing SMC and another
		 * thread doing COW.
		 */
		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
		page_add_new_anon_rmap(new_page, vma, vmf->address, false);
		if (vma->vm_flags & VM_PURGEABLE) {
			pr_info("set wp new page %lx purgeable\n", page_to_pfn(new_page));
			SetPagePurgeable(new_page);
			uxpte_set_present(vma, vmf->address);
		}
		lru_cache_add_inactive_or_unevictable(new_page, vma);
		/*
		 * We call the notify macro here because, when using secondary
		 * mmu page tables (such as kvm shadow page tables), we want the
		 * new page to be mapped directly into the secondary page table.
		 */
		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
		update_mmu_cache(vma, vmf->address, vmf->pte);
		xpm_integrity_update_hook(vma, vmf->flags, new_page);
		if (old_page) {
			/*
			 * Only after switching the pte to the new page may
			 * we remove the mapcount here. Otherwise another
			 * process may come and find the rmap count decremented
			 * before the pte is switched to the new page, and
			 * "reuse" the old page writing into it while our pte
			 * here still points into it and can be read by other
			 * threads.
			 *
			 * The critical issue is to order this
			 * page_remove_rmap with the ptep_clear_flush above.
			 * Those stores are ordered by (if nothing else,)
			 * the barrier present in the atomic_add_negative
			 * in page_remove_rmap.
			 *
			 * Then the TLB flush in ptep_clear_flush ensures that
			 * no process can access the old page before the
			 * decremented mapcount is visible. And the old page
			 * cannot be reused until after the decremented
			 * mapcount is visible. So transitively, TLBs to
			 * old page will be flushed before it can be reused.
			 */
			page_remove_rmap(old_page, false);
		}

		/* Free the old page.. */
		new_page = old_page;
		page_copied = 1;
	} else {
		update_mmu_tlb(vma, vmf->address, vmf->pte);
	}

	if (new_page)
		put_page(new_page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	/*
	 * No need to double call mmu_notifier->invalidate_range() callback as
	 * the above ptep_clear_flush_notify() did already call it.
	 */
	mmu_notifier_invalidate_range_only_end(&range);
	if (old_page) {
		/*
		 * Don't let another task, with possibly unlocked vma,
		 * keep the mlocked page.
		 */
		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);	/* LRU manipulation */
			if (PageMlocked(old_page))
				munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		put_page(old_page);
	}
	return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
	put_page(new_page);
oom:
	if (old_page)
		put_page(old_page);
	return VM_FAULT_OOM;
}

/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *			  writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
 * It handles locking of PTE and modifying it.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 *
 * Return: %0 on success, %VM_FAULT_NOPAGE when the PTE changed before we
 * acquired the PTE lock, or %VM_FAULT_SIGSEGV when integrity validation of
 * the page fails.
 */
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);
	/*
	 * We might have raced with another page fault while we released the
	 * pte_offset_map_lock.
	 */
	if (!pte_same(*vmf->pte, vmf->orig_pte)) {
		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return VM_FAULT_NOPAGE;
	}

	if (unlikely(xpm_integrity_validate_hook(vmf->vma, vmf->flags,
		vmf->address, vmf->page))) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return VM_FAULT_SIGSEGV;
	}

	wp_page_reuse(vmf);
	return 0;
}

/*
 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
 * mapping
 */
static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
		vm_fault_t ret;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		vmf->flags |= FAULT_FLAG_MKWRITE;
		ret = vma->vm_ops->pfn_mkwrite(vmf);
		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
			return ret;
		return finish_mkwrite_fault(vmf);
	}
	wp_page_reuse(vmf);
	return VM_FAULT_WRITE;
}
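/*
 * A minimal sketch (not part of this file) of the ->pfn_mkwrite handler
 * used above.  Returning 0 lets the core re-validate the PTE in
 * finish_mkwrite_fault() and mark it writable; the device structure and
 * its counter are hypothetical.
 *
 *	static vm_fault_t example_pfn_mkwrite(struct vm_fault *vmf)
 *	{
 *		struct example_dev *dev = vmf->vma->vm_private_data;
 *
 *		atomic_inc(&dev->writable_faults);
 *		return 0;
 *	}
 */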
static vm_fault_t wp_page_shared(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret = VM_FAULT_WRITE;

	get_page(vmf->page);

	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
		vm_fault_t tmp;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		tmp = do_page_mkwrite(vmf);
		if (unlikely(!tmp || (tmp &
				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			put_page(vmf->page);
			return tmp;
		}
		tmp = finish_mkwrite_fault(vmf);
		if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
			unlock_page(vmf->page);
			put_page(vmf->page);
			return tmp;
		}
	} else {
		if (unlikely(xpm_integrity_validate_hook(vmf->vma, vmf->flags,
			vmf->address, vmf->page))) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			put_page(vmf->page);
			return VM_FAULT_SIGSEGV;
		}

		wp_page_reuse(vmf);
		lock_page(vmf->page);
	}
	ret |= fault_dirty_shared_page(vmf);
	put_page(vmf->page);

	return ret;
}

/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;

	if (userfaultfd_pte_wp(vma, *vmf->pte)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return handle_userfault(vmf, VM_UFFD_WP);
	}

	/*
	 * Userfaultfd write-protect can defer flushes. Ensure the TLB
	 * is flushed in this case before copying.
	 */
	if (unlikely(userfaultfd_wp(vmf->vma) &&
		     mm_tlb_flush_pending(vmf->vma->vm_mm)))
		flush_tlb_page(vmf->vma, vmf->address);

	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
	if (!vmf->page) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
		 * VM_PFNMAP VMA.
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			return wp_pfn_shared(vmf);

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return wp_page_copy(vmf);
	}

	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
	if (PageAnon(vmf->page)) {
		struct page *page = vmf->page;

		/* PageKsm() doesn't necessarily raise the page refcount */
		if (PageKsm(page) || page_count(page) != 1)
			goto copy;
		if (!trylock_page(page))
			goto copy;
		if (PageKsm(page) || page_mapcount(page) != 1 ||
		    page_count(page) != 1) {
			unlock_page(page);
			goto copy;
		}
		/*
		 * Ok, we've got the only map reference, and the only
		 * page count reference, and the page is locked,
		 * it's dark out, and we're wearing sunglasses. Hit it.
		 */
		unlock_page(page);

		if (unlikely(xpm_integrity_validate_hook(vmf->vma, vmf->flags,
			vmf->address, vmf->page))) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			return VM_FAULT_SIGSEGV;
		}

		wp_page_reuse(vmf);
		return VM_FAULT_WRITE;
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		return wp_page_shared(vmf);
	}
copy:
	/*
	 * Ok, we need to copy. Oh, well..
	 */
	get_page(vmf->page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return wp_page_copy(vmf);
}

static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}

static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
					    struct zap_details *details)
{
	struct vm_area_struct *vma;
	pgoff_t vba, vea, zba, zea;

	vma_interval_tree_foreach(vma, root,
			details->first_index, details->last_index) {

		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma) - 1;
		zba = details->first_index;
		if (zba < vba)
			zba = vba;
		zea = details->last_index;
		if (zea > vea)
			zea = vea;

		unmap_mapping_range_vma(vma,
			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
				details);
	}
}

/**
 * unmap_mapping_page() - Unmap single page from processes.
 * @page: The locked page to be unmapped.
 *
 * Unmap this page from any userspace process which still has it mmaped.
 * Typically, for efficiency, the range of nearby pages has already been
 * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
 * truncation or invalidation holds the lock on a page, it may find that
 * the page has been remapped again: and then uses unmap_mapping_page()
 * to unmap it finally.

/**
 * unmap_mapping_page() - Unmap single page from processes.
 * @page: The locked page to be unmapped.
 *
 * Unmap this page from any userspace process which still has it mmaped.
 * Typically, for efficiency, the range of nearby pages has already been
 * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
 * truncation or invalidation holds the lock on a page, it may find that
 * the page has been remapped again: and then uses unmap_mapping_page()
 * to unmap it finally.
 */
void unmap_mapping_page(struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct zap_details details = { };

	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(PageTail(page));

	details.check_mapping = mapping;
	details.first_index = page->index;
	details.last_index = page->index + thp_nr_pages(page) - 1;
	details.single_page = page;

	i_mmap_lock_write(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
		unmap_mapping_range_tree(&mapping->i_mmap, &details);
	i_mmap_unlock_write(mapping);
}

/**
 * unmap_mapping_pages() - Unmap pages from processes.
 * @mapping: The address space containing pages to be unmapped.
 * @start: Index of first page to be unmapped.
 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
 * @even_cows: Whether to unmap even private COWed pages.
 *
 * Unmap the pages in this address space from any userspace process which
 * has them mmaped.  Generally, you want to remove COWed pages as well when
 * a file is being truncated, but not when invalidating pages from the page
 * cache.
 */
void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
		pgoff_t nr, bool even_cows)
{
	struct zap_details details = { };

	details.check_mapping = even_cows ? NULL : mapping;
	details.first_index = start;
	details.last_index = start + nr - 1;
	if (details.last_index < details.first_index)
		details.last_index = ULONG_MAX;

	i_mmap_lock_write(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
		unmap_mapping_range_tree(&mapping->i_mmap, &details);
	i_mmap_unlock_write(mapping);
}
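
#if 0
/*
 * Usage sketch, not built: the two flavours of caller.  Truncation wants
 * private COWs gone too; pagecache invalidation must preserve them.  The
 * indices here are illustrative only.
 */
static void example_unmap_flavours(struct address_space *mapping)
{
	/* truncating: unmap everything from page 16 to EOF, COWs included */
	unmap_mapping_pages(mapping, 16, 0, true);
	/* invalidating: unmap pages 16..31 but keep private COWed copies */
	unmap_mapping_pages(mapping, 16, 16, false);
}
#endif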

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen, int even_cows)
{
	pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
	pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT;

	/* Check for overflow. */
	if (sizeof(holelen) > sizeof(hlen)) {
		long long holeend =
			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (holeend & ~(long long)ULONG_MAX)
			hlen = ULONG_MAX - hba + 1;
	}

	unmap_mapping_pages(mapping, hba, hlen, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);
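
/*
 * Example of the rounding (illustrative, 4K pages): holebegin == 5000 and
 * holelen == 3000 give hba == 1 and hlen == 1, so exactly the page
 * covering bytes 4096..8191 is unmapped.  A holelen of 0 yields hlen == 0,
 * which unmap_mapping_pages() above turns into "to end of file".
 */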

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_lock locked or unlocked in the same cases
 * as does filemap_fault().
 */
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = NULL, *swapcache;
	swp_entry_t entry;
	pte_t pte;
	int locked;
	int exclusive = 0;
	vm_fault_t ret = 0;
	void *shadow = NULL;

	if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
		goto out;

	entry = pte_to_swp_entry(vmf->orig_pte);
	if (unlikely(non_swap_entry(entry))) {
		if (is_migration_entry(entry)) {
			migration_entry_wait(vma->vm_mm, vmf->pmd,
					     vmf->address);
		} else if (is_device_private_entry(entry)) {
			vmf->page = device_private_entry_to_page(entry);
			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
					vmf->address, &vmf->ptl);
			if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
				spin_unlock(vmf->ptl);
				goto out;
			}

			/*
			 * Get a page reference while we know the page can't be
			 * freed.
			 */
			get_page(vmf->page);
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			vmf->page->pgmap->ops->migrate_to_ram(vmf);
			put_page(vmf->page);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else {
			print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		}
		goto out;
	}

	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry, vma, vmf->address);
	swapcache = page;

	if (!page) {
		struct swap_info_struct *si = swp_swap_info(entry);

		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
		    __swap_count(entry) == 1) {
			/* skip swapcache */
			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
							vmf->address);
			if (page) {
				int err;

				__SetPageLocked(page);
				__SetPageSwapBacked(page);
				set_page_private(page, entry.val);

				/* Tell memcg to use swap ownership records */
				SetPageSwapCache(page);
				err = mem_cgroup_charge(page, vma->vm_mm,
							GFP_KERNEL);
				ClearPageSwapCache(page);
				if (err) {
					ret = VM_FAULT_OOM;
					goto out_page;
				}

				shadow = get_shadow_from_swap_cache(entry);
				if (shadow)
					workingset_refault(page, shadow);

				lru_cache_add(page);
				swap_readpage(page, true);
			}
		} else {
			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
						vmf);
			swapcache = page;
		}

		if (!page) {
			/*
			 * Back out if somebody else faulted in this pte
			 * while we released the pte lock.
			 */
			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
					vmf->address, &vmf->ptl);
			if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
			goto unlock;
		}

		/* Had to read the page from swap area: Major fault */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
	} else if (PageHWPoison(page)) {
		/*
		 * hwpoisoned dirty swapcache pages are kept for killing
		 * owner processes (which may be unknown at hwpoison time)
		 */
		ret = VM_FAULT_HWPOISON;
		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
		goto out_release;
	}

	locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);

	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	if (!locked) {
		ret |= VM_FAULT_RETRY;
		goto out_release;
	}

	/*
	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
	 * release the swapcache from under us.  The page pin, and pte_same
	 * test below, are not enough to exclude that.  Even if it is still
	 * swapcache, we need to check that the page's swap has not changed.
	 */
	if (unlikely((!PageSwapCache(page) ||
			page_private(page) != entry.val)) && swapcache)
		goto out_page;

	page = ksm_might_need_to_copy(page, vma, vmf->address);
	if (unlikely(!page)) {
		ret = VM_FAULT_OOM;
		page = swapcache;
		goto out_page;
	}

	cgroup_throttle_swaprate(page, GFP_KERNEL);

	/*
	 * Back out if somebody else already faulted in this pte.
	 */
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/*
	 * The page isn't present yet, go ahead with the fault.
	 *
	 * Be careful about the sequence of operations here.
	 * To get its accounting right, reuse_swap_page() must be called
	 * while the page is counted on swap but not yet in mapcount i.e.
	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
	 * must be called after the swap_free(), or it will never succeed.
	 */
	if (unlikely(xpm_integrity_validate_hook(vmf->vma, vmf->flags,
			vmf->address, page))) {
		ret = VM_FAULT_SIGSEGV;
		goto out_nomap;
	}

	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
	pte = mk_pte(page, vma->vm_page_prot);
	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		vmf->flags &= ~FAULT_FLAG_WRITE;
		ret |= VM_FAULT_WRITE;
		exclusive = RMAP_EXCLUSIVE;
	}
	flush_icache_page(vma, page);
	if (pte_swp_soft_dirty(vmf->orig_pte))
		pte = pte_mksoft_dirty(pte);
	if (pte_swp_uffd_wp(vmf->orig_pte)) {
		pte = pte_mkuffd_wp(pte);
		pte = pte_wrprotect(pte);
	}
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
	vmf->orig_pte = pte;

	/* ksm created a completely new copy */
	if (unlikely(page != swapcache && swapcache)) {
		page_add_new_anon_rmap(page, vma, vmf->address, false);
		lru_cache_add_inactive_or_unevictable(page, vma);
	} else {
		do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
	}

	swap_free(entry);
	if (mem_cgroup_swap_full(page) ||
	    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);
	if (page != swapcache && swapcache) {
		/*
		 * Hold the lock to avoid the swap entry to be reused
		 * until we take the PT lock for the pte_same() check
		 * (to avoid false positives from pte_same). For
		 * further safety release the lock after the swap_free
		 * so that the swap count won't change under a
		 * parallel locked swapcache.
		 */
		unlock_page(swapcache);
		put_page(swapcache);
	}

	if (vmf->flags & FAULT_FLAG_WRITE) {
		ret |= do_wp_page(vmf);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
	return ret;
out_nomap:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
	unlock_page(page);
out_release:
	put_page(page);
	if (page != swapcache && swapcache) {
		unlock_page(swapcache);
		put_page(swapcache);
	}
	return ret;
}
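
#if 0
/*
 * Sketch, not built: the pte reconstruction performed by do_swap_page()
 * above, gathered in one place.  Write access is granted right away only
 * when reuse_swap_page() says we own the page outright; a uffd
 * write-protect mark wins over writability.
 */
static pte_t example_swapin_pte(struct vm_fault *vmf, struct page *page)
{
	pte_t pte = mk_pte(page, vmf->vma->vm_page_prot);

	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL))
		pte = maybe_mkwrite(pte_mkdirty(pte), vmf->vma);
	if (pte_swp_soft_dirty(vmf->orig_pte))
		pte = pte_mksoft_dirty(pte);
	if (pte_swp_uffd_wp(vmf->orig_pte))
		pte = pte_wrprotect(pte_mkuffd_wp(pte));
	return pte;
}
#endif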

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	vm_fault_t ret = 0;
	pte_t entry;

	/* File mapping without ->vm_ops ? */
	if (vma->vm_flags & VM_SHARED)
		return VM_FAULT_SIGBUS;

	/*
	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
	 * pte_offset_map() on pmds where a huge pmd might be created
	 * from a different thread.
	 *
	 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
	 * parallel threads are excluded by other means.
	 *
	 * Here we only have mmap_read_lock(mm).
	 */
	if (pte_alloc(vma->vm_mm, vmf->pmd))
		return VM_FAULT_OOM;

	/* See the comment in pte_alloc_one_map() */
	if (unlikely(pmd_trans_unstable(vmf->pmd)))
		return 0;

	/* use extra page table for userexpte */
	if (vma->vm_flags & VM_USEREXPTE) {
		if (do_uxpte_page_fault(vmf, &entry))
			goto oom;

		if (xpm_integrity_check_hook(vma, vmf->flags, vmf->address,
					     pte_page(entry)))
			return VM_FAULT_SIGSEGV;
		else
			goto got_page;
	}

	/* Use the zero-page for reads */
	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm)) {
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
						vma->vm_page_prot));
got_page:
		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
				vmf->address, &vmf->ptl);
		if (!pte_none(*vmf->pte)) {
			update_mmu_tlb(vma, vmf->address, vmf->pte);
			goto unlock;
		}
		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock;
		/* Deliver the page fault to userland, check inside PT lock */
		if (userfaultfd_missing(vma)) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			return handle_userfault(vmf, VM_UFFD_MISSING);
		}
		goto setpte;
	}

	/* Allocate our own private page. */
	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
	page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
	if (!page)
		goto oom;

	if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
		goto oom_free_page;
	cgroup_throttle_swaprate(page, GFP_KERNEL);

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	entry = mk_pte(page, vma->vm_page_prot);
	entry = pte_sw_mkyoung(entry);
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	if (!pte_none(*vmf->pte)) {
		update_mmu_cache(vma, vmf->address, vmf->pte);
		goto release;
	}

	ret = check_stable_address_space(vma->vm_mm);
	if (ret)
		goto release;

	/* Deliver the page fault to userland, check inside PT lock */
	if (userfaultfd_missing(vma)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		put_page(page);
		return handle_userfault(vmf, VM_UFFD_MISSING);
	}

	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, vma, vmf->address, false);
	if (vma->vm_flags & VM_PURGEABLE)
		SetPagePurgeable(page);

	lru_cache_add_inactive_or_unevictable(page, vma);
setpte:
	if (vma->vm_flags & VM_PURGEABLE)
		uxpte_set_present(vma, vmf->address);

	if (!pte_special(entry))
		xpm_integrity_update_hook(vma, vmf->flags, page);

	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;
release:
	put_page(page);
	goto unlock;
oom_free_page:
	put_page(page);
oom:
	return VM_FAULT_OOM;
}
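
#if 0
/*
 * Sketch, not built: the read-fault fast path of do_anonymous_page()
 * above in isolation -- readers get the shared zero page mapped as a
 * special, read-only pte instead of a freshly allocated page.
 */
static pte_t example_zero_page_pte(struct vm_fault *vmf)
{
	return pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
				     vmf->vma->vm_page_prot));
}
#endif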

/*
 * The mmap_lock must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret;

	/*
	 * Preallocate pte before we take page_lock because this might lead to
	 * deadlocks for memcg reclaim which waits for pages under writeback:
	 *				lock_page(A)
	 *				SetPageWriteback(A)
	 *				unlock_page(A)
	 * lock_page(B)
	 *				lock_page(B)
	 * pte_alloc_one
	 *   shrink_page_list
	 *     wait_on_page_writeback(A)
	 *				SetPageWriteback(B)
	 *				unlock_page(B)
	 *				# flush A, B to clear the writeback
	 */
	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
		if (!vmf->prealloc_pte)
			return VM_FAULT_OOM;
		smp_wmb(); /* See comment in __pte_alloc() */
	}

	ret = vma->vm_ops->fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
			    VM_FAULT_DONE_COW)))
		return ret;

	if (unlikely(PageHWPoison(vmf->page))) {
		struct page *page = vmf->page;
		vm_fault_t poisonret = VM_FAULT_HWPOISON;
		if (ret & VM_FAULT_LOCKED) {
			if (page_mapped(page))
				unmap_mapping_pages(page_mapping(page),
						    page->index, 1, false);
			/* Retry if a clean page was removed from the cache. */
			if (invalidate_inode_page(page))
				poisonret = VM_FAULT_NOPAGE;
			unlock_page(page);
		}
		put_page(page);
		vmf->page = NULL;
		return poisonret;
	}

	if (unlikely(!(ret & VM_FAULT_LOCKED)))
		lock_page(vmf->page);
	else
		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);

	return ret;
}
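
#if 0
/*
 * Sketch, not built: the preallocation pattern described by the deadlock
 * diagram in __do_fault() above, in isolation.  The pte table is taken
 * before any page lock so that reclaim entered from pte_alloc_one() can
 * never wait on a page we already locked.
 */
static vm_fault_t example_prealloc_pte(struct vm_fault *vmf)
{
	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
		if (!vmf->prealloc_pte)
			return VM_FAULT_OOM;
		smp_wmb(); /* See comment in __pte_alloc() */
	}
	return 0;
}
#endif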

/*
 * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
 * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
 * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
 * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
 */
static int pmd_devmap_trans_unstable(pmd_t *pmd)
{
	return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
}

static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	if (!pmd_none(*vmf->pmd))
		goto map_pte;
	if (vmf->prealloc_pte) {
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		if (unlikely(!pmd_none(*vmf->pmd))) {
			spin_unlock(vmf->ptl);
			goto map_pte;
		}

		mm_inc_nr_ptes(vma->vm_mm);
		pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
		spin_unlock(vmf->ptl);
		vmf->prealloc_pte = NULL;
	} else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
		return VM_FAULT_OOM;
	}
map_pte:
	/*
	 * If a huge pmd materialized under us just retry later.  Use
	 * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
	 * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
	 * under us and then back to pmd_none, as a result of MADV_DONTNEED
	 * running immediately after a huge pmd fault in a different thread of
	 * this mm, in turn leading to a misleading pmd_trans_huge() retval.
	 * All we have to ensure is that it is a regular pmd that we can walk
	 * with pte_offset_map() and we can do that through an atomic read in
	 * C, which is what pmd_trans_unstable() provides.
	 */
	if (pmd_devmap_trans_unstable(vmf->pmd))
		return VM_FAULT_NOPAGE;

	/*
	 * At this point we know that our vmf->pmd points to a page of ptes
	 * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
	 * for the duration of the fault.  If a racing MADV_DONTNEED runs and
	 * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
	 * be valid and we will re-check to make sure the vmf->pte isn't
	 * pte_none() under vmf->ptl protection when we return to
	 * alloc_set_pte().
	 */
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	return 0;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
	/*
	 * We are going to consume the prealloc table,
	 * count that as nr_ptes.
	 */
	mm_inc_nr_ptes(vma->vm_mm);
	vmf->prealloc_pte = NULL;
}
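
/*
 * Illustration: where arch_needs_pgtable_deposit() is true (e.g. ppc64,
 * as the comment in do_set_pmd() below notes), the table deposited here
 * is withdrawn again when the huge pmd is split, so the split path never
 * has to allocate memory.
 */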

static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
	struct vm_area_struct *vma = vmf->vma;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	pmd_t entry;
	int i;
	vm_fault_t ret = VM_FAULT_FALLBACK;

	if (!transhuge_vma_suitable(vma, haddr))
		return ret;

	page = compound_head(page);
	if (compound_order(page) != HPAGE_PMD_ORDER)
		return ret;

	/*
	 * Archs like ppc64 need additional space to store information
	 * related to pte entry. Use the preallocated table for that.
	 */
	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
		if (!vmf->prealloc_pte)
			return VM_FAULT_OOM;
		smp_wmb(); /* See comment in __pte_alloc() */
	}

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd)))
		goto out;

	for (i = 0; i < HPAGE_PMD_NR; i++)
		flush_icache_page(vma, page + i);

	entry = mk_huge_pmd(page, vma->vm_page_prot);
	if (write)
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);

	add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
	page_add_file_rmap(page, true);
	/*
	 * deposit and withdraw with pmd lock held
	 */
	if (arch_needs_pgtable_deposit())
		deposit_prealloc_pte(vmf);

	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);

	update_mmu_cache_pmd(vma, haddr, vmf->pmd);

	/* fault is handled */
	ret = 0;
	count_vm_event(THP_FILE_MAPPED);
out:
	spin_unlock(vmf->ptl);
	return ret;
}
#else
static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
	BUILD_BUG();
	return 0;
}
#endif
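
#if 0
/*
 * Sketch, not built: the gate at the top of do_set_pmd() above -- a file
 * page can be mapped with a single pmd only when the vma can hold an
 * aligned huge page at this address and the page really is a PMD-sized
 * compound page.
 */
static bool example_can_map_with_pmd(struct vm_area_struct *vma,
				     unsigned long haddr, struct page *page)
{
	return transhuge_vma_suitable(vma, haddr) &&
	       compound_order(compound_head(page)) == HPAGE_PMD_ORDER;
}
#endif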

/**
 * alloc_set_pte - setup new PTE entry for given page and add reverse page
 * mapping. If needed, the function allocates a page table or uses a
 * pre-allocated one.
 *
 * @vmf: fault environment
 * @page: page to map
 *
 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
 * return.
 *
 * Target users are page handler itself and implementations of
 * vm_ops->map_pages.
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
{
	struct vm_area_struct *vma = vmf->vma;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	pte_t entry;
	vm_fault_t ret;

	if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
		ret = do_set_pmd(vmf, page);
		if (ret != VM_FAULT_FALLBACK)
			return ret;
	}

	if (!vmf->pte) {
		ret = pte_alloc_one_map(vmf);
		if (ret)
			return ret;
	}

	/* Re-check under ptl */
	if (unlikely(!pte_none(*vmf->pte))) {
		update_mmu_tlb(vma, vmf->address, vmf->pte);
		return VM_FAULT_NOPAGE;
	}

	/* Check for conflicting xpm integrity flags */
	if (unlikely(xpm_integrity_validate_hook(vmf->vma, vmf->flags,
			vmf->address, page)))
		return VM_FAULT_SIGSEGV;

	flush_icache_page(vma, page);
	entry = mk_pte(page, vma->vm_page_prot);
	entry = pte_sw_mkyoung(entry);
	if (write)
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	/* copy-on-write page */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
		page_add_new_anon_rmap(page, vma, vmf->address, false);
		lru_cache_add_inactive_or_unevictable(page, vma);
	} else {
		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
		page_add_file_rmap(page, false);
	}
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

	/* no need to invalidate: a not-present page won't be cached */
	update_mmu_cache(vma, vmf->address, vmf->pte);

	return 0;
}

/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
 * given page, adds reverse page mapping, handles memcg charges and LRU
 * addition.
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
vm_fault_t finish_fault(struct vm_fault *vmf)
{
	struct page *page;
	vm_fault_t ret = 0;

	/* Did we COW the page? */
	if ((vmf->flags & FAULT_FLAG_WRITE) &&
	    !(vmf->vma->vm_flags & VM_SHARED))
		page = vmf->cow_page;
	else
		page = vmf->page;

	/*
	 * check even for read faults because we might have lost our CoWed
	 * page
	 */
	if (!(vmf->vma->vm_flags & VM_SHARED))
		ret = check_stable_address_space(vmf->vma->vm_mm);
	if (!ret)
		ret = alloc_set_pte(vmf, page);
	if (vmf->pte)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;
}

static unsigned long fault_around_bytes __read_mostly =
	rounddown_pow_of_two(65536);

#ifdef CONFIG_DEBUG_FS
static int fault_around_bytes_get(void *data, u64 *val)
{
	*val = fault_around_bytes;
	return 0;
}

/*
 * fault_around_bytes must be rounded down to the nearest page order as it's
 * what do_fault_around() expects to see.
 */
static int fault_around_bytes_set(void *data, u64 val)
{
	if (val / PAGE_SIZE > PTRS_PER_PTE)
		return -EINVAL;
	if (val > PAGE_SIZE)
		fault_around_bytes = rounddown_pow_of_two(val);
	else
		fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
		fault_around_bytes_get, fault_around_bytes_set, "%llu\n");

static int __init fault_around_debugfs(void)
{
	debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
				   &fault_around_bytes_fops);
	return 0;
}
late_initcall(fault_around_debugfs);
#endif
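
/*
 * Usage note (assuming debugfs is mounted at /sys/kernel/debug): the
 * fault-around window can be tuned at runtime, e.g.
 *
 *	echo 16384 > /sys/kernel/debug/fault_around_bytes
 *
 * Values of PAGE_SIZE or less select a single page; larger values are
 * rounded down to a power of two, and anything above PTRS_PER_PTE pages
 * is rejected with -EINVAL.
 */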

/*
 * do_fault_around() tries to map a few pages around the fault address. The
 * hope is that the pages will be needed soon and this will lower the number
 * of faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function is called with the page table lock taken. In the split ptlock
 * case the page table lock only protects those entries which belong to
 * the page table corresponding to the fault address.
 *
 * This function doesn't cross the VMA boundaries, in order to call map_pages()
 * only once.
 *
 * fault_around_bytes defines how many bytes we'll try to map.
 * do_fault_around() expects it to be set to a power of two less than or equal
 * to PTRS_PER_PTE.
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_bytes rounded down to the machine page size
 * (and therefore to page order).  This way it's easier to guarantee
 * that we don't cross page table boundaries.
 */
static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
	unsigned long address = vmf->address, nr_pages, mask;
	pgoff_t start_pgoff = vmf->pgoff;
	pgoff_t end_pgoff;
	int off;
	vm_fault_t ret = 0;

	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

	vmf->address = max(address & mask, vmf->vma->vm_start);
	off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	start_pgoff -= off;

	/*
	 * end_pgoff is either the end of the page table, the end of
	 * the vma or nr_pages from start_pgoff, depending what is nearest.
	 */
	end_pgoff = start_pgoff -
		((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
		PTRS_PER_PTE - 1;
	end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
			start_pgoff + nr_pages - 1);

	if (pmd_none(*vmf->pmd)) {
		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
		if (!vmf->prealloc_pte)
			goto out;
		smp_wmb(); /* See comment in __pte_alloc() */
	}

	vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);

	/* Huge page is mapped? Page fault is solved */
	if (pmd_trans_huge(*vmf->pmd)) {
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/* ->map_pages() hasn't done anything useful. Cold page cache? */
	if (!vmf->pte)
		goto out;

	/* check if the page fault is solved */
	vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
	if (!pte_none(*vmf->pte))
		ret = VM_FAULT_NOPAGE;
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
	vmf->address = address;
	vmf->pte = NULL;
	return ret;
}
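
/*
 * Worked example (4K pages, fault_around_bytes == 65536): nr_pages == 16
 * and mask == ~0xffffUL, so a fault at 0x10007000 rewinds vmf->address to
 * 0x10000000 and the window is the sixteen ptes covering
 * 0x10000000..0x1000ffff, further clipped to the vma and to the end of
 * the pte page by the min3() above.
 */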

static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret = 0;

	/*
	 * Let's call ->map_pages() first and use ->fault() as fallback
	 * if page by the offset is not ready to be mapped (cold cache or
	 * something).
	 */
	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
		ret = do_fault_around(vmf);
		if (ret)
			return ret;
	}

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	ret |= finish_fault(vmf);
	unlock_page(vmf->page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		put_page(vmf->page);
	return ret;
}

static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret;

	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;

	vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
	if (!vmf->cow_page)
		return VM_FAULT_OOM;

	if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
		put_page(vmf->cow_page);
		return VM_FAULT_OOM;
	}
	cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;
	if (ret & VM_FAULT_DONE_COW)
		return ret;

	copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
	__SetPageUptodate(vmf->cow_page);

	ret |= finish_fault(vmf);
	unlock_page(vmf->page);
	put_page(vmf->page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;
	return ret;
uncharge_out:
	put_page(vmf->cow_page);
	return ret;
}

static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret, tmp;

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	/*
	 * Check if the backing address space wants to know that the page is
	 * about to become writable
	 */
	if (vma->vm_ops->page_mkwrite) {
		unlock_page(vmf->page);
		tmp = do_page_mkwrite(vmf);
		if (unlikely(!tmp ||
				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			put_page(vmf->page);
			return tmp;
		}
	}

	ret |= finish_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
					VM_FAULT_RETRY))) {
		unlock_page(vmf->page);
		put_page(vmf->page);
		return ret;
	}

	ret |= fault_dirty_shared_page(vmf);
	return ret;
}
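
#if 0
/*
 * Sketch, not built: the three-way dispatch implemented by do_fault()
 * below for vmas that do provide ->fault().
 */
static vm_fault_t example_fault_dispatch(struct vm_fault *vmf)
{
	if (!(vmf->flags & FAULT_FLAG_WRITE))
		return do_read_fault(vmf);	/* read fault */
	if (!(vmf->vma->vm_flags & VM_SHARED))
		return do_cow_fault(vmf);	/* private write: COW */
	return do_shared_fault(vmf);		/* shared write */
}
#endif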

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 * If mmap_lock is released, vma may become invalid (for example
 * by other thread calling munmap()).
 */
static vm_fault_t do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *vm_mm = vma->vm_mm;
	vm_fault_t ret;

	/*
	 * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
	 */
	if (!vma->vm_ops->fault) {
		/*
		 * If we find a migration pmd entry or a none pmd entry, which
		 * should never happen, return SIGBUS
		 */
		if (unlikely(!pmd_present(*vmf->pmd)))
			ret = VM_FAULT_SIGBUS;
		else {
			vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
						       vmf->pmd,
						       vmf->address,
						       &vmf->ptl);
			/*
			 * Make sure this is not a temporary clearing of pte
			 * by holding ptl and checking again. A R/M/W update
			 * of pte involves: take ptl, clearing the pte so that
			 * we don't have concurrent modification by hardware
			 * followed by an update.
			 */
			if (unlikely(pte_none(*vmf->pte)))
				ret = VM_FAULT_SIGBUS;
			else
				ret = VM_FAULT_NOPAGE;

			pte_unmap_unlock(vmf->pte, vmf->ptl);
		}
	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
		ret = do_read_fault(vmf);
	else if (!(vma->vm_flags & VM_SHARED))
		ret = do_cow_fault(vmf);
	else
		ret = do_shared_fault(vmf);

	/* preallocated pagetable is unused: free it */
	if (vmf->prealloc_pte) {
		pte_free(vm_mm, vmf->prealloc_pte);
		vmf->prealloc_pte = NULL;
	}
	return ret;
}

static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
				unsigned long addr, int page_nid,
				int *flags)
{
	get_page(page);

	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (page_nid == numa_node_id()) {
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
		*flags |= TNF_FAULT_LOCAL;
	}

	return mpol_misplaced(page, vma, addr);
}
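
#if 0
/*
 * Sketch, not built: the start/commit pattern do_numa_page() below uses
 * to turn a NUMA-protected pte back into a present one while keeping the
 * update safe against concurrent hardware updates of the pte.
 */
static void example_restore_numa_pte(struct vm_fault *vmf, bool was_writable)
{
	pte_t old_pte, pte;

	old_pte = ptep_modify_prot_start(vmf->vma, vmf->address, vmf->pte);
	pte = pte_modify(old_pte, vmf->vma->vm_page_prot);
	pte = pte_mkyoung(pte);
	if (was_writable)
		pte = pte_mkwrite(pte);
	ptep_modify_prot_commit(vmf->vma, vmf->address, vmf->pte,
				old_pte, pte);
}
#endif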

static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
				unsigned long addr, int page_nid,
				int *flags)
{
	get_page(page);

	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (page_nid == numa_node_id()) {
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
		*flags |= TNF_FAULT_LOCAL;
	}

	return mpol_misplaced(page, vma, addr);
}

static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = NULL;
	int page_nid = NUMA_NO_NODE;
	int last_cpupid;
	int target_nid;
	bool migrated = false;
	pte_t pte, old_pte;
	bool was_writable = pte_savedwrite(vmf->orig_pte);
	int flags = 0;

	/*
	 * The "pte" at this point cannot be used safely without
	 * validation through pte_unmap_same().  It's of NUMA type but
	 * the pfn may be screwed if the read is non-atomic.
	 */
	vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
	spin_lock(vmf->ptl);
	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		goto out;
	}

	/*
	 * Make it present again.  Depending on how the arch implements
	 * non-accessible ptes, some can allow access by kernel mode.
	 */
	old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
	pte = pte_modify(old_pte, vma->vm_page_prot);
	pte = pte_mkyoung(pte);
	if (was_writable)
		pte = pte_mkwrite(pte);
	ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
	update_mmu_cache(vma, vmf->address, vmf->pte);

	page = vm_normal_page(vma, vmf->address, pte);
	if (!page) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return 0;
	}

	/* TODO: handle PTE-mapped THP */
	if (PageCompound(page)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return 0;
	}

	/*
	 * Avoid grouping on RO pages in general.  RO pages shouldn't hurt as
	 * much anyway since they can be in shared cache state.  This misses
	 * the case where a mapping is writable but the process never writes
	 * to it but pte_write gets cleared during protection updates and
	 * pte_dirty has unpredictable behaviour between PTE scan updates,
	 * background writeback, dirty balancing and application behaviour.
	 */
	if (!pte_write(pte))
		flags |= TNF_NO_GROUP;

	/*
	 * Flag if the page is shared between multiple address spaces.  This
	 * is later used when determining whether to group tasks together.
	 */
	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
		flags |= TNF_SHARED;

	last_cpupid = page_cpupid_last(page);
	page_nid = page_to_nid(page);
	target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
			&flags);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	if (target_nid == NUMA_NO_NODE) {
		put_page(page);
		goto out;
	}

	/* Migrate to the requested node */
	migrated = migrate_misplaced_page(page, vma, target_nid);
	if (migrated) {
		page_nid = target_nid;
		flags |= TNF_MIGRATED;
	} else
		flags |= TNF_MIGRATE_FAIL;

out:
	if (page_nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, page_nid, 1, flags);
	return 0;
}

static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
	if (vma_is_anonymous(vmf->vma))
		return do_huge_pmd_anonymous_page(vmf);
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
	return VM_FAULT_FALLBACK;
}

/* `inline' is required to avoid gcc 4.1.2 build error */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
	if (vma_is_anonymous(vmf->vma)) {
		if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
			return handle_userfault(vmf, VM_UFFD_WP);
		return do_huge_pmd_wp_page(vmf, orig_pmd);
	}
	if (vmf->vma->vm_ops->huge_fault) {
		vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);

		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	}

	/* COW or write-notify handled on pte level: split pmd. */
	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);

	return VM_FAULT_FALLBACK;
}

static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&			\
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
	/* No support for anonymous transparent PUD pages yet */
	if (vma_is_anonymous(vmf->vma))
		return VM_FAULT_FALLBACK;
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
	return VM_FAULT_FALLBACK;
}

static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&			\
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
	/* No support for anonymous transparent PUD pages yet */
	if (vma_is_anonymous(vmf->vma))
		goto split;
	if (vmf->vma->vm_ops->huge_fault) {
		vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);

		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	}
split:
	/* COW or write-notify not handled on PUD level: split pud. */
	__split_huge_pud(vmf->vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
	return VM_FAULT_FALLBACK;
}
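
/*
 * Illustration only: VM_FAULT_FALLBACK returned by the huge-page helpers
 * above is a contract with __handle_mm_fault() below, meaning "retry the
 * fault at a smaller granularity", roughly:
 *
 *	ret = create_huge_pmd(&vmf);
 *	if (!(ret & VM_FAULT_FALLBACK))
 *		return ret;
 *	// otherwise fall through, eventually to handle_pte_fault()
 */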

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (i.e. the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_lock may have been released depending on flags and our return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
	pte_t entry;

	if (unlikely(pmd_none(*vmf->pmd))) {
		/*
		 * Leave __pte_alloc() until later: because vm_ops->fault may
		 * want to allocate a huge page, and if we expose the page table
		 * for an instant, it will be difficult to retract it from
		 * concurrent faults and from rmap lookups.
		 */
		vmf->pte = NULL;
	} else {
		/* See comment in pte_alloc_one_map() */
		if (pmd_devmap_trans_unstable(vmf->pmd))
			return 0;
		/*
		 * A regular pmd is established and it can't morph into a huge
		 * pmd from under us anymore at this point because we hold the
		 * mmap_lock read mode and khugepaged takes it in write mode.
		 * So now it's safe to run pte_offset_map().
		 */
		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
		vmf->orig_pte = *vmf->pte;

		/*
		 * Some architectures can have larger ptes than wordsize,
		 * e.g. ppc44x-defconfig has CONFIG_PTE_64BIT=y and
		 * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
		 * accesses.  The code below just needs a consistent view
		 * for the ifs and we later double check anyway with the
		 * ptl lock held.  So here a barrier will do.
		 */
		barrier();
		if (pte_none(vmf->orig_pte)) {
			pte_unmap(vmf->pte);
			vmf->pte = NULL;
		}
	}

	if (!vmf->pte) {
		if (vma_is_anonymous(vmf->vma))
			return do_anonymous_page(vmf);
		else
			return do_fault(vmf);
	}

	if (!pte_present(vmf->orig_pte))
		return do_swap_page(vmf);

	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
		return do_numa_page(vmf);

	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	spin_lock(vmf->ptl);
	entry = vmf->orig_pte;
	if (unlikely(!pte_same(*vmf->pte, entry))) {
		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
		goto unlock;
	}
	if (vmf->flags & FAULT_FLAG_WRITE) {
		if (!pte_write(entry))
			return do_wp_page(vmf);
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
				vmf->flags & FAULT_FLAG_WRITE)) {
		update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
	} else {
		/* Skip spurious TLB flush for retried page fault */
		if (vmf->flags & FAULT_FLAG_TRIED)
			goto unlock;
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
		if (vmf->flags & FAULT_FLAG_WRITE)
			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
	}
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;
}
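
/*
 * Illustration only -- the dispatch in handle_pte_fault() above, as a table:
 *
 *	pte state                        handler
 *	------------------------------   --------------------
 *	none, anonymous vma              do_anonymous_page()
 *	none, file-backed vma            do_fault()
 *	!present (swap/migration/...)    do_swap_page()
 *	protnone, accessible vma         do_numa_page()
 *	present, write fault, !writable  do_wp_page()
 *	anything else                    access/dirty fixup above
 */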

/*
 * By the time we get here, we already hold the mm semaphore.
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags)
{
	struct vm_fault vmf = {
		.vma = vma,
		.address = address & PAGE_MASK,
		.flags = flags,
		.pgoff = linear_page_index(vma, address),
		.gfp_mask = __get_fault_gfp_mask(vma),
	};
	unsigned int dirty = flags & FAULT_FLAG_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	p4d_t *p4d;
	vm_fault_t ret;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return VM_FAULT_OOM;

	vmf.pud = pud_alloc(mm, p4d, address);
	if (!vmf.pud)
		return VM_FAULT_OOM;
retry_pud:
	if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
		ret = create_huge_pud(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		pud_t orig_pud = *vmf.pud;

		barrier();
		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {

			/* NUMA case for anonymous PUDs would go here */

			if (dirty && !pud_write(orig_pud)) {
				ret = wp_huge_pud(&vmf, orig_pud);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
				huge_pud_set_accessed(&vmf, orig_pud);
				return 0;
			}
		}
	}

	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
	if (!vmf.pmd)
		return VM_FAULT_OOM;

	/* Huge pud page fault raced with pmd_alloc? */
	if (pud_trans_unstable(vmf.pud))
		goto retry_pud;

	if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
		ret = create_huge_pmd(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		pmd_t orig_pmd = *vmf.pmd;

		barrier();
		if (unlikely(is_swap_pmd(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					  !is_pmd_migration_entry(orig_pmd));
			if (is_pmd_migration_entry(orig_pmd))
				pmd_migration_entry_wait(mm, vmf.pmd);
			return 0;
		}
		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
				return do_huge_pmd_numa_page(&vmf, orig_pmd);

			if (dirty && !pmd_write(orig_pmd)) {
				ret = wp_huge_pmd(&vmf, orig_pmd);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
				huge_pmd_set_accessed(&vmf, orig_pmd);
				return 0;
			}
		}
	}

	return handle_pte_fault(&vmf);
}

/**
 * mm_account_fault - Do page fault accounting
 *
 * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
 *        of perf event counters, but we'll still do the per-task accounting to
 *        the task that triggered this page fault.
 * @address: the faulted address.
 * @flags: the fault flags.
 * @ret: the fault retcode.
 *
 * This will take care of most of the page fault accounting.  Meanwhile, it
 * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
 * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS
 * should still be in per-arch page fault handlers at the entry of page fault.
 */
static inline void mm_account_fault(struct pt_regs *regs,
				    unsigned long address, unsigned int flags,
				    vm_fault_t ret)
{
	bool major;

	/*
	 * We don't do accounting for some specific faults:
	 *
	 * - Unsuccessful faults (e.g. when the address wasn't valid).  That
	 *   includes arch_vma_access_permitted() failing before reaching here.
	 *   So this is not a "this many hardware page faults" counter.  We
	 *   should use the hw profiling for that.
	 *
	 * - Incomplete faults (VM_FAULT_RETRY).  They will only be counted
	 *   once they're completed.
	 */
	if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
		return;

	/*
	 * We define the fault as a major fault when the final successful fault
	 * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
	 * handle it immediately previously).
	 */
	major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);

	if (major)
		current->maj_flt++;
	else
		current->min_flt++;

	/*
	 * If the fault is done for GUP, regs will be NULL.  We only do the
	 * accounting for the per-thread fault counters of the task that
	 * triggered the fault, and we skip the perf event updates.
	 */
	if (!regs)
		return;

	if (major)
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
	else
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}

/*
 * By the time we get here, we already hold the mm semaphore.
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
			   unsigned int flags, struct pt_regs *regs)
{
	vm_fault_t ret;

	__set_current_state(TASK_RUNNING);

	count_vm_event(PGFAULT);
	count_memcg_event_mm(vma->vm_mm, PGFAULT);

	/* do counter updates before entering the really critical section. */
	check_sync_rss_stat(current);

	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
					    flags & FAULT_FLAG_INSTRUCTION,
					    flags & FAULT_FLAG_REMOTE))
		return VM_FAULT_SIGSEGV;

	/*
	 * Enable the memcg OOM handling for faults triggered in user
	 * space.  Kernel faults are handled more gracefully.
	 */
	if (flags & FAULT_FLAG_USER)
		mem_cgroup_enter_user_fault();

	if (unlikely(is_vm_hugetlb_page(vma)))
		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
	else
		ret = __handle_mm_fault(vma, address, flags);

	if (flags & FAULT_FLAG_USER) {
		mem_cgroup_exit_user_fault();
		/*
		 * The task may have entered a memcg OOM situation but
		 * if the allocation error was handled gracefully (no
		 * VM_FAULT_OOM), there is no need to kill anything.
		 * Just clean up the OOM state peacefully.
		 */
		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
			mem_cgroup_oom_synchronize(false);
	}

	mm_account_fault(regs, address, flags, ret);

	return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
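
/*
 * Example (illustration only): a simplified arch page-fault handler drives
 * handle_mm_fault() under the mmap_lock and honours VM_FAULT_RETRY, roughly:
 *
 *	unsigned int flags = FAULT_FLAG_DEFAULT | FAULT_FLAG_USER;
 *
 *	retry:
 *	mmap_read_lock(mm);
 *	vma = find_vma(mm, address);
 *	// ... vma and access-permission checks elided ...
 *	fault = handle_mm_fault(vma, address, flags, regs);
 *	if (fault & VM_FAULT_RETRY) {
 *		// mmap_lock was already dropped for us
 *		flags |= FAULT_FLAG_TRIED;
 *		goto retry;
 *	}
 *	mmap_read_unlock(mm);
 */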

#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Allocate p4d page table.
 * We've already handled the fast-path in-line.
 */
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	p4d_t *new = p4d_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&mm->page_table_lock);
	if (pgd_present(*pgd))		/* Another has populated it */
		p4d_free(mm, new);
	else
		pgd_populate(mm, pgd, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#endif /* __PAGETABLE_P4D_FOLDED */

#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
{
	pud_t *new = pud_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&mm->page_table_lock);
	if (!p4d_present(*p4d)) {
		mm_inc_nr_puds(mm);
		p4d_populate(mm, p4d, new);
	} else	/* Another has populated it */
		pud_free(mm, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */

#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	spinlock_t *ptl;
	pmd_t *new = pmd_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	ptl = pud_lock(mm, pud);
	if (!pud_present(*pud)) {
		mm_inc_nr_pmds(mm);
		pud_populate(mm, pud, new);
	} else	/* Another has populated it */
		pmd_free(mm, new);
	spin_unlock(ptl);
	return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */
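
/*
 * For reference (illustration only): the fast paths that pair with the slow
 * paths above live in <linux/mm.h> as inline wrappers, along the lines of:
 *
 *	static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
 *				       unsigned long address)
 *	{
 *		return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ?
 *			NULL : pud_offset(p4d, address);
 *	}
 */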

int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
			  struct mmu_notifier_range *range, pte_t **ptepp,
			  pmd_t **pmdpp, spinlock_t **ptlp)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;

	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
		goto out;

	pud = pud_offset(p4d, address);
	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
		goto out;

	pmd = pmd_offset(pud, address);
	VM_BUG_ON(pmd_trans_huge(*pmd));

	if (pmd_huge(*pmd)) {
		if (!pmdpp)
			goto out;

		if (range) {
			mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
						NULL, mm, address & PMD_MASK,
						(address & PMD_MASK) + PMD_SIZE);
			mmu_notifier_invalidate_range_start(range);
		}
		*ptlp = pmd_lock(mm, pmd);
		if (pmd_huge(*pmd)) {
			*pmdpp = pmd;
			return 0;
		}
		spin_unlock(*ptlp);
		if (range)
			mmu_notifier_invalidate_range_end(range);
	}

	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		goto out;

	if (range) {
		mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
					address & PAGE_MASK,
					(address & PAGE_MASK) + PAGE_SIZE);
		mmu_notifier_invalidate_range_start(range);
	}
	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
	if (!pte_present(*ptep))
		goto unlock;
	*ptepp = ptep;
	return 0;
unlock:
	pte_unmap_unlock(ptep, *ptlp);
	if (range)
		mmu_notifier_invalidate_range_end(range);
out:
	return -EINVAL;
}
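
/*
 * Example (illustration only): a caller that passes @range to
 * follow_invalidate_pte() must pair the mmu_notifier_invalidate_range_start()
 * done above with an _end() after dropping the page-table lock, roughly:
 *
 *	if (!follow_invalidate_pte(mm, addr, &range, &ptep, NULL, &ptl)) {
 *		// ... inspect or write-protect *ptep under ptl ...
 *		pte_unmap_unlock(ptep, ptl);
 *		mmu_notifier_invalidate_range_end(&range);
 *	}
 */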

/**
 * follow_pte - look up PTE at a user virtual address
 * @mm: the mm_struct of the target address space
 * @address: user virtual address
 * @ptepp: location to store found PTE
 * @ptlp: location to store the lock for the PTE
 *
 * On a successful return, the pointer to the PTE is stored in @ptepp;
 * the corresponding lock is taken and its location is stored in @ptlp.
 * The contents of the PTE are only stable until @ptlp is released;
 * any further use, if any, must be protected against invalidation
 * with MMU notifiers.
 *
 * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
 * should be taken for read.
 *
 * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
 * it is not a good general-purpose API.
 *
 * Return: zero on success, -ve otherwise.
 */
int follow_pte(struct mm_struct *mm, unsigned long address,
	       pte_t **ptepp, spinlock_t **ptlp)
{
	return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
}
EXPORT_SYMBOL_GPL(follow_pte);

/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * This function does not allow the caller to read the permissions
 * of the PTE.  Do not use it.
 *
 * Return: zero and the pfn at @pfn on success, -ve otherwise.
 */
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
	unsigned long *pfn)
{
	int ret = -EINVAL;
	spinlock_t *ptl;
	pte_t *ptep;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		return ret;

	ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
	if (ret)
		return ret;
	*pfn = pte_pfn(*ptep);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(follow_pfn);

#ifdef CONFIG_HAVE_IOREMAP_PROT
int follow_phys(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags,
		unsigned long *prot, resource_size_t *phys)
{
	int ret = -EINVAL;
	pte_t *ptep, pte;
	spinlock_t *ptl;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		goto out;

	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
		goto out;
	pte = *ptep;

	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;

	*prot = pgprot_val(pte_pgprot(pte));
	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;

	ret = 0;
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return ret;
}

int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
			void *buf, int len, int write)
{
	resource_size_t phys_addr;
	unsigned long prot = 0;
	void __iomem *maddr;
	int offset = addr & (PAGE_SIZE-1);

	if (follow_phys(vma, addr, write, &prot, &phys_addr))
		return -EINVAL;

	maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
	if (!maddr)
		return -ENOMEM;

	if (write)
		memcpy_toio(maddr + offset, buf, len);
	else
		memcpy_fromio(buf, maddr + offset, len);
	iounmap(maddr);

	return len;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif
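
/*
 * Example (illustration only): drivers that mmap() MMIO typically wire this
 * helper up as their ->access() method so that ptrace and /proc/<pid>/mem
 * can reach the mapping, e.g. a hypothetical:
 *
 *	static const struct vm_operations_struct hypo_mmio_vm_ops = {
 *		.access = generic_access_phys,
 *	};
 */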

/*
 * Access another process' address space as given in @mm.  If @tsk is
 * non-NULL, use the given task for page fault accounting.
 */
int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
	struct vm_area_struct *vma;
	void *old_buf = buf;
	int write = gup_flags & FOLL_WRITE;

	if (mmap_read_lock_killable(mm))
		return 0;

	/* ignore errors, just check how much was successfully transferred */
	while (len) {
		int bytes, ret, offset;
		void *maddr;
		struct page *page = NULL;

		ret = get_user_pages_remote(mm, addr, 1,
				gup_flags, &page, &vma, NULL);
		if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
			break;
#else
			/*
			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
			 * we can access using slightly different code.
			 */
			vma = find_vma(mm, addr);
			if (!vma || vma->vm_start > addr)
				break;
			if (vma->vm_ops && vma->vm_ops->access)
				ret = vma->vm_ops->access(vma, addr, buf,
							  len, write);
			if (ret <= 0)
				break;
			bytes = ret;
#endif
		} else {
			bytes = len;
			offset = addr & (PAGE_SIZE-1);
			if (bytes > PAGE_SIZE-offset)
				bytes = PAGE_SIZE-offset;

			maddr = kmap(page);
			if (write) {
				copy_to_user_page(vma, page, addr,
						  maddr + offset, buf, bytes);
				set_page_dirty_lock(page);
			} else {
				copy_from_user_page(vma, page, addr,
						    buf, maddr + offset, bytes);
			}
			kunmap(page);
			put_page(page);
		}
		len -= bytes;
		buf += bytes;
		addr += bytes;
	}
	mmap_read_unlock(mm);

	return buf - old_buf;
}

/**
 * access_remote_vm - access another process' address space
 * @mm: the mm_struct of the target address space
 * @addr: start address to access
 * @buf: source or destination buffer
 * @len: number of bytes to transfer
 * @gup_flags: flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, unsigned int gup_flags)
{
	return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
}

/*
 * Access another process' address space.
 * - the source/target buffer must be in kernel space
 * - do not walk the page table directly; use get_user_pages()
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, unsigned int gup_flags)
{
	struct mm_struct *mm;
	int ret;

	mm = get_task_mm(tsk);
	if (!mm)
		return 0;

	ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);

	mmput(mm);

	return ret;
}
EXPORT_SYMBOL_GPL(access_process_vm);
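
/*
 * Example (illustration only): a ptrace-style peek of one word from another
 * task goes through access_process_vm() roughly as:
 *
 *	unsigned long word;
 *
 *	if (access_process_vm(child, addr, &word, sizeof(word),
 *			      FOLL_FORCE) != sizeof(word))
 *		return -EIO;
 */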

/*
 * Print the name of a VMA.
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	/*
	 * we might be running from an atomic context so we cannot sleep
	 */
	if (!mmap_read_trylock(mm))
		return;

	vma = find_vma(mm, ip);
	if (vma && vma->vm_file) {
		struct file *f = vma->vm_file;
		char *buf = (char *)__get_free_page(GFP_NOWAIT);
		if (buf) {
			char *p;

			p = file_path(f, buf, PAGE_SIZE);
			if (IS_ERR(p))
				p = "?";
			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
					vma->vm_start,
					vma->vm_end - vma->vm_start);
			free_page((unsigned long)buf);
		}
	}
	mmap_read_unlock(mm);
}

#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
	/*
	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
	 * holding the mmap_lock, this is safe because kernel memory doesn't
	 * get paged out, therefore we'll never actually fault, and the
	 * below annotations will generate false positives.
	 */
	if (uaccess_kernel())
		return;
	if (pagefault_disabled())
		return;
	__might_sleep(file, line, 0);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
	if (current->mm)
		might_lock_read(&current->mm->mmap_lock);
#endif
}
EXPORT_SYMBOL(__might_fault);
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation.  The target subpage will be processed last to keep its
 * cache lines hot.
 */
static inline void process_huge_page(
	unsigned long addr_hint, unsigned int pages_per_huge_page,
	void (*process_subpage)(unsigned long addr, int idx, void *arg),
	void *arg)
{
	int i, n, base, l;
	unsigned long addr = addr_hint &
		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);

	/* Process target subpage last to keep its cache lines hot */
	might_sleep();
	n = (addr_hint - addr) / PAGE_SIZE;
	if (2 * n <= pages_per_huge_page) {
		/* If target subpage in first half of huge page */
		base = 0;
		l = n;
		/* Process subpages at the end of huge page */
		for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
			cond_resched();
			process_subpage(addr + i * PAGE_SIZE, i, arg);
		}
	} else {
		/* If target subpage in second half of huge page */
		base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
		l = pages_per_huge_page - n;
		/* Process subpages at the beginning of huge page */
		for (i = 0; i < base; i++) {
			cond_resched();
			process_subpage(addr + i * PAGE_SIZE, i, arg);
		}
	}
	/*
	 * Process remaining subpages in left-right-left-right pattern
	 * towards the target subpage
	 */
	for (i = 0; i < l; i++) {
		int left_idx = base + i;
		int right_idx = base + 2 * l - 1 - i;

		cond_resched();
		process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
		cond_resched();
		process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
	}
}
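
/*
 * Worked example (illustration only): with pages_per_huge_page == 8 and the
 * target at index n == 2 (first half: base = 0, l = 2), process_huge_page()
 * issues the calls for indices 7, 6, 5, 4 (the tail), then 0, 3, 1, 2 --
 * finishing on the target subpage so its cache lines stay hot.
 */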

static void clear_gigantic_page(struct page *page,
				unsigned long addr,
				unsigned int pages_per_huge_page)
{
	int i;
	struct page *p = page;

	might_sleep();
	for (i = 0; i < pages_per_huge_page;
	     i++, p = mem_map_next(p, page, i)) {
		cond_resched();
		clear_user_highpage(p, addr + i * PAGE_SIZE);
	}
}

static void clear_subpage(unsigned long addr, int idx, void *arg)
{
	struct page *page = arg;

	clear_user_highpage(page + idx, addr);
}

void clear_huge_page(struct page *page,
		     unsigned long addr_hint, unsigned int pages_per_huge_page)
{
	unsigned long addr = addr_hint &
		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);

	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
		clear_gigantic_page(page, addr, pages_per_huge_page);
		return;
	}

	process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
}

static void copy_user_gigantic_page(struct page *dst, struct page *src,
				    unsigned long addr,
				    struct vm_area_struct *vma,
				    unsigned int pages_per_huge_page)
{
	int i;
	struct page *dst_base = dst;
	struct page *src_base = src;

	for (i = 0; i < pages_per_huge_page; ) {
		cond_resched();
		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);

		i++;
		dst = mem_map_next(dst, dst_base, i);
		src = mem_map_next(src, src_base, i);
	}
}

struct copy_subpage_arg {
	struct page *dst;
	struct page *src;
	struct vm_area_struct *vma;
};

static void copy_subpage(unsigned long addr, int idx, void *arg)
{
	struct copy_subpage_arg *copy_arg = arg;

	copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
			   addr, copy_arg->vma);
}

void copy_user_huge_page(struct page *dst, struct page *src,
			 unsigned long addr_hint, struct vm_area_struct *vma,
			 unsigned int pages_per_huge_page)
{
	unsigned long addr = addr_hint &
		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
	struct copy_subpage_arg arg = {
		.dst = dst,
		.src = src,
		.vma = vma,
	};

	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
		copy_user_gigantic_page(dst, src, addr, vma,
					pages_per_huge_page);
		return;
	}

	process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
}

long copy_huge_page_from_user(struct page *dst_page,
				const void __user *usr_src,
				unsigned int pages_per_huge_page,
				bool allow_pagefault)
{
	void *src = (void *)usr_src;
	void *page_kaddr;
	unsigned long i, rc = 0;
	unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
	struct page *subpage = dst_page;

	for (i = 0; i < pages_per_huge_page;
	     i++, subpage = mem_map_next(subpage, dst_page, i)) {
		if (allow_pagefault)
			page_kaddr = kmap(subpage);
		else
			page_kaddr = kmap_atomic(subpage);
		rc = copy_from_user(page_kaddr,
				(const void __user *)(src + i * PAGE_SIZE),
				PAGE_SIZE);
		if (allow_pagefault)
			kunmap(subpage);
		else
			kunmap_atomic(page_kaddr);

		ret_val -= (PAGE_SIZE - rc);
		if (rc)
			break;

		flush_dcache_page(subpage);

		cond_resched();
	}
	return ret_val;
}
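
/*
 * Note (illustration only): copy_huge_page_from_user() returns the number of
 * bytes that could NOT be copied, so callers treat any non-zero return as a
 * short copy, e.g.:
 *
 *	if (copy_huge_page_from_user(page, usr_src, pages_per_huge_page, false))
 *		// fell short: typically retry with pagefaults allowed
 */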
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS

static struct kmem_cache *page_ptl_cachep;

void __init ptlock_cache_init(void)
{
	page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
			SLAB_PANIC, NULL);
}

bool ptlock_alloc(struct page *page)
{
	spinlock_t *ptl;

	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
	if (!ptl)
		return false;
	page->ptl = ptl;
	return true;
}

void ptlock_free(struct page *page)
{
	kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif
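
/*
 * For reference (illustration only): with split PTE locks, ptlock_alloc()
 * above is reached from the page-table page constructor in <linux/mm.h>,
 * roughly:
 *
 *	static inline bool ptlock_init(struct page *page)
 *	{
 *		if (!ptlock_alloc(page))
 *			return false;
 *		spin_lock_init(ptlock_ptr(page));
 *		return true;
 *	}
 */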