// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/mm/memory.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *		Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/kmsan.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/memory-tiers.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
#ifdef CONFIG_MEM_PURGEABLE
#include <linux/mm_purgeable.h>
#endif
#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"
#include "swap.h"

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

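/*
 * Fallback definitions for builds without CONFIG_NUMA: a single global
 * mem_map array describes all page frames, and max_mapnr bounds the page
 * frames it covers.
 */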
#ifndef CONFIG_NUMA
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

static vm_fault_t do_fault(struct vm_fault *vmf);
static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
static bool vmf_pte_changed(struct vm_fault *vmf);

/*
 * Return true if the original pte was a uffd-wp pte marker (so the pte was
 * wr-protected).
 */
static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
{
	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
		return false;

	return pte_marker_uffd_wp(vmf->orig_pte);
}

/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
	/*
	 * Transitioning a PTE from 'old' to 'young' can be expensive on
	 * some architectures, even if it's performed in hardware. By
	 * default, "false" means prefaulted entries will be 'young'.
	 */
	return false;
}
#endif

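/*
 * Booting with "norandmaps" disables address-space randomization entirely,
 * i.e. it forces randomize_va_space to 0.
 */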
static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
early_initcall(init_zero_pfn);

void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
	trace_rss_stat(mm, member);
}

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= P4D_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= P4D_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(p4d, start);
	p4d_clear(p4d);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long start;

	start = addr;
	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
	} while (p4d++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	p4d = p4d_offset(pgd, start);
	pgd_clear(pgd);
	p4d_free_tlb(tlb, p4d, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;
	/*
	 * We add page table cache pages with PAGE_SIZE,
	 * (see pte_free_tlb()), flush the tlb if we need
	 */
	tlb_change_page_size(tlb, PAGE_SIZE);
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
		   struct vm_area_struct *vma, unsigned long floor,
		   unsigned long ceiling, bool mm_wr_locked)
{
	do {
		unsigned long addr = vma->vm_start;
		struct vm_area_struct *next;

		/*
		 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
		 * be 0.  This will underflow and is okay.
		 */
		next = mas_find(mas, ceiling - 1);

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		if (mm_wr_locked)
			vma_start_write(vma);
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = mas_find(mas, ceiling - 1);
				if (mm_wr_locked)
					vma_start_write(vma);
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		}
		vma = next;
	} while (vma);
}

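/*
 * Publish a freshly allocated pte page table via *pmd, unless another thread
 * has already installed one; in that case *pte is left non-NULL and the
 * caller frees the unused page.
 */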
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
{
	spinlock_t *ptl = pmd_lock(mm, pmd);

	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm_inc_nr_ptes(mm);
		/*
		 * Ensure all pte setup (eg. pte page lock and page clearing) are
		 * visible before the pte is made visible to other CPUs by being
		 * put into page tables.
		 *
		 * The other side of the story is the pointer chasing in the page
		 * table walking code (when walking the page table without locking;
		 * ie. most of the time). Fortunately, these data accesses consist
		 * of a chain of data-dependent loads, meaning most CPUs (alpha
		 * being the notable exception) will already guarantee loads are
		 * seen in-order. See the alpha page table accessors for the
		 * smp_rmb() barriers in page table walking code.
		 */
		smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
		pmd_populate(mm, pmd, *pte);
		*pte = NULL;
	}
	spin_unlock(ptl);
}

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
	pgtable_t new = pte_alloc_one(mm);
	if (!new)
		return -ENOMEM;

	pmd_install(mm, pmd, &new);
	if (new)
		pte_free(mm, new);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm);
	if (!new)
		return -ENOMEM;

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		smp_wmb(); /* See comment in pmd_install() */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

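/*
 * RSS updates made while copying or zapping a pte range are accumulated in a
 * local vector and folded into the mm counters once, instead of updating the
 * per-mm counters for every single pte.
 */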
static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	p4d_t *p4d = p4d_offset(pgd, addr);
	pud_t *pud = pud_offset(p4d, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
				 nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
		 current->comm,
		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 mapping ? mapping->a_ops->read_folio : NULL);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
			    pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (is_zero_pfn(pfn))
			return NULL;
		if (pte_devmap(pte))
		/*
		 * NOTE: New users of ZONE_DEVICE will not set pte_devmap()
		 * and will have refcounts incremented on their struct pages
		 * when they are inserted into PTEs, thus they are safe to
		 * return here. Legacy ZONE_DEVICE pages that set pte_devmap()
		 * do not have refcounts. Example of legacy ZONE_DEVICE is
		 * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
		 */
			return NULL;

		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;

check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}

struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
			    pte_t pte)
{
	struct page *page = vm_normal_page(vma, addr, pte);

	if (page)
		return page_folio(page);
	return NULL;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
				pmd_t pmd)
{
	unsigned long pfn = pmd_pfn(pmd);

	/*
	 * There is no pmd_special() but there may be special pmds, e.g.
	 * in a direct-access (dax) mapping, so let's just replicate the
	 * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
	 */
	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (pmd_devmap(pmd))
		return NULL;
	if (is_huge_zero_pmd(pmd))
		return NULL;
	if (unlikely(pfn > highest_memmap_pfn))
		return NULL;

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
#endif

static void restore_exclusive_pte(struct vm_area_struct *vma,
				  struct page *page, unsigned long address,
				  pte_t *ptep)
{
	pte_t orig_pte;
	pte_t pte;
	swp_entry_t entry;

	orig_pte = ptep_get(ptep);
	pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
	if (pte_swp_soft_dirty(orig_pte))
		pte = pte_mksoft_dirty(pte);

	entry = pte_to_swp_entry(orig_pte);
	if (pte_swp_uffd_wp(orig_pte))
		pte = pte_mkuffd_wp(pte);
	else if (is_writable_device_exclusive_entry(entry))
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);

	VM_BUG_ON(pte_write(pte) && !(PageAnon(page) && PageAnonExclusive(page)));

	/*
	 * No need to take a page reference as one was already
	 * created when the swap entry was made.
	 */
	if (PageAnon(page))
		page_add_anon_rmap(page, vma, address, RMAP_NONE);
	else
		/*
		 * Currently device exclusive access only supports anonymous
		 * memory so the entry shouldn't point to a filebacked page.
		 */
		WARN_ON_ONCE(1);

	set_pte_at(vma->vm_mm, address, ptep, pte);

	/*
	 * No need to invalidate - it was non-present before. However
	 * secondary CPUs may have mappings that need invalidating.
	 */
	update_mmu_cache(vma, address, ptep);
}

/*
 * Tries to restore an exclusive pte if the page lock can be acquired without
 * sleeping.
 */
static int
try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
			  unsigned long addr)
{
	swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte));
	struct page *page = pfn_swap_entry_to_page(entry);

	if (trylock_page(page)) {
		restore_exclusive_pte(vma, page, addr, src_pte);
		unlock_page(page);
		return 0;
	}

	return -EBUSY;
}

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
	unsigned long vm_flags = dst_vma->vm_flags;
	pte_t orig_pte = ptep_get(src_pte);
	pte_t pte = orig_pte;
	struct page *page;
	swp_entry_t entry = pte_to_swp_entry(orig_pte);

	if (likely(!non_swap_entry(entry))) {
		if (swap_duplicate(entry) < 0)
			return -EIO;

		/* make sure dst_mm is on swapoff's mmlist. */
		if (unlikely(list_empty(&dst_mm->mmlist))) {
			spin_lock(&mmlist_lock);
			if (list_empty(&dst_mm->mmlist))
				list_add(&dst_mm->mmlist,
						&src_mm->mmlist);
			spin_unlock(&mmlist_lock);
		}
		/* Mark the swap entry as shared. */
		if (pte_swp_exclusive(orig_pte)) {
			pte = pte_swp_clear_exclusive(orig_pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
		rss[MM_SWAPENTS]++;
	} else if (is_migration_entry(entry)) {
		page = pfn_swap_entry_to_page(entry);

		rss[mm_counter(page)]++;

		if (!is_readable_migration_entry(entry) &&
				is_cow_mapping(vm_flags)) {
			/*
			 * COW mappings require pages in both parent and child
			 * to be set to read. A previously exclusive entry is
			 * now shared.
			 */
			entry = make_readable_migration_entry(
							swp_offset(entry));
			pte = swp_entry_to_pte(entry);
			if (pte_swp_soft_dirty(orig_pte))
				pte = pte_swp_mksoft_dirty(pte);
			if (pte_swp_uffd_wp(orig_pte))
				pte = pte_swp_mkuffd_wp(pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
	} else if (is_device_private_entry(entry)) {
		page = pfn_swap_entry_to_page(entry);

		/*
		 * Update rss count even for unaddressable pages, as
		 * they should be treated just like normal pages in this
		 * respect.
		 *
		 * We will likely want to have some new rss counters
		 * for unaddressable pages, at some point. But for now
		 * keep things as they are.
		 */
		get_page(page);
		rss[mm_counter(page)]++;
		/* Cannot fail as these pages cannot get pinned. */
		BUG_ON(page_try_dup_anon_rmap(page, false, src_vma));

		/*
		 * We do not preserve soft-dirty information, because so
		 * far, checkpoint/restore is the only feature that
		 * requires that. And checkpoint/restore does not work
		 * when a device driver is involved (you cannot easily
		 * save and restore device driver state).
		 */
		if (is_writable_device_private_entry(entry) &&
		    is_cow_mapping(vm_flags)) {
			entry = make_readable_device_private_entry(
							swp_offset(entry));
			pte = swp_entry_to_pte(entry);
			if (pte_swp_uffd_wp(orig_pte))
				pte = pte_swp_mkuffd_wp(pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
	} else if (is_device_exclusive_entry(entry)) {
		/*
		 * Make device exclusive entries present by restoring the
		 * original entry then copying as for a present pte. Device
		 * exclusive entries currently only support private writable
		 * (ie. COW) mappings.
		 */
		VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
		if (try_restore_exclusive_pte(src_pte, src_vma, addr))
			return -EBUSY;
		return -ENOENT;
	} else if (is_pte_marker_entry(entry)) {
		pte_marker marker = copy_pte_marker(entry, dst_vma);

		if (marker)
			set_pte_at(dst_mm, addr, dst_pte,
				   make_pte_marker(marker));
		return 0;
	}
	if (!userfaultfd_wp(dst_vma))
		pte = pte_swp_clear_uffd_wp(pte);
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}

/*
 * Copy a present and normal page.
 *
 * NOTE! The usual case is that this isn't required;
 * instead, the caller can just increase the page refcount
 * and re-use the pte the traditional way.
 *
 * And if we need a pre-allocated page but don't yet have
 * one, return a negative error to let the preallocation
 * code know so that it can do so outside the page table
 * lock.
 */
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
		  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
		  struct folio **prealloc, struct page *page)
{
	struct folio *new_folio;
	pte_t pte;

	new_folio = *prealloc;
	if (!new_folio)
		return -EAGAIN;

	/*
	 * We have a prealloc page, all good!  Take it
	 * over and copy the page & arm it.
	 */
	*prealloc = NULL;
	copy_user_highpage(&new_folio->page, page, addr, src_vma);
	__folio_mark_uptodate(new_folio);
	folio_add_new_anon_rmap(new_folio, dst_vma, addr);
	folio_add_lru_vma(new_folio, dst_vma);
	rss[MM_ANONPAGES]++;

	/* All done, just insert the new page copy in the child */
	pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
	pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
	if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
		/* Uffd-wp needs to be delivered to dest pte as well */
		pte = pte_mkuffd_wp(pte);
	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
	return 0;
}

/*
 * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
 * is required to copy this pte.
 */
static inline int
copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
		 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
		 struct folio **prealloc)
{
	struct mm_struct *src_mm = src_vma->vm_mm;
	unsigned long vm_flags = src_vma->vm_flags;
	pte_t pte = ptep_get(src_pte);
	struct page *page;
	struct folio *folio;

	page = vm_normal_page(src_vma, addr, pte);
	if (page)
		folio = page_folio(page);
	if (page && folio_test_anon(folio)) {
		/*
		 * If this page may have been pinned by the parent process,
		 * copy the page immediately for the child so that we'll always
		 * guarantee the pinned page won't be randomly replaced in the
		 * future.
		 */
		folio_get(folio);
		if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
			/* Page may be pinned, we have to copy. */
			folio_put(folio);
			return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
						 addr, rss, prealloc, page);
		}
		rss[MM_ANONPAGES]++;
	} else if (page) {
		folio_get(folio);
		page_dup_file_rmap(page, false);
		rss[mm_counter_file(page)]++;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags) && pte_write(pte)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}
	VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	if (!userfaultfd_wp(dst_vma))
		pte = pte_clear_uffd_wp(pte);

	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
	return 0;
}

static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct folio *new_folio;

	new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
	if (!new_folio)
		return NULL;

	if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) {
		folio_put(new_folio);
		return NULL;
	}
	folio_throttle_swaprate(new_folio, GFP_KERNEL);

	return new_folio;
}

static int
copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	pte_t ptent;
	spinlock_t *src_ptl, *dst_ptl;
	int progress, ret = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};
	struct folio *prealloc = NULL;

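	/*
	 * The copy loop below may break out with both page-table locks
	 * dropped, e.g. to add a swap count continuation (-EIO) or to
	 * preallocate a destination page for a pinned COW page (-EAGAIN);
	 * once that is resolved it restarts from here.
	 */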
again:
	progress = 0;
	init_rss_vec(rss);

	/*
	 * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
	 * error handling here, assume that exclusive mmap_lock on dst and src
	 * protects anon from unexpected THP transitions; with shmem and file
	 * protected by mmap_lock-less collapse skipping areas with anon_vma
	 * (whereas vma_needs_copy() skips areas without anon_vma).  A rework
	 * can remove such assumptions later, but this is good enough for now.
	 */
	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte) {
		ret = -ENOMEM;
		goto out;
	}
	src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
	if (!src_pte) {
		pte_unmap_unlock(dst_pte, dst_ptl);
		/* ret == 0 */
		goto out;
	}
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		ptent = ptep_get(src_pte);
		if (pte_none(ptent)) {
			progress++;
			continue;
		}
		if (unlikely(!pte_present(ptent))) {
			ret = copy_nonpresent_pte(dst_mm, src_mm,
						  dst_pte, src_pte,
						  dst_vma, src_vma,
						  addr, rss);
			if (ret == -EIO) {
				entry = pte_to_swp_entry(ptep_get(src_pte));
				break;
			} else if (ret == -EBUSY) {
				break;
			} else if (!ret) {
				progress += 8;
				continue;
			}

			/*
			 * Device exclusive entry restored, continue by copying
			 * the now present pte.
			 */
			WARN_ON_ONCE(ret != -ENOENT);
		}
		/* copy_present_pte() will clear `*prealloc' if consumed */
		ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
				       addr, rss, &prealloc);
		/*
		 * If we need a pre-allocated page for this pte, drop the
		 * locks, allocate, and try again.
		 */
		if (unlikely(ret == -EAGAIN))
			break;
		if (unlikely(prealloc)) {
			/*
			 * The preallocated folio cannot be reused for the
			 * next address, so that we strictly follow the
			 * mempolicy (e.g., alloc_page_vma() allocates the
			 * page according to the address).  This can only
			 * happen if a pinned pte changed.
			 */
			folio_put(prealloc);
			prealloc = NULL;
		}
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_src_pte, src_ptl);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (ret == -EIO) {
		VM_WARN_ON_ONCE(!entry.val);
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
			ret = -ENOMEM;
			goto out;
		}
		entry.val = 0;
	} else if (ret == -EBUSY) {
		goto out;
	} else if (ret == -EAGAIN) {
		prealloc = page_copy_prealloc(src_mm, src_vma, addr);
		if (!prealloc)
			return -ENOMEM;
	} else if (ret) {
		VM_WARN_ON_ONCE(1);
	}

	/* We've captured and resolved the error. Reset, try again. */
	ret = 0;

	if (addr != end)
		goto again;
out:
	if (unlikely(prealloc))
		folio_put(prealloc);
	return ret;
}

static inline int
copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
			|| pmd_devmap(*src_pmd)) {
			int err;
			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
			err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
					    addr, dst_vma, src_vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
				   addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int
copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
			int err;

			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
			err = copy_huge_pud(dst_mm, src_mm,
					    dst_pud, src_pud, addr, src_vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
				   addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

static inline int
copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	p4d_t *src_p4d, *dst_p4d;
	unsigned long next;

	dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
	if (!dst_p4d)
		return -ENOMEM;
	src_p4d = p4d_offset(src_pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(src_p4d))
			continue;
		if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
				   addr, next))
			return -ENOMEM;
	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
	return 0;
}

/*
 * Return true if the vma needs to copy the pgtable during this fork().  Return
 * false when we can speed up fork() by allowing lazy page faults later until
 * when the child accesses the memory range.
 */
static bool
vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	/*
	 * Always copy pgtables when dst_vma has uffd-wp enabled, even if it's
	 * file-backed (e.g. shmem): with uffd-wp enabled the pgtable carries
	 * uffd-wp protection information that cannot be recovered from the
	 * page cache, so skipping the copy would lose it.
	 */
	if (userfaultfd_wp(dst_vma))
		return true;

	if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
		return true;

	if (src_vma->anon_vma)
		return true;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.  Fork
	 * becomes much lighter when there are big shared or private readonly
	 * mappings. The tradeoff is that copy_page_range is more efficient
	 * than faulting.
	 */
	return false;
}

int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = src_vma->vm_start;
	unsigned long end = src_vma->vm_end;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	struct mmu_notifier_range range;
	bool is_cow;
	int ret;

	if (!vma_needs_copy(dst_vma, src_vma))
		return 0;

	if (is_vm_hugetlb_page(src_vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);

	if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_copy(src_vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(src_vma->vm_flags);

	if (is_cow) {
		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
					0, src_mm, addr, end);
		mmu_notifier_invalidate_range_start(&range);
		/*
		 * Disabling preemption is not needed for the write side, as
		 * the read side doesn't spin, but goes to the mmap_lock.
		 *
		 * Use the raw variant of the seqcount_t write API to avoid
		 * lockdep complaining about preemptibility.
		 */
		vma_assert_write_locked(src_vma);
		raw_write_seqcount_begin(&src_mm->write_protect_seq);
	}

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
					    addr, next))) {
			untrack_pfn_clear(dst_vma);
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow) {
		raw_write_seqcount_end(&src_mm->write_protect_seq);
		mmu_notifier_invalidate_range_end(&range);
	}
	return ret;
}

/* Whether we should zap all COWed (private) pages too */
static inline bool should_zap_cows(struct zap_details *details)
{
	/* By default, zap all pages */
	if (!details)
		return true;

	/* Or, we zap COWed pages only if the caller wants to */
	return details->even_cows;
}

/* Decides whether we should zap this page with the page pointer specified */
static inline bool should_zap_page(struct zap_details *details, struct page *page)
{
	/* If we can make a decision without *page.. */
	if (should_zap_cows(details))
		return true;

	/* E.g. the caller passes NULL for the case of a zero page */
	if (!page)
		return true;

	/* Otherwise we should only zap non-anon pages */
	return !PageAnon(page);
}

static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
{
	if (!details)
		return false;

	return details->zap_flags & ZAP_FLAG_DROP_MARKER;
}

/*
 * This function makes sure that we'll replace the none pte with an uffd-wp
 * swap special pte marker when necessary. Must be called with the pgtable
 * lock held.
 */
static inline void
zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
			      unsigned long addr, pte_t *pte,
			      struct zap_details *details, pte_t pteval)
{
	/* Zap on anonymous always means dropping everything */
	if (vma_is_anonymous(vma))
		return;

	if (zap_drop_file_uffd_wp(details))
		return;

	pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
}

static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	swp_entry_t entry;

	tlb_change_page_size(tlb, PAGE_SIZE);
	init_rss_vec(rss);
	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return addr;

	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = ptep_get(pte);
		struct page *page;

		if (pte_none(ptent))
			continue;

		if (need_resched())
			break;

		if (pte_present(ptent)) {
			unsigned int delay_rmap;

			page = vm_normal_page(vma, addr, ptent);
#ifdef CONFIG_MEM_PURGEABLE
			if (vma->vm_flags & VM_USEREXPTE)
				page = NULL;
#endif
			if (unlikely(!should_zap_page(details, page)))
				continue;
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			arch_check_zapped_pte(vma, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
			zap_install_uffd_wp_if_needed(vma, addr, pte, details,
						      ptent);
			if (unlikely(!page)) {
				ksm_might_unmap_zero_page(mm, ptent);
				continue;
			}
#ifdef CONFIG_MEM_PURGEABLE
			if (vma->vm_flags & VM_PURGEABLE)
				uxpte_clear_present(vma, addr);
#endif
			delay_rmap = 0;
			if (!PageAnon(page)) {
				if (pte_dirty(ptent)) {
					set_page_dirty(page);
					if (tlb_delay_rmap(tlb)) {
						delay_rmap = 1;
						force_flush = 1;
					}
				}
				if (pte_young(ptent) && likely(vma_has_recency(vma)))
					mark_page_accessed(page);
			}
			rss[mm_counter(page)]--;
			if (!delay_rmap) {
				page_remove_rmap(page, vma, false);
				if (unlikely(page_mapcount(page) < 0))
					print_bad_pte(vma, addr, ptent, page);
			}
			if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
				force_flush = 1;
				addr += PAGE_SIZE;
				break;
			}
			continue;
		}

		entry = pte_to_swp_entry(ptent);
		if (is_device_private_entry(entry) ||
		    is_device_exclusive_entry(entry)) {
			page = pfn_swap_entry_to_page(entry);
			if (unlikely(!should_zap_page(details, page)))
				continue;
			/*
			 * Both device private/exclusive mappings should only
			 * work with anonymous pages so far, so we don't need
			 * to consider the uffd-wp bit when zapping. For more
			 * information, see zap_install_uffd_wp_if_needed().
			 */
			WARN_ON_ONCE(!vma_is_anonymous(vma));
			rss[mm_counter(page)]--;
			if (is_device_private_entry(entry))
				page_remove_rmap(page, vma, false);
			put_page(page);
		} else if (!non_swap_entry(entry)) {
			/* Genuine swap entry, hence a private anon page */
			if (!should_zap_cows(details))
				continue;
			rss[MM_SWAPENTS]--;
			if (unlikely(!free_swap_and_cache(entry)))
				print_bad_pte(vma, addr, ptent, NULL);
		} else if (is_migration_entry(entry)) {
			page = pfn_swap_entry_to_page(entry);
			if (!should_zap_page(details, page))
				continue;
			rss[mm_counter(page)]--;
		} else if (pte_marker_entry_uffd_wp(entry)) {
			/*
			 * For anon: always drop the marker; for file: only
			 * drop the marker if explicitly requested.
			 */
			if (!vma_is_anonymous(vma) &&
			    !zap_drop_file_uffd_wp(details))
				continue;
		} else if (is_hwpoison_entry(entry) ||
			   is_poisoned_swp_entry(entry)) {
			if (!should_zap_cows(details))
				continue;
		} else {
			/* We should have covered all the swap entry types */
			WARN_ON_ONCE(1);
		}
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
		zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();

	/* Do the actual TLB flush before dropping ptl */
	if (force_flush) {
		tlb_flush_mmu_tlbonly(tlb);
		tlb_flush_rmaps(tlb, vma);
	}
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Come back again if we didn't do everything.
	 */
	if (force_flush)
		tlb_flush_mmu(tlb);

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE)
				__split_huge_pmd(vma, pmd, addr, false, NULL);
			else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
				addr = next;
				continue;
			}
			/* fall through */
		} else if (details && details->single_folio &&
			   folio_test_pmd_mappable(details->single_folio) &&
			   next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
			spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
			/*
			 * Take and drop THP pmd lock so that we cannot return
			 * prematurely, while zap_huge_pmd() has cleared *pmd,
			 * but not yet decremented compound_mapcount().
			 */
			spin_unlock(ptl);
		}
		if (pmd_none(*pmd)) {
			addr = next;
			continue;
		}
		addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
		if (addr != next)
			pmd--;
	} while (pmd++, cond_resched(), addr != end);

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
			if (next - addr != HPAGE_PUD_SIZE) {
				mmap_assert_locked(tlb->mm);
				split_huge_pud(vma, pud, addr);
			} else if (zap_huge_pud(tlb, vma, pud, addr))
				goto next;
			/* fall through */
		}
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
		cond_resched();
	} while (pud++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		next = zap_pud_range(tlb, vma, p4d, addr, next, details);
	} while (p4d++, addr = next, addr != end);

	return addr;
}

void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

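	/*
	 * The zap walk below proceeds top-down, pgd -> p4d -> pud -> pmd ->
	 * pte, delegating each level to the zap_*_range() helpers above and
	 * skipping empty or bad entries at every level.
	 */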
	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
	tlb_end_vma(tlb, vma);
}


static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr,
		struct zap_details *details, bool mm_wr_locked)
{
	unsigned long start = max(vma->vm_start, start_addr);
	unsigned long end;

	if (start >= vma->vm_end)
		return;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return;

	if (vma->vm_file)
		uprobe_munmap(vma, start, end);

	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn(vma, 0, 0, mm_wr_locked);

	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
			/*
			 * It is undesirable to test vma->vm_file as it
			 * should be non-null for valid hugetlb area.
			 * However, vm_file will be NULL in the error
			 * cleanup path of mmap_region. When
			 * hugetlbfs ->mmap method fails,
			 * mmap_region() nullifies vma->vm_file
			 * before calling this function to clean up.
			 * Since no pte has actually been setup, it is
			 * safe to do nothing in this case.
			 */
			if (vma->vm_file) {
				zap_flags_t zap_flags = details ?
					details->zap_flags : 0;
				__unmap_hugepage_range(tlb, vma, start, end,
						       NULL, zap_flags);
			}
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @mas: the maple state
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 * @tree_end: The maximum index to check
 * @mm_wr_locked: lock flag
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between @start_addr and @end_addr will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr, unsigned long tree_end,
		bool mm_wr_locked)
{
	struct mmu_notifier_range range;
	struct zap_details details = {
		.zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
		/* Careful - we need to zap private pages too! */
		.even_cows = true,
	};

	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
				start_addr, end_addr);
	mmu_notifier_invalidate_range_start(&range);
	do {
		unsigned long start = start_addr;
		unsigned long end = end_addr;
		hugetlb_zap_begin(vma, &start, &end);
		unmap_single_vma(tlb, vma, start, end, &details,
				 mm_wr_locked);
		hugetlb_zap_end(vma, &details);
	} while ((vma = mas_find(mas, tree_end - 1)) != NULL);
	mmu_notifier_invalidate_range_end(&range);
}

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	const unsigned long end = address + size;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	lru_add_drain();
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				address, end);
	hugetlb_zap_begin(vma, &range.start, &range.end);
	tlb_gather_mmu(&tlb, vma->vm_mm);
	update_hiwater_rss(vma->vm_mm);
	mmu_notifier_invalidate_range_start(&range);
	/*
	 * unmap 'address-end' not 'range.start-range.end' as range
	 * could have been expanded for hugetlb pmd sharing.
	 */
	unmap_single_vma(&tlb, vma, address, end, details, false);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);
	hugetlb_zap_end(vma, details);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
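 *
 * Illustrative sketch (hypothetical driver, not from this file; "my_dev"
 * and its fields are assumptions): revoking a previously established
 * VM_PFNMAP mapping, with the caller assumed to hold the mmap_lock:
 *
 *	static void my_dev_revoke_mapping(struct my_dev *dev)
 *	{
 *		struct vm_area_struct *vma = dev->vma;
 *
 *		zap_vma_ptes(vma, vma->vm_start,
 *			     vma->vm_end - vma->vm_start);
 *	}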
 *
 */
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (!range_in_vma(vma, address, address + size) ||
	    !(vma->vm_flags & VM_PFNMAP))
		return;

	zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return NULL;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));
	return pmd;
}

pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
			spinlock_t **ptl)
{
	pmd_t *pmd = walk_to_pmd(mm, addr);

	if (!pmd)
		return NULL;
	return pte_alloc_map_lock(mm, pmd, addr, ptl);
}

static int validate_page_before_insert(struct page *page)
{
	if (PageAnon(page) || PageSlab(page) || page_has_type(page))
		return -EINVAL;
	flush_dcache_page(page);
	return 0;
}

static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
			unsigned long addr, struct page *page, pgprot_t prot)
{
	if (!pte_none(ptep_get(pte)))
		return -EBUSY;
	/* Ok, finally just insert the thing.. */
	get_page(page);
	inc_mm_counter(vma->vm_mm, mm_counter_file(page));
	page_add_file_rmap(page, vma, false);
	set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
	return 0;
}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page, pgprot_t prot)
{
	int retval;
	pte_t *pte;
	spinlock_t *ptl;

	retval = validate_page_before_insert(page);
	if (retval)
		goto out;
	retval = -ENOMEM;
	pte = get_locked_pte(vma->vm_mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
			unsigned long addr, struct page *page, pgprot_t prot)
{
	int err;

	if (!page_count(page))
		return -EINVAL;
	err = validate_page_before_insert(page);
	if (err)
		return err;
	return insert_page_into_pte_locked(vma, pte, addr, page, prot);
}

/* insert_pages() amortizes the cost of spinlock operations
 * when inserting pages in a loop.
 */
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
			struct page **pages, unsigned long *num, pgprot_t prot)
{
	pmd_t *pmd = NULL;
	pte_t *start_pte, *pte;
	spinlock_t *pte_lock;
	struct mm_struct *const mm = vma->vm_mm;
	unsigned long curr_page_idx = 0;
	unsigned long remaining_pages_total = *num;
	unsigned long pages_to_write_in_pmd;
	int ret;
more:
	ret = -EFAULT;
	pmd = walk_to_pmd(mm, addr);
	if (!pmd)
		goto out;

	pages_to_write_in_pmd = min_t(unsigned long,
		remaining_pages_total, PTRS_PER_PTE - pte_index(addr));

	/* Allocate the PTE if necessary; takes PMD lock once only. */
	ret = -ENOMEM;
	if (pte_alloc(mm, pmd))
		goto out;

	while (pages_to_write_in_pmd) {
		int pte_idx = 0;
		const int batch_size = min_t(int, pages_to_write_in_pmd, 8);

		start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
		if (!start_pte) {
			ret = -EFAULT;
			goto out;
		}
		for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
			int err = insert_page_in_batch_locked(vma, pte,
				addr, pages[curr_page_idx], prot);
			if (unlikely(err)) {
				pte_unmap_unlock(start_pte, pte_lock);
				ret = err;
				remaining_pages_total -= pte_idx;
				goto out;
			}
			addr += PAGE_SIZE;
			++curr_page_idx;
		}
		pte_unmap_unlock(start_pte, pte_lock);
		pages_to_write_in_pmd -= batch_size;
		remaining_pages_total -= batch_size;
	}
	if (remaining_pages_total)
		goto more;
	ret = 0;
out:
	*num = remaining_pages_total;
	return ret;
}

/**
 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
 * @vma: user vma to map to
 * @addr: target start user address of these pages
 * @pages: source kernel pages
 * @num: in: number of pages to map. out: number of pages that were *not*
 * mapped. (0 means all pages were successfully mapped).
 *
 * Preferred over vm_insert_page() when inserting multiple pages.
 *
 * In case of error, we may have mapped a subset of the provided
 * pages. It is the caller's responsibility to account for this case.
 *
 * The same restrictions apply as in vm_insert_page().
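 *
 * Minimal illustrative sketch (hypothetical caller; "my_pages" and
 * "npages" are assumptions, not part of this file) showing the in/out
 * semantics of @num from an ->mmap() handler:
 *
 *	unsigned long num = npages;
 *	int err = vm_insert_pages(vma, vma->vm_start, my_pages, &num);
 *
 *	if (err)
 *		pr_debug("inserted %lu of %lu pages\n", npages - num, npages);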
 */
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
			struct page **pages, unsigned long *num)
{
	const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;

	if (addr < vma->vm_start || end_addr >= vma->vm_end)
		return -EFAULT;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(mmap_read_trylock(vma->vm_mm));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vm_flags_set(vma, VM_MIXEDMAP);
	}
	/* Defer page refcount checking till we're about to map that page. */
	return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_pages);

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 *
 * Return: %0 on success, negative error code otherwise.
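 *
 * A minimal illustrative sketch (hypothetical driver; "my_buf" and its
 * fields are assumptions, not part of this file) of an ->mmap() handler
 * that inserts one kernel page per user page:
 *
 *	static int my_buf_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct my_buf *buf = file->private_data;
 *		unsigned long i, npages = vma_pages(vma);
 *
 *		if (npages > buf->npages)
 *			return -ENXIO;
 *		for (i = 0; i < npages; i++) {
 *			int err = vm_insert_page(vma,
 *					vma->vm_start + i * PAGE_SIZE,
 *					buf->pages[i]);
 *			if (err)
 *				return err;
 *		}
 *		return 0;
 *	}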
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(mmap_read_trylock(vma->vm_mm));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vm_flags_set(vma, VM_MIXEDMAP);
	}
	return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);

/*
 * __vm_map_pages - map a range of kernel pages into a user vma
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 * @offset: user's requested vm_pgoff
 *
 * This allows drivers to map a range of kernel pages into a user vma.
 *
 * Return: 0 on success and error code otherwise.
 */
static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num, unsigned long offset)
{
	unsigned long count = vma_pages(vma);
	unsigned long uaddr = vma->vm_start;
	int ret, i;

	/* Fail if the user requested offset is beyond the end of the object */
	if (offset >= num)
		return -ENXIO;

	/* Fail if the user requested size exceeds available object size */
	if (count > num - offset)
		return -ENXIO;

	for (i = 0; i < count; i++) {
		ret = vm_insert_page(vma, uaddr, pages[offset + i]);
		if (ret < 0)
			return ret;
		uaddr += PAGE_SIZE;
	}

	return 0;
}

/**
 * vm_map_pages - map a range of kernel pages starting at a non-zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object consisting of @num pages, catering for the user's
 * requested vm_pgoff.
 *
 * If we fail to insert any page into the vma, the function will return
 * immediately leaving any previously inserted pages present.  Callers
 * from the mmap handler may immediately return the error as their caller
 * will destroy the vma, removing any successfully inserted pages. Other
 * callers should make their own arrangements for calling unmap_region().
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
}
EXPORT_SYMBOL(vm_map_pages);

/**
 * vm_map_pages_zero - map a range of kernel pages starting at offset 0
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that it explicitly sets the offset
 * to 0. This function is intended for drivers that do not take vm_pgoff
 * into account.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	return __vm_map_pages(vma, pages, num, 0);
}
EXPORT_SYMBOL(vm_map_pages_zero);

static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn, pgprot_t prot, bool mkwrite)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte, entry;
	spinlock_t *ptl;

	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		return VM_FAULT_OOM;
	entry = ptep_get(pte);
	if (!pte_none(entry)) {
		if (mkwrite) {
			/*
			 * For read faults on private mappings the PFN passed
			 * in may not match the PFN we have mapped if the
			 * mapped PFN is a writeable COW page.  In the mkwrite
			 * case we are creating a writable PTE for a shared
			 * mapping and we expect the PFNs to match. If they
			 * don't match, we are likely racing with block
			 * allocation and mapping invalidation so just skip the
			 * update.
213962306a36Sopenharmony_ci */ 214062306a36Sopenharmony_ci if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) { 214162306a36Sopenharmony_ci WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); 214262306a36Sopenharmony_ci goto out_unlock; 214362306a36Sopenharmony_ci } 214462306a36Sopenharmony_ci entry = pte_mkyoung(entry); 214562306a36Sopenharmony_ci entry = maybe_mkwrite(pte_mkdirty(entry), vma); 214662306a36Sopenharmony_ci if (ptep_set_access_flags(vma, addr, pte, entry, 1)) 214762306a36Sopenharmony_ci update_mmu_cache(vma, addr, pte); 214862306a36Sopenharmony_ci } 214962306a36Sopenharmony_ci goto out_unlock; 215062306a36Sopenharmony_ci } 215162306a36Sopenharmony_ci 215262306a36Sopenharmony_ci /* Ok, finally just insert the thing.. */ 215362306a36Sopenharmony_ci if (pfn_t_devmap(pfn)) 215462306a36Sopenharmony_ci entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); 215562306a36Sopenharmony_ci else 215662306a36Sopenharmony_ci entry = pte_mkspecial(pfn_t_pte(pfn, prot)); 215762306a36Sopenharmony_ci 215862306a36Sopenharmony_ci if (mkwrite) { 215962306a36Sopenharmony_ci entry = pte_mkyoung(entry); 216062306a36Sopenharmony_ci entry = maybe_mkwrite(pte_mkdirty(entry), vma); 216162306a36Sopenharmony_ci } 216262306a36Sopenharmony_ci 216362306a36Sopenharmony_ci set_pte_at(mm, addr, pte, entry); 216462306a36Sopenharmony_ci update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ 216562306a36Sopenharmony_ci 216662306a36Sopenharmony_ciout_unlock: 216762306a36Sopenharmony_ci pte_unmap_unlock(pte, ptl); 216862306a36Sopenharmony_ci return VM_FAULT_NOPAGE; 216962306a36Sopenharmony_ci} 217062306a36Sopenharmony_ci 217162306a36Sopenharmony_ci/** 217262306a36Sopenharmony_ci * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot 217362306a36Sopenharmony_ci * @vma: user vma to map to 217462306a36Sopenharmony_ci * @addr: target user address of this page 217562306a36Sopenharmony_ci * @pfn: source kernel pfn 217662306a36Sopenharmony_ci * @pgprot: pgprot flags for the inserted page 217762306a36Sopenharmony_ci * 217862306a36Sopenharmony_ci * This is exactly like vmf_insert_pfn(), except that it allows drivers 217962306a36Sopenharmony_ci * to override pgprot on a per-page basis. 218062306a36Sopenharmony_ci * 218162306a36Sopenharmony_ci * This only makes sense for IO mappings, and it makes no sense for 218262306a36Sopenharmony_ci * COW mappings. In general, using multiple vmas is preferable; 218362306a36Sopenharmony_ci * vmf_insert_pfn_prot should only be used if using multiple VMAs is 218462306a36Sopenharmony_ci * impractical. 218562306a36Sopenharmony_ci * 218662306a36Sopenharmony_ci * pgprot typically only differs from @vma->vm_page_prot when drivers set 218762306a36Sopenharmony_ci * caching- and encryption bits different than those of @vma->vm_page_prot, 218862306a36Sopenharmony_ci * because the caching- or encryption mode may not be known at mmap() time. 218962306a36Sopenharmony_ci * 219062306a36Sopenharmony_ci * This is ok as long as @vma->vm_page_prot is not used by the core vm 219162306a36Sopenharmony_ci * to set caching and encryption bits for those vmas (except for COW pages). 219262306a36Sopenharmony_ci * This is ensured by core vm only modifying these page table entries using 219362306a36Sopenharmony_ci * functions that don't touch caching- or encryption bits, using pte_modify() 219462306a36Sopenharmony_ci * if needed. (See for example mprotect()). 
219562306a36Sopenharmony_ci * 219662306a36Sopenharmony_ci * Also when new page-table entries are created, this is only done using the 219762306a36Sopenharmony_ci * fault() callback, and never using the value of vma->vm_page_prot, 219862306a36Sopenharmony_ci * except for page-table entries that point to anonymous pages as the result 219962306a36Sopenharmony_ci * of COW. 220062306a36Sopenharmony_ci * 220162306a36Sopenharmony_ci * Context: Process context. May allocate using %GFP_KERNEL. 220262306a36Sopenharmony_ci * Return: vm_fault_t value. 220362306a36Sopenharmony_ci */ 220462306a36Sopenharmony_civm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, 220562306a36Sopenharmony_ci unsigned long pfn, pgprot_t pgprot) 220662306a36Sopenharmony_ci{ 220762306a36Sopenharmony_ci /* 220862306a36Sopenharmony_ci * Technically, architectures with pte_special can avoid all these 220962306a36Sopenharmony_ci * restrictions (same for remap_pfn_range). However we would like 221062306a36Sopenharmony_ci * consistency in testing and feature parity among all, so we should 221162306a36Sopenharmony_ci * try to keep these invariants in place for everybody. 221262306a36Sopenharmony_ci */ 221362306a36Sopenharmony_ci BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); 221462306a36Sopenharmony_ci BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 221562306a36Sopenharmony_ci (VM_PFNMAP|VM_MIXEDMAP)); 221662306a36Sopenharmony_ci BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 221762306a36Sopenharmony_ci BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); 221862306a36Sopenharmony_ci 221962306a36Sopenharmony_ci if (addr < vma->vm_start || addr >= vma->vm_end) 222062306a36Sopenharmony_ci return VM_FAULT_SIGBUS; 222162306a36Sopenharmony_ci 222262306a36Sopenharmony_ci if (!pfn_modify_allowed(pfn, pgprot)) 222362306a36Sopenharmony_ci return VM_FAULT_SIGBUS; 222462306a36Sopenharmony_ci 222562306a36Sopenharmony_ci track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); 222662306a36Sopenharmony_ci 222762306a36Sopenharmony_ci return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, 222862306a36Sopenharmony_ci false); 222962306a36Sopenharmony_ci} 223062306a36Sopenharmony_ciEXPORT_SYMBOL(vmf_insert_pfn_prot); 223162306a36Sopenharmony_ci 223262306a36Sopenharmony_ci/** 223362306a36Sopenharmony_ci * vmf_insert_pfn - insert single pfn into user vma 223462306a36Sopenharmony_ci * @vma: user vma to map to 223562306a36Sopenharmony_ci * @addr: target user address of this page 223662306a36Sopenharmony_ci * @pfn: source kernel pfn 223762306a36Sopenharmony_ci * 223862306a36Sopenharmony_ci * Similar to vm_insert_page, this allows drivers to insert individual pages 223962306a36Sopenharmony_ci * they've allocated into a user vma. Same comments apply. 224062306a36Sopenharmony_ci * 224162306a36Sopenharmony_ci * This function should only be called from a vm_ops->fault handler, and 224262306a36Sopenharmony_ci * in that case the handler should return the result of this function. 224362306a36Sopenharmony_ci * 224462306a36Sopenharmony_ci * vma cannot be a COW mapping. 224562306a36Sopenharmony_ci * 224662306a36Sopenharmony_ci * As this is called only for pages that do not currently exist, we 224762306a36Sopenharmony_ci * do not need to flush old virtual caches or the TLB. 224862306a36Sopenharmony_ci * 224962306a36Sopenharmony_ci * Context: Process context. May allocate using %GFP_KERNEL. 225062306a36Sopenharmony_ci * Return: vm_fault_t value. 
225162306a36Sopenharmony_ci */ 225262306a36Sopenharmony_civm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 225362306a36Sopenharmony_ci unsigned long pfn) 225462306a36Sopenharmony_ci{ 225562306a36Sopenharmony_ci return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); 225662306a36Sopenharmony_ci} 225762306a36Sopenharmony_ciEXPORT_SYMBOL(vmf_insert_pfn); 225862306a36Sopenharmony_ci 225962306a36Sopenharmony_cistatic bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) 226062306a36Sopenharmony_ci{ 226162306a36Sopenharmony_ci /* these checks mirror the abort conditions in vm_normal_page */ 226262306a36Sopenharmony_ci if (vma->vm_flags & VM_MIXEDMAP) 226362306a36Sopenharmony_ci return true; 226462306a36Sopenharmony_ci if (pfn_t_devmap(pfn)) 226562306a36Sopenharmony_ci return true; 226662306a36Sopenharmony_ci if (pfn_t_special(pfn)) 226762306a36Sopenharmony_ci return true; 226862306a36Sopenharmony_ci if (is_zero_pfn(pfn_t_to_pfn(pfn))) 226962306a36Sopenharmony_ci return true; 227062306a36Sopenharmony_ci return false; 227162306a36Sopenharmony_ci} 227262306a36Sopenharmony_ci 227362306a36Sopenharmony_cistatic vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, 227462306a36Sopenharmony_ci unsigned long addr, pfn_t pfn, bool mkwrite) 227562306a36Sopenharmony_ci{ 227662306a36Sopenharmony_ci pgprot_t pgprot = vma->vm_page_prot; 227762306a36Sopenharmony_ci int err; 227862306a36Sopenharmony_ci 227962306a36Sopenharmony_ci BUG_ON(!vm_mixed_ok(vma, pfn)); 228062306a36Sopenharmony_ci 228162306a36Sopenharmony_ci if (addr < vma->vm_start || addr >= vma->vm_end) 228262306a36Sopenharmony_ci return VM_FAULT_SIGBUS; 228362306a36Sopenharmony_ci 228462306a36Sopenharmony_ci track_pfn_insert(vma, &pgprot, pfn); 228562306a36Sopenharmony_ci 228662306a36Sopenharmony_ci if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) 228762306a36Sopenharmony_ci return VM_FAULT_SIGBUS; 228862306a36Sopenharmony_ci 228962306a36Sopenharmony_ci /* 229062306a36Sopenharmony_ci * If we don't have pte special, then we have to use the pfn_valid() 229162306a36Sopenharmony_ci * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* 229262306a36Sopenharmony_ci * refcount the page if pfn_valid is true (hence insert_page rather 229362306a36Sopenharmony_ci * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP 229462306a36Sopenharmony_ci * without pte special, it would there be refcounted as a normal page. 229562306a36Sopenharmony_ci */ 229662306a36Sopenharmony_ci if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && 229762306a36Sopenharmony_ci !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { 229862306a36Sopenharmony_ci struct page *page; 229962306a36Sopenharmony_ci 230062306a36Sopenharmony_ci /* 230162306a36Sopenharmony_ci * At this point we are committed to insert_page() 230262306a36Sopenharmony_ci * regardless of whether the caller specified flags that 230362306a36Sopenharmony_ci * result in pfn_t_has_page() == false. 
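/*
 * Editor's illustrative sketch, not part of the original mm/memory.c:
 * a ->fault handler for a VM_PFNMAP mapping of device MMIO, built on
 * vmf_insert_pfn() above. my_mmio_phys, my_mmio_fault and
 * my_mmio_vm_ops are hypothetical; the driver's ->mmap is assumed to
 * have set VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP and, where
 * special caching is needed, to use vmf_insert_pfn_prot() with
 * something like pgprot_writecombine() instead.
 */
extern phys_addr_t my_mmio_phys;	/* hypothetical aperture base */

static vm_fault_t my_mmio_fault(struct vm_fault *vmf)
{
	/* vmf->pgoff is the page offset of the fault within the mapping. */
	unsigned long pfn = (my_mmio_phys >> PAGE_SHIFT) + vmf->pgoff;

	/*
	 * Range checking of vmf->pgoff against the aperture size is
	 * omitted for brevity. The handler returns the result of
	 * vmf_insert_pfn() directly, as its kernel-doc above asks.
	 */
	return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
}

static const struct vm_operations_struct my_mmio_vm_ops = {
	.fault = my_mmio_fault,
};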
230462306a36Sopenharmony_ci */ 230562306a36Sopenharmony_ci page = pfn_to_page(pfn_t_to_pfn(pfn)); 230662306a36Sopenharmony_ci err = insert_page(vma, addr, page, pgprot); 230762306a36Sopenharmony_ci } else { 230862306a36Sopenharmony_ci return insert_pfn(vma, addr, pfn, pgprot, mkwrite); 230962306a36Sopenharmony_ci } 231062306a36Sopenharmony_ci 231162306a36Sopenharmony_ci if (err == -ENOMEM) 231262306a36Sopenharmony_ci return VM_FAULT_OOM; 231362306a36Sopenharmony_ci if (err < 0 && err != -EBUSY) 231462306a36Sopenharmony_ci return VM_FAULT_SIGBUS; 231562306a36Sopenharmony_ci 231662306a36Sopenharmony_ci return VM_FAULT_NOPAGE; 231762306a36Sopenharmony_ci} 231862306a36Sopenharmony_ci 231962306a36Sopenharmony_civm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 232062306a36Sopenharmony_ci pfn_t pfn) 232162306a36Sopenharmony_ci{ 232262306a36Sopenharmony_ci return __vm_insert_mixed(vma, addr, pfn, false); 232362306a36Sopenharmony_ci} 232462306a36Sopenharmony_ciEXPORT_SYMBOL(vmf_insert_mixed); 232562306a36Sopenharmony_ci 232662306a36Sopenharmony_ci/* 232762306a36Sopenharmony_ci * If the insertion of PTE failed because someone else already added a 232862306a36Sopenharmony_ci * different entry in the mean time, we treat that as success as we assume 232962306a36Sopenharmony_ci * the same entry was actually inserted. 233062306a36Sopenharmony_ci */ 233162306a36Sopenharmony_civm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, 233262306a36Sopenharmony_ci unsigned long addr, pfn_t pfn) 233362306a36Sopenharmony_ci{ 233462306a36Sopenharmony_ci return __vm_insert_mixed(vma, addr, pfn, true); 233562306a36Sopenharmony_ci} 233662306a36Sopenharmony_ciEXPORT_SYMBOL(vmf_insert_mixed_mkwrite); 233762306a36Sopenharmony_ci 233862306a36Sopenharmony_ci/* 233962306a36Sopenharmony_ci * maps a range of physical memory into the requested pages. the old 234062306a36Sopenharmony_ci * mappings are removed. 
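/*
 * Editor's illustrative sketch, not part of the original mm/memory.c:
 * the same idea for a VM_MIXEDMAP mapping, where some offsets are
 * backed by struct pages and some are not. my_lookup_pfn() is a
 * hypothetical driver lookup; pfn_to_pfn_t() (from <linux/pfn_t.h>,
 * already included above) wraps a raw pfn in the pfn_t these helpers
 * take.
 */
extern int my_lookup_pfn(struct vm_area_struct *vma, pgoff_t pgoff,
			 unsigned long *pfn);	/* hypothetical */

static vm_fault_t my_mixed_fault(struct vm_fault *vmf)
{
	unsigned long pfn;

	if (my_lookup_pfn(vmf->vma, vmf->pgoff, &pfn))
		return VM_FAULT_SIGBUS;

	/*
	 * __vm_insert_mixed() above picks insert_page() or insert_pfn()
	 * depending on CONFIG_ARCH_HAS_PTE_SPECIAL and pfn_valid().
	 */
	return vmf_insert_mixed(vmf->vma, vmf->address, pfn_to_pfn_t(pfn));
}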
any references to nonexistent pages results 234162306a36Sopenharmony_ci * in null mappings (currently treated as "copy-on-access") 234262306a36Sopenharmony_ci */ 234362306a36Sopenharmony_cistatic int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, 234462306a36Sopenharmony_ci unsigned long addr, unsigned long end, 234562306a36Sopenharmony_ci unsigned long pfn, pgprot_t prot) 234662306a36Sopenharmony_ci{ 234762306a36Sopenharmony_ci pte_t *pte, *mapped_pte; 234862306a36Sopenharmony_ci spinlock_t *ptl; 234962306a36Sopenharmony_ci int err = 0; 235062306a36Sopenharmony_ci 235162306a36Sopenharmony_ci mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); 235262306a36Sopenharmony_ci if (!pte) 235362306a36Sopenharmony_ci return -ENOMEM; 235462306a36Sopenharmony_ci arch_enter_lazy_mmu_mode(); 235562306a36Sopenharmony_ci do { 235662306a36Sopenharmony_ci BUG_ON(!pte_none(ptep_get(pte))); 235762306a36Sopenharmony_ci if (!pfn_modify_allowed(pfn, prot)) { 235862306a36Sopenharmony_ci err = -EACCES; 235962306a36Sopenharmony_ci break; 236062306a36Sopenharmony_ci } 236162306a36Sopenharmony_ci set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); 236262306a36Sopenharmony_ci pfn++; 236362306a36Sopenharmony_ci } while (pte++, addr += PAGE_SIZE, addr != end); 236462306a36Sopenharmony_ci arch_leave_lazy_mmu_mode(); 236562306a36Sopenharmony_ci pte_unmap_unlock(mapped_pte, ptl); 236662306a36Sopenharmony_ci return err; 236762306a36Sopenharmony_ci} 236862306a36Sopenharmony_ci 236962306a36Sopenharmony_cistatic inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, 237062306a36Sopenharmony_ci unsigned long addr, unsigned long end, 237162306a36Sopenharmony_ci unsigned long pfn, pgprot_t prot) 237262306a36Sopenharmony_ci{ 237362306a36Sopenharmony_ci pmd_t *pmd; 237462306a36Sopenharmony_ci unsigned long next; 237562306a36Sopenharmony_ci int err; 237662306a36Sopenharmony_ci 237762306a36Sopenharmony_ci pfn -= addr >> PAGE_SHIFT; 237862306a36Sopenharmony_ci pmd = pmd_alloc(mm, pud, addr); 237962306a36Sopenharmony_ci if (!pmd) 238062306a36Sopenharmony_ci return -ENOMEM; 238162306a36Sopenharmony_ci VM_BUG_ON(pmd_trans_huge(*pmd)); 238262306a36Sopenharmony_ci do { 238362306a36Sopenharmony_ci next = pmd_addr_end(addr, end); 238462306a36Sopenharmony_ci err = remap_pte_range(mm, pmd, addr, next, 238562306a36Sopenharmony_ci pfn + (addr >> PAGE_SHIFT), prot); 238662306a36Sopenharmony_ci if (err) 238762306a36Sopenharmony_ci return err; 238862306a36Sopenharmony_ci } while (pmd++, addr = next, addr != end); 238962306a36Sopenharmony_ci return 0; 239062306a36Sopenharmony_ci} 239162306a36Sopenharmony_ci 239262306a36Sopenharmony_cistatic inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, 239362306a36Sopenharmony_ci unsigned long addr, unsigned long end, 239462306a36Sopenharmony_ci unsigned long pfn, pgprot_t prot) 239562306a36Sopenharmony_ci{ 239662306a36Sopenharmony_ci pud_t *pud; 239762306a36Sopenharmony_ci unsigned long next; 239862306a36Sopenharmony_ci int err; 239962306a36Sopenharmony_ci 240062306a36Sopenharmony_ci pfn -= addr >> PAGE_SHIFT; 240162306a36Sopenharmony_ci pud = pud_alloc(mm, p4d, addr); 240262306a36Sopenharmony_ci if (!pud) 240362306a36Sopenharmony_ci return -ENOMEM; 240462306a36Sopenharmony_ci do { 240562306a36Sopenharmony_ci next = pud_addr_end(addr, end); 240662306a36Sopenharmony_ci err = remap_pmd_range(mm, pud, addr, next, 240762306a36Sopenharmony_ci pfn + (addr >> PAGE_SHIFT), prot); 240862306a36Sopenharmony_ci if (err) 240962306a36Sopenharmony_ci return err; 
241062306a36Sopenharmony_ci } while (pud++, addr = next, addr != end); 241162306a36Sopenharmony_ci return 0; 241262306a36Sopenharmony_ci} 241362306a36Sopenharmony_ci 241462306a36Sopenharmony_cistatic inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, 241562306a36Sopenharmony_ci unsigned long addr, unsigned long end, 241662306a36Sopenharmony_ci unsigned long pfn, pgprot_t prot) 241762306a36Sopenharmony_ci{ 241862306a36Sopenharmony_ci p4d_t *p4d; 241962306a36Sopenharmony_ci unsigned long next; 242062306a36Sopenharmony_ci int err; 242162306a36Sopenharmony_ci 242262306a36Sopenharmony_ci pfn -= addr >> PAGE_SHIFT; 242362306a36Sopenharmony_ci p4d = p4d_alloc(mm, pgd, addr); 242462306a36Sopenharmony_ci if (!p4d) 242562306a36Sopenharmony_ci return -ENOMEM; 242662306a36Sopenharmony_ci do { 242762306a36Sopenharmony_ci next = p4d_addr_end(addr, end); 242862306a36Sopenharmony_ci err = remap_pud_range(mm, p4d, addr, next, 242962306a36Sopenharmony_ci pfn + (addr >> PAGE_SHIFT), prot); 243062306a36Sopenharmony_ci if (err) 243162306a36Sopenharmony_ci return err; 243262306a36Sopenharmony_ci } while (p4d++, addr = next, addr != end); 243362306a36Sopenharmony_ci return 0; 243462306a36Sopenharmony_ci} 243562306a36Sopenharmony_ci 243662306a36Sopenharmony_ci/* 243762306a36Sopenharmony_ci * Variant of remap_pfn_range that does not call track_pfn_remap. The caller 243862306a36Sopenharmony_ci * must have pre-validated the caching bits of the pgprot_t. 243962306a36Sopenharmony_ci */ 244062306a36Sopenharmony_ciint remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, 244162306a36Sopenharmony_ci unsigned long pfn, unsigned long size, pgprot_t prot) 244262306a36Sopenharmony_ci{ 244362306a36Sopenharmony_ci pgd_t *pgd; 244462306a36Sopenharmony_ci unsigned long next; 244562306a36Sopenharmony_ci unsigned long end = addr + PAGE_ALIGN(size); 244662306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 244762306a36Sopenharmony_ci int err; 244862306a36Sopenharmony_ci 244962306a36Sopenharmony_ci if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) 245062306a36Sopenharmony_ci return -EINVAL; 245162306a36Sopenharmony_ci 245262306a36Sopenharmony_ci /* 245362306a36Sopenharmony_ci * Physically remapped pages are special. Tell the 245462306a36Sopenharmony_ci * rest of the world about it: 245562306a36Sopenharmony_ci * VM_IO tells people not to look at these pages 245662306a36Sopenharmony_ci * (accesses can have side effects). 245762306a36Sopenharmony_ci * VM_PFNMAP tells the core MM that the base pages are just 245862306a36Sopenharmony_ci * raw PFN mappings, and do not have a "struct page" associated 245962306a36Sopenharmony_ci * with them. 246062306a36Sopenharmony_ci * VM_DONTEXPAND 246162306a36Sopenharmony_ci * Disable vma merging and expanding with mremap(). 246262306a36Sopenharmony_ci * VM_DONTDUMP 246362306a36Sopenharmony_ci * Omit vma from core dump, even when VM_IO turned off. 246462306a36Sopenharmony_ci * 246562306a36Sopenharmony_ci * There's a horrible special case to handle copy-on-write 246662306a36Sopenharmony_ci * behaviour that some programs depend on. We mark the "original" 246762306a36Sopenharmony_ci * un-COW'ed pages by matching them up with "vma->vm_pgoff". 246862306a36Sopenharmony_ci * See vm_normal_page() for details. 
246962306a36Sopenharmony_ci */ 247062306a36Sopenharmony_ci if (is_cow_mapping(vma->vm_flags)) { 247162306a36Sopenharmony_ci if (addr != vma->vm_start || end != vma->vm_end) 247262306a36Sopenharmony_ci return -EINVAL; 247362306a36Sopenharmony_ci vma->vm_pgoff = pfn; 247462306a36Sopenharmony_ci } 247562306a36Sopenharmony_ci 247662306a36Sopenharmony_ci vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); 247762306a36Sopenharmony_ci 247862306a36Sopenharmony_ci BUG_ON(addr >= end); 247962306a36Sopenharmony_ci pfn -= addr >> PAGE_SHIFT; 248062306a36Sopenharmony_ci pgd = pgd_offset(mm, addr); 248162306a36Sopenharmony_ci flush_cache_range(vma, addr, end); 248262306a36Sopenharmony_ci do { 248362306a36Sopenharmony_ci next = pgd_addr_end(addr, end); 248462306a36Sopenharmony_ci err = remap_p4d_range(mm, pgd, addr, next, 248562306a36Sopenharmony_ci pfn + (addr >> PAGE_SHIFT), prot); 248662306a36Sopenharmony_ci if (err) 248762306a36Sopenharmony_ci return err; 248862306a36Sopenharmony_ci } while (pgd++, addr = next, addr != end); 248962306a36Sopenharmony_ci 249062306a36Sopenharmony_ci return 0; 249162306a36Sopenharmony_ci} 249262306a36Sopenharmony_ci 249362306a36Sopenharmony_ci/** 249462306a36Sopenharmony_ci * remap_pfn_range - remap kernel memory to userspace 249562306a36Sopenharmony_ci * @vma: user vma to map to 249662306a36Sopenharmony_ci * @addr: target page aligned user address to start at 249762306a36Sopenharmony_ci * @pfn: page frame number of kernel physical memory address 249862306a36Sopenharmony_ci * @size: size of mapping area 249962306a36Sopenharmony_ci * @prot: page protection flags for this mapping 250062306a36Sopenharmony_ci * 250162306a36Sopenharmony_ci * Note: this is only safe if the mm semaphore is held when called. 250262306a36Sopenharmony_ci * 250362306a36Sopenharmony_ci * Return: %0 on success, negative error code otherwise. 250462306a36Sopenharmony_ci */ 250562306a36Sopenharmony_ciint remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, 250662306a36Sopenharmony_ci unsigned long pfn, unsigned long size, pgprot_t prot) 250762306a36Sopenharmony_ci{ 250862306a36Sopenharmony_ci int err; 250962306a36Sopenharmony_ci 251062306a36Sopenharmony_ci err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); 251162306a36Sopenharmony_ci if (err) 251262306a36Sopenharmony_ci return -EINVAL; 251362306a36Sopenharmony_ci 251462306a36Sopenharmony_ci err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); 251562306a36Sopenharmony_ci if (err) 251662306a36Sopenharmony_ci untrack_pfn(vma, pfn, PAGE_ALIGN(size), true); 251762306a36Sopenharmony_ci return err; 251862306a36Sopenharmony_ci} 251962306a36Sopenharmony_ciEXPORT_SYMBOL(remap_pfn_range); 252062306a36Sopenharmony_ci 252162306a36Sopenharmony_ci/** 252262306a36Sopenharmony_ci * vm_iomap_memory - remap memory to userspace 252362306a36Sopenharmony_ci * @vma: user vma to map to 252462306a36Sopenharmony_ci * @start: start of the physical memory to be mapped 252562306a36Sopenharmony_ci * @len: size of area 252662306a36Sopenharmony_ci * 252762306a36Sopenharmony_ci * This is a simplified io_remap_pfn_range() for common driver use. The 252862306a36Sopenharmony_ci * driver just needs to give us the physical memory range to be mapped, 252962306a36Sopenharmony_ci * we'll figure out the rest from the vma information. 253062306a36Sopenharmony_ci * 253162306a36Sopenharmony_ci * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get 253262306a36Sopenharmony_ci * whatever write-combining details or similar. 
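/*
 * Editor's illustrative sketch, not part of the original mm/memory.c:
 * the classic driver ->mmap handler that maps a physical window up
 * front with remap_pfn_range(), so no ->fault handler is needed.
 * my_phys_base and my_region_size are hypothetical; real drivers often
 * also adjust vma->vm_page_prot (e.g. pgprot_noncached()) first, as
 * the vm_iomap_memory() kernel-doc nearby notes.
 */
extern phys_addr_t my_phys_base;	/* hypothetical register window base */
extern resource_size_t my_region_size;	/* hypothetical window size */

static int my_io_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;
	unsigned long pfn = (my_phys_base >> PAGE_SHIFT) + vma->vm_pgoff;

	if (vma->vm_pgoff + (size >> PAGE_SHIFT) >
	    (my_region_size >> PAGE_SHIFT))
		return -EINVAL;

	return remap_pfn_range(vma, vma->vm_start, pfn, size,
			       vma->vm_page_prot);
}
/*
 * For the common "map this whole physical range" case, vm_iomap_memory()
 * below performs the offset and length checks itself:
 *
 *	return vm_iomap_memory(vma, my_phys_base, my_region_size);
 */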
253362306a36Sopenharmony_ci * 253462306a36Sopenharmony_ci * Return: %0 on success, negative error code otherwise. 253562306a36Sopenharmony_ci */ 253662306a36Sopenharmony_ciint vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) 253762306a36Sopenharmony_ci{ 253862306a36Sopenharmony_ci unsigned long vm_len, pfn, pages; 253962306a36Sopenharmony_ci 254062306a36Sopenharmony_ci /* Check that the physical memory area passed in looks valid */ 254162306a36Sopenharmony_ci if (start + len < start) 254262306a36Sopenharmony_ci return -EINVAL; 254362306a36Sopenharmony_ci /* 254462306a36Sopenharmony_ci * You *really* shouldn't map things that aren't page-aligned, 254562306a36Sopenharmony_ci * but we've historically allowed it because IO memory might 254662306a36Sopenharmony_ci * just have smaller alignment. 254762306a36Sopenharmony_ci */ 254862306a36Sopenharmony_ci len += start & ~PAGE_MASK; 254962306a36Sopenharmony_ci pfn = start >> PAGE_SHIFT; 255062306a36Sopenharmony_ci pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; 255162306a36Sopenharmony_ci if (pfn + pages < pfn) 255262306a36Sopenharmony_ci return -EINVAL; 255362306a36Sopenharmony_ci 255462306a36Sopenharmony_ci /* We start the mapping 'vm_pgoff' pages into the area */ 255562306a36Sopenharmony_ci if (vma->vm_pgoff > pages) 255662306a36Sopenharmony_ci return -EINVAL; 255762306a36Sopenharmony_ci pfn += vma->vm_pgoff; 255862306a36Sopenharmony_ci pages -= vma->vm_pgoff; 255962306a36Sopenharmony_ci 256062306a36Sopenharmony_ci /* Can we fit all of the mapping? */ 256162306a36Sopenharmony_ci vm_len = vma->vm_end - vma->vm_start; 256262306a36Sopenharmony_ci if (vm_len >> PAGE_SHIFT > pages) 256362306a36Sopenharmony_ci return -EINVAL; 256462306a36Sopenharmony_ci 256562306a36Sopenharmony_ci /* Ok, let it rip */ 256662306a36Sopenharmony_ci return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); 256762306a36Sopenharmony_ci} 256862306a36Sopenharmony_ciEXPORT_SYMBOL(vm_iomap_memory); 256962306a36Sopenharmony_ci 257062306a36Sopenharmony_cistatic int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, 257162306a36Sopenharmony_ci unsigned long addr, unsigned long end, 257262306a36Sopenharmony_ci pte_fn_t fn, void *data, bool create, 257362306a36Sopenharmony_ci pgtbl_mod_mask *mask) 257462306a36Sopenharmony_ci{ 257562306a36Sopenharmony_ci pte_t *pte, *mapped_pte; 257662306a36Sopenharmony_ci int err = 0; 257762306a36Sopenharmony_ci spinlock_t *ptl; 257862306a36Sopenharmony_ci 257962306a36Sopenharmony_ci if (create) { 258062306a36Sopenharmony_ci mapped_pte = pte = (mm == &init_mm) ? 258162306a36Sopenharmony_ci pte_alloc_kernel_track(pmd, addr, mask) : 258262306a36Sopenharmony_ci pte_alloc_map_lock(mm, pmd, addr, &ptl); 258362306a36Sopenharmony_ci if (!pte) 258462306a36Sopenharmony_ci return -ENOMEM; 258562306a36Sopenharmony_ci } else { 258662306a36Sopenharmony_ci mapped_pte = pte = (mm == &init_mm) ? 
258762306a36Sopenharmony_ci pte_offset_kernel(pmd, addr) : 258862306a36Sopenharmony_ci pte_offset_map_lock(mm, pmd, addr, &ptl); 258962306a36Sopenharmony_ci if (!pte) 259062306a36Sopenharmony_ci return -EINVAL; 259162306a36Sopenharmony_ci } 259262306a36Sopenharmony_ci 259362306a36Sopenharmony_ci arch_enter_lazy_mmu_mode(); 259462306a36Sopenharmony_ci 259562306a36Sopenharmony_ci if (fn) { 259662306a36Sopenharmony_ci do { 259762306a36Sopenharmony_ci if (create || !pte_none(ptep_get(pte))) { 259862306a36Sopenharmony_ci err = fn(pte++, addr, data); 259962306a36Sopenharmony_ci if (err) 260062306a36Sopenharmony_ci break; 260162306a36Sopenharmony_ci } 260262306a36Sopenharmony_ci } while (addr += PAGE_SIZE, addr != end); 260362306a36Sopenharmony_ci } 260462306a36Sopenharmony_ci *mask |= PGTBL_PTE_MODIFIED; 260562306a36Sopenharmony_ci 260662306a36Sopenharmony_ci arch_leave_lazy_mmu_mode(); 260762306a36Sopenharmony_ci 260862306a36Sopenharmony_ci if (mm != &init_mm) 260962306a36Sopenharmony_ci pte_unmap_unlock(mapped_pte, ptl); 261062306a36Sopenharmony_ci return err; 261162306a36Sopenharmony_ci} 261262306a36Sopenharmony_ci 261362306a36Sopenharmony_cistatic int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, 261462306a36Sopenharmony_ci unsigned long addr, unsigned long end, 261562306a36Sopenharmony_ci pte_fn_t fn, void *data, bool create, 261662306a36Sopenharmony_ci pgtbl_mod_mask *mask) 261762306a36Sopenharmony_ci{ 261862306a36Sopenharmony_ci pmd_t *pmd; 261962306a36Sopenharmony_ci unsigned long next; 262062306a36Sopenharmony_ci int err = 0; 262162306a36Sopenharmony_ci 262262306a36Sopenharmony_ci BUG_ON(pud_huge(*pud)); 262362306a36Sopenharmony_ci 262462306a36Sopenharmony_ci if (create) { 262562306a36Sopenharmony_ci pmd = pmd_alloc_track(mm, pud, addr, mask); 262662306a36Sopenharmony_ci if (!pmd) 262762306a36Sopenharmony_ci return -ENOMEM; 262862306a36Sopenharmony_ci } else { 262962306a36Sopenharmony_ci pmd = pmd_offset(pud, addr); 263062306a36Sopenharmony_ci } 263162306a36Sopenharmony_ci do { 263262306a36Sopenharmony_ci next = pmd_addr_end(addr, end); 263362306a36Sopenharmony_ci if (pmd_none(*pmd) && !create) 263462306a36Sopenharmony_ci continue; 263562306a36Sopenharmony_ci if (WARN_ON_ONCE(pmd_leaf(*pmd))) 263662306a36Sopenharmony_ci return -EINVAL; 263762306a36Sopenharmony_ci if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) { 263862306a36Sopenharmony_ci if (!create) 263962306a36Sopenharmony_ci continue; 264062306a36Sopenharmony_ci pmd_clear_bad(pmd); 264162306a36Sopenharmony_ci } 264262306a36Sopenharmony_ci err = apply_to_pte_range(mm, pmd, addr, next, 264362306a36Sopenharmony_ci fn, data, create, mask); 264462306a36Sopenharmony_ci if (err) 264562306a36Sopenharmony_ci break; 264662306a36Sopenharmony_ci } while (pmd++, addr = next, addr != end); 264762306a36Sopenharmony_ci 264862306a36Sopenharmony_ci return err; 264962306a36Sopenharmony_ci} 265062306a36Sopenharmony_ci 265162306a36Sopenharmony_cistatic int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, 265262306a36Sopenharmony_ci unsigned long addr, unsigned long end, 265362306a36Sopenharmony_ci pte_fn_t fn, void *data, bool create, 265462306a36Sopenharmony_ci pgtbl_mod_mask *mask) 265562306a36Sopenharmony_ci{ 265662306a36Sopenharmony_ci pud_t *pud; 265762306a36Sopenharmony_ci unsigned long next; 265862306a36Sopenharmony_ci int err = 0; 265962306a36Sopenharmony_ci 266062306a36Sopenharmony_ci if (create) { 266162306a36Sopenharmony_ci pud = pud_alloc_track(mm, p4d, addr, mask); 266262306a36Sopenharmony_ci if (!pud) 
266362306a36Sopenharmony_ci return -ENOMEM; 266462306a36Sopenharmony_ci } else { 266562306a36Sopenharmony_ci pud = pud_offset(p4d, addr); 266662306a36Sopenharmony_ci } 266762306a36Sopenharmony_ci do { 266862306a36Sopenharmony_ci next = pud_addr_end(addr, end); 266962306a36Sopenharmony_ci if (pud_none(*pud) && !create) 267062306a36Sopenharmony_ci continue; 267162306a36Sopenharmony_ci if (WARN_ON_ONCE(pud_leaf(*pud))) 267262306a36Sopenharmony_ci return -EINVAL; 267362306a36Sopenharmony_ci if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) { 267462306a36Sopenharmony_ci if (!create) 267562306a36Sopenharmony_ci continue; 267662306a36Sopenharmony_ci pud_clear_bad(pud); 267762306a36Sopenharmony_ci } 267862306a36Sopenharmony_ci err = apply_to_pmd_range(mm, pud, addr, next, 267962306a36Sopenharmony_ci fn, data, create, mask); 268062306a36Sopenharmony_ci if (err) 268162306a36Sopenharmony_ci break; 268262306a36Sopenharmony_ci } while (pud++, addr = next, addr != end); 268362306a36Sopenharmony_ci 268462306a36Sopenharmony_ci return err; 268562306a36Sopenharmony_ci} 268662306a36Sopenharmony_ci 268762306a36Sopenharmony_cistatic int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, 268862306a36Sopenharmony_ci unsigned long addr, unsigned long end, 268962306a36Sopenharmony_ci pte_fn_t fn, void *data, bool create, 269062306a36Sopenharmony_ci pgtbl_mod_mask *mask) 269162306a36Sopenharmony_ci{ 269262306a36Sopenharmony_ci p4d_t *p4d; 269362306a36Sopenharmony_ci unsigned long next; 269462306a36Sopenharmony_ci int err = 0; 269562306a36Sopenharmony_ci 269662306a36Sopenharmony_ci if (create) { 269762306a36Sopenharmony_ci p4d = p4d_alloc_track(mm, pgd, addr, mask); 269862306a36Sopenharmony_ci if (!p4d) 269962306a36Sopenharmony_ci return -ENOMEM; 270062306a36Sopenharmony_ci } else { 270162306a36Sopenharmony_ci p4d = p4d_offset(pgd, addr); 270262306a36Sopenharmony_ci } 270362306a36Sopenharmony_ci do { 270462306a36Sopenharmony_ci next = p4d_addr_end(addr, end); 270562306a36Sopenharmony_ci if (p4d_none(*p4d) && !create) 270662306a36Sopenharmony_ci continue; 270762306a36Sopenharmony_ci if (WARN_ON_ONCE(p4d_leaf(*p4d))) 270862306a36Sopenharmony_ci return -EINVAL; 270962306a36Sopenharmony_ci if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) { 271062306a36Sopenharmony_ci if (!create) 271162306a36Sopenharmony_ci continue; 271262306a36Sopenharmony_ci p4d_clear_bad(p4d); 271362306a36Sopenharmony_ci } 271462306a36Sopenharmony_ci err = apply_to_pud_range(mm, p4d, addr, next, 271562306a36Sopenharmony_ci fn, data, create, mask); 271662306a36Sopenharmony_ci if (err) 271762306a36Sopenharmony_ci break; 271862306a36Sopenharmony_ci } while (p4d++, addr = next, addr != end); 271962306a36Sopenharmony_ci 272062306a36Sopenharmony_ci return err; 272162306a36Sopenharmony_ci} 272262306a36Sopenharmony_ci 272362306a36Sopenharmony_cistatic int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, 272462306a36Sopenharmony_ci unsigned long size, pte_fn_t fn, 272562306a36Sopenharmony_ci void *data, bool create) 272662306a36Sopenharmony_ci{ 272762306a36Sopenharmony_ci pgd_t *pgd; 272862306a36Sopenharmony_ci unsigned long start = addr, next; 272962306a36Sopenharmony_ci unsigned long end = addr + size; 273062306a36Sopenharmony_ci pgtbl_mod_mask mask = 0; 273162306a36Sopenharmony_ci int err = 0; 273262306a36Sopenharmony_ci 273362306a36Sopenharmony_ci if (WARN_ON(addr >= end)) 273462306a36Sopenharmony_ci return -EINVAL; 273562306a36Sopenharmony_ci 273662306a36Sopenharmony_ci pgd = pgd_offset(mm, addr); 273762306a36Sopenharmony_ci do { 
273862306a36Sopenharmony_ci next = pgd_addr_end(addr, end); 273962306a36Sopenharmony_ci if (pgd_none(*pgd) && !create) 274062306a36Sopenharmony_ci continue; 274162306a36Sopenharmony_ci if (WARN_ON_ONCE(pgd_leaf(*pgd))) 274262306a36Sopenharmony_ci return -EINVAL; 274362306a36Sopenharmony_ci if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { 274462306a36Sopenharmony_ci if (!create) 274562306a36Sopenharmony_ci continue; 274662306a36Sopenharmony_ci pgd_clear_bad(pgd); 274762306a36Sopenharmony_ci } 274862306a36Sopenharmony_ci err = apply_to_p4d_range(mm, pgd, addr, next, 274962306a36Sopenharmony_ci fn, data, create, &mask); 275062306a36Sopenharmony_ci if (err) 275162306a36Sopenharmony_ci break; 275262306a36Sopenharmony_ci } while (pgd++, addr = next, addr != end); 275362306a36Sopenharmony_ci 275462306a36Sopenharmony_ci if (mask & ARCH_PAGE_TABLE_SYNC_MASK) 275562306a36Sopenharmony_ci arch_sync_kernel_mappings(start, start + size); 275662306a36Sopenharmony_ci 275762306a36Sopenharmony_ci return err; 275862306a36Sopenharmony_ci} 275962306a36Sopenharmony_ci 276062306a36Sopenharmony_ci/* 276162306a36Sopenharmony_ci * Scan a region of virtual memory, filling in page tables as necessary 276262306a36Sopenharmony_ci * and calling a provided function on each leaf page table. 276362306a36Sopenharmony_ci */ 276462306a36Sopenharmony_ciint apply_to_page_range(struct mm_struct *mm, unsigned long addr, 276562306a36Sopenharmony_ci unsigned long size, pte_fn_t fn, void *data) 276662306a36Sopenharmony_ci{ 276762306a36Sopenharmony_ci return __apply_to_page_range(mm, addr, size, fn, data, true); 276862306a36Sopenharmony_ci} 276962306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(apply_to_page_range); 277062306a36Sopenharmony_ci 277162306a36Sopenharmony_ci/* 277262306a36Sopenharmony_ci * Scan a region of virtual memory, calling a provided function on 277362306a36Sopenharmony_ci * each leaf page table where it exists. 277462306a36Sopenharmony_ci * 277562306a36Sopenharmony_ci * Unlike apply_to_page_range, this does _not_ fill in page tables 277662306a36Sopenharmony_ci * where they are absent. 277762306a36Sopenharmony_ci */ 277862306a36Sopenharmony_ciint apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, 277962306a36Sopenharmony_ci unsigned long size, pte_fn_t fn, void *data) 278062306a36Sopenharmony_ci{ 278162306a36Sopenharmony_ci return __apply_to_page_range(mm, addr, size, fn, data, false); 278262306a36Sopenharmony_ci} 278362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(apply_to_existing_page_range); 278462306a36Sopenharmony_ci 278562306a36Sopenharmony_ci/* 278662306a36Sopenharmony_ci * handle_pte_fault chooses page fault handler according to an entry which was 278762306a36Sopenharmony_ci * read non-atomically. Before making any commitment, on those architectures 278862306a36Sopenharmony_ci * or configurations (e.g. i386 with PAE) which might give a mix of unmatched 278962306a36Sopenharmony_ci * parts, do_swap_page must check under lock before unmapping the pte and 279062306a36Sopenharmony_ci * proceeding (but do_wp_page is only called after already making such a check; 279162306a36Sopenharmony_ci * and do_anonymous_page can safely check later on). 
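/*
 * Editor's illustrative sketch, not part of the original mm/memory.c:
 * a pte_fn_t callback for the apply_to_*_page_range() helpers above.
 * The callback signature (pte_t *, unsigned long, void *) matches the
 * fn(pte++, addr, data) calls in apply_to_pte_range(). my_count_present
 * and my_count_range are hypothetical.
 */
static int my_count_present(pte_t *pte, unsigned long addr, void *data)
{
	unsigned long *count = data;

	if (!pte_none(ptep_get(pte)))
		(*count)++;
	return 0;	/* returning non-zero aborts the walk */
}

static unsigned long my_count_range(struct mm_struct *mm,
				    unsigned long start, unsigned long size)
{
	unsigned long count = 0;

	/*
	 * The "existing" variant never allocates page tables, so absent
	 * ranges are simply skipped (and empty ptes are filtered out
	 * before the callback runs, making the pte_none() check above
	 * redundant but harmless).
	 */
	apply_to_existing_page_range(mm, start, size, my_count_present, &count);
	return count;
}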
279262306a36Sopenharmony_ci */ 279362306a36Sopenharmony_cistatic inline int pte_unmap_same(struct vm_fault *vmf) 279462306a36Sopenharmony_ci{ 279562306a36Sopenharmony_ci int same = 1; 279662306a36Sopenharmony_ci#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) 279762306a36Sopenharmony_ci if (sizeof(pte_t) > sizeof(unsigned long)) { 279862306a36Sopenharmony_ci spin_lock(vmf->ptl); 279962306a36Sopenharmony_ci same = pte_same(ptep_get(vmf->pte), vmf->orig_pte); 280062306a36Sopenharmony_ci spin_unlock(vmf->ptl); 280162306a36Sopenharmony_ci } 280262306a36Sopenharmony_ci#endif 280362306a36Sopenharmony_ci pte_unmap(vmf->pte); 280462306a36Sopenharmony_ci vmf->pte = NULL; 280562306a36Sopenharmony_ci return same; 280662306a36Sopenharmony_ci} 280762306a36Sopenharmony_ci 280862306a36Sopenharmony_ci/* 280962306a36Sopenharmony_ci * Return: 281062306a36Sopenharmony_ci * 0: copied succeeded 281162306a36Sopenharmony_ci * -EHWPOISON: copy failed due to hwpoison in source page 281262306a36Sopenharmony_ci * -EAGAIN: copied failed (some other reason) 281362306a36Sopenharmony_ci */ 281462306a36Sopenharmony_cistatic inline int __wp_page_copy_user(struct page *dst, struct page *src, 281562306a36Sopenharmony_ci struct vm_fault *vmf) 281662306a36Sopenharmony_ci{ 281762306a36Sopenharmony_ci int ret; 281862306a36Sopenharmony_ci void *kaddr; 281962306a36Sopenharmony_ci void __user *uaddr; 282062306a36Sopenharmony_ci struct vm_area_struct *vma = vmf->vma; 282162306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 282262306a36Sopenharmony_ci unsigned long addr = vmf->address; 282362306a36Sopenharmony_ci 282462306a36Sopenharmony_ci if (likely(src)) { 282562306a36Sopenharmony_ci if (copy_mc_user_highpage(dst, src, addr, vma)) { 282662306a36Sopenharmony_ci memory_failure_queue(page_to_pfn(src), 0); 282762306a36Sopenharmony_ci return -EHWPOISON; 282862306a36Sopenharmony_ci } 282962306a36Sopenharmony_ci return 0; 283062306a36Sopenharmony_ci } 283162306a36Sopenharmony_ci 283262306a36Sopenharmony_ci /* 283362306a36Sopenharmony_ci * If the source page was a PFN mapping, we don't have 283462306a36Sopenharmony_ci * a "struct page" for it. We do a best-effort copy by 283562306a36Sopenharmony_ci * just copying from the original user address. If that 283662306a36Sopenharmony_ci * fails, we just zero-fill it. Live with it. 283762306a36Sopenharmony_ci */ 283862306a36Sopenharmony_ci kaddr = kmap_atomic(dst); 283962306a36Sopenharmony_ci uaddr = (void __user *)(addr & PAGE_MASK); 284062306a36Sopenharmony_ci 284162306a36Sopenharmony_ci /* 284262306a36Sopenharmony_ci * On architectures with software "accessed" bits, we would 284362306a36Sopenharmony_ci * take a double page fault, so mark it accessed here. 
284462306a36Sopenharmony_ci */ 284562306a36Sopenharmony_ci vmf->pte = NULL; 284662306a36Sopenharmony_ci if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { 284762306a36Sopenharmony_ci pte_t entry; 284862306a36Sopenharmony_ci 284962306a36Sopenharmony_ci vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); 285062306a36Sopenharmony_ci if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { 285162306a36Sopenharmony_ci /* 285262306a36Sopenharmony_ci * Other thread has already handled the fault 285362306a36Sopenharmony_ci * and update local tlb only 285462306a36Sopenharmony_ci */ 285562306a36Sopenharmony_ci if (vmf->pte) 285662306a36Sopenharmony_ci update_mmu_tlb(vma, addr, vmf->pte); 285762306a36Sopenharmony_ci ret = -EAGAIN; 285862306a36Sopenharmony_ci goto pte_unlock; 285962306a36Sopenharmony_ci } 286062306a36Sopenharmony_ci 286162306a36Sopenharmony_ci entry = pte_mkyoung(vmf->orig_pte); 286262306a36Sopenharmony_ci if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) 286362306a36Sopenharmony_ci update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1); 286462306a36Sopenharmony_ci } 286562306a36Sopenharmony_ci 286662306a36Sopenharmony_ci /* 286762306a36Sopenharmony_ci * This really shouldn't fail, because the page is there 286862306a36Sopenharmony_ci * in the page tables. But it might just be unreadable, 286962306a36Sopenharmony_ci * in which case we just give up and fill the result with 287062306a36Sopenharmony_ci * zeroes. 287162306a36Sopenharmony_ci */ 287262306a36Sopenharmony_ci if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { 287362306a36Sopenharmony_ci if (vmf->pte) 287462306a36Sopenharmony_ci goto warn; 287562306a36Sopenharmony_ci 287662306a36Sopenharmony_ci /* Re-validate under PTL if the page is still mapped */ 287762306a36Sopenharmony_ci vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); 287862306a36Sopenharmony_ci if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { 287962306a36Sopenharmony_ci /* The PTE changed under us, update local tlb */ 288062306a36Sopenharmony_ci if (vmf->pte) 288162306a36Sopenharmony_ci update_mmu_tlb(vma, addr, vmf->pte); 288262306a36Sopenharmony_ci ret = -EAGAIN; 288362306a36Sopenharmony_ci goto pte_unlock; 288462306a36Sopenharmony_ci } 288562306a36Sopenharmony_ci 288662306a36Sopenharmony_ci /* 288762306a36Sopenharmony_ci * The same page can be mapped back since last copy attempt. 288862306a36Sopenharmony_ci * Try to copy again under PTL. 
288962306a36Sopenharmony_ci */ 289062306a36Sopenharmony_ci if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { 289162306a36Sopenharmony_ci /* 289262306a36Sopenharmony_ci * Give a warn in case there can be some obscure 289362306a36Sopenharmony_ci * use-case 289462306a36Sopenharmony_ci */ 289562306a36Sopenharmony_ciwarn: 289662306a36Sopenharmony_ci WARN_ON_ONCE(1); 289762306a36Sopenharmony_ci clear_page(kaddr); 289862306a36Sopenharmony_ci } 289962306a36Sopenharmony_ci } 290062306a36Sopenharmony_ci 290162306a36Sopenharmony_ci ret = 0; 290262306a36Sopenharmony_ci 290362306a36Sopenharmony_cipte_unlock: 290462306a36Sopenharmony_ci if (vmf->pte) 290562306a36Sopenharmony_ci pte_unmap_unlock(vmf->pte, vmf->ptl); 290662306a36Sopenharmony_ci kunmap_atomic(kaddr); 290762306a36Sopenharmony_ci flush_dcache_page(dst); 290862306a36Sopenharmony_ci 290962306a36Sopenharmony_ci return ret; 291062306a36Sopenharmony_ci} 291162306a36Sopenharmony_ci 291262306a36Sopenharmony_cistatic gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) 291362306a36Sopenharmony_ci{ 291462306a36Sopenharmony_ci struct file *vm_file = vma->vm_file; 291562306a36Sopenharmony_ci 291662306a36Sopenharmony_ci if (vm_file) 291762306a36Sopenharmony_ci return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO; 291862306a36Sopenharmony_ci 291962306a36Sopenharmony_ci /* 292062306a36Sopenharmony_ci * Special mappings (e.g. VDSO) do not have any file so fake 292162306a36Sopenharmony_ci * a default GFP_KERNEL for them. 292262306a36Sopenharmony_ci */ 292362306a36Sopenharmony_ci return GFP_KERNEL; 292462306a36Sopenharmony_ci} 292562306a36Sopenharmony_ci 292662306a36Sopenharmony_ci/* 292762306a36Sopenharmony_ci * Notify the address space that the page is about to become writable so that 292862306a36Sopenharmony_ci * it can prohibit this or wait for the page to get into an appropriate state. 292962306a36Sopenharmony_ci * 293062306a36Sopenharmony_ci * We do this without the lock held, so that it can sleep if it needs to. 
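/*
 * Editor's illustrative sketch, not part of the original mm/memory.c:
 * the ->page_mkwrite contract that do_page_mkwrite() below relies on.
 * On success the handler returns VM_FAULT_LOCKED with the folio still
 * locked; VM_FAULT_NOPAGE sends the task back to re-take the fault.
 * my_page_mkwrite and my_reserve_blocks are hypothetical, and a real
 * filesystem would also handle freeze protection (sb_start_pagefault())
 * and timestamps (file_update_time()).
 */
extern int my_reserve_blocks(struct folio *folio);	/* hypothetical */

static vm_fault_t my_page_mkwrite(struct vm_fault *vmf)
{
	struct folio *folio = page_folio(vmf->page);

	folio_lock(folio);
	if (folio->mapping != vmf->vma->vm_file->f_mapping) {
		/* Raced with truncate: drop the folio and re-fault. */
		folio_unlock(folio);
		return VM_FAULT_NOPAGE;
	}

	/* Hypothetical hook: reserve backing storage so writeback cannot fail. */
	if (my_reserve_blocks(folio) < 0) {
		folio_unlock(folio);
		return VM_FAULT_SIGBUS;
	}

	/* Keep the folio locked and tell the caller so. */
	return VM_FAULT_LOCKED;
}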
293162306a36Sopenharmony_ci */ 293262306a36Sopenharmony_cistatic vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio) 293362306a36Sopenharmony_ci{ 293462306a36Sopenharmony_ci vm_fault_t ret; 293562306a36Sopenharmony_ci unsigned int old_flags = vmf->flags; 293662306a36Sopenharmony_ci 293762306a36Sopenharmony_ci vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; 293862306a36Sopenharmony_ci 293962306a36Sopenharmony_ci if (vmf->vma->vm_file && 294062306a36Sopenharmony_ci IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host)) 294162306a36Sopenharmony_ci return VM_FAULT_SIGBUS; 294262306a36Sopenharmony_ci 294362306a36Sopenharmony_ci ret = vmf->vma->vm_ops->page_mkwrite(vmf); 294462306a36Sopenharmony_ci /* Restore original flags so that caller is not surprised */ 294562306a36Sopenharmony_ci vmf->flags = old_flags; 294662306a36Sopenharmony_ci if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 294762306a36Sopenharmony_ci return ret; 294862306a36Sopenharmony_ci if (unlikely(!(ret & VM_FAULT_LOCKED))) { 294962306a36Sopenharmony_ci folio_lock(folio); 295062306a36Sopenharmony_ci if (!folio->mapping) { 295162306a36Sopenharmony_ci folio_unlock(folio); 295262306a36Sopenharmony_ci return 0; /* retry */ 295362306a36Sopenharmony_ci } 295462306a36Sopenharmony_ci ret |= VM_FAULT_LOCKED; 295562306a36Sopenharmony_ci } else 295662306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 295762306a36Sopenharmony_ci return ret; 295862306a36Sopenharmony_ci} 295962306a36Sopenharmony_ci 296062306a36Sopenharmony_ci/* 296162306a36Sopenharmony_ci * Handle dirtying of a page in shared file mapping on a write fault. 296262306a36Sopenharmony_ci * 296362306a36Sopenharmony_ci * The function expects the page to be locked and unlocks it. 296462306a36Sopenharmony_ci */ 296562306a36Sopenharmony_cistatic vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) 296662306a36Sopenharmony_ci{ 296762306a36Sopenharmony_ci struct vm_area_struct *vma = vmf->vma; 296862306a36Sopenharmony_ci struct address_space *mapping; 296962306a36Sopenharmony_ci struct folio *folio = page_folio(vmf->page); 297062306a36Sopenharmony_ci bool dirtied; 297162306a36Sopenharmony_ci bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; 297262306a36Sopenharmony_ci 297362306a36Sopenharmony_ci dirtied = folio_mark_dirty(folio); 297462306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_test_anon(folio), folio); 297562306a36Sopenharmony_ci /* 297662306a36Sopenharmony_ci * Take a local copy of the address_space - folio.mapping may be zeroed 297762306a36Sopenharmony_ci * by truncate after folio_unlock(). The address_space itself remains 297862306a36Sopenharmony_ci * pinned by vma->vm_file's reference. We rely on folio_unlock()'s 297962306a36Sopenharmony_ci * release semantics to prevent the compiler from undoing this copying. 298062306a36Sopenharmony_ci */ 298162306a36Sopenharmony_ci mapping = folio_raw_mapping(folio); 298262306a36Sopenharmony_ci folio_unlock(folio); 298362306a36Sopenharmony_ci 298462306a36Sopenharmony_ci if (!page_mkwrite) 298562306a36Sopenharmony_ci file_update_time(vma->vm_file); 298662306a36Sopenharmony_ci 298762306a36Sopenharmony_ci /* 298862306a36Sopenharmony_ci * Throttle page dirtying rate down to writeback speed. 
298962306a36Sopenharmony_ci * 299062306a36Sopenharmony_ci * mapping may be NULL here because some device drivers do not 299162306a36Sopenharmony_ci * set page.mapping but still dirty their pages 299262306a36Sopenharmony_ci * 299362306a36Sopenharmony_ci * Drop the mmap_lock before waiting on IO, if we can. The file 299462306a36Sopenharmony_ci * is pinning the mapping, as per above. 299562306a36Sopenharmony_ci */ 299662306a36Sopenharmony_ci if ((dirtied || page_mkwrite) && mapping) { 299762306a36Sopenharmony_ci struct file *fpin; 299862306a36Sopenharmony_ci 299962306a36Sopenharmony_ci fpin = maybe_unlock_mmap_for_io(vmf, NULL); 300062306a36Sopenharmony_ci balance_dirty_pages_ratelimited(mapping); 300162306a36Sopenharmony_ci if (fpin) { 300262306a36Sopenharmony_ci fput(fpin); 300362306a36Sopenharmony_ci return VM_FAULT_COMPLETED; 300462306a36Sopenharmony_ci } 300562306a36Sopenharmony_ci } 300662306a36Sopenharmony_ci 300762306a36Sopenharmony_ci return 0; 300862306a36Sopenharmony_ci} 300962306a36Sopenharmony_ci 301062306a36Sopenharmony_ci/* 301162306a36Sopenharmony_ci * Handle write page faults for pages that can be reused in the current vma 301262306a36Sopenharmony_ci * 301362306a36Sopenharmony_ci * This can happen either due to the mapping being with the VM_SHARED flag, 301462306a36Sopenharmony_ci * or due to us being the last reference standing to the page. In either 301562306a36Sopenharmony_ci * case, all we need to do here is to mark the page as writable and update 301662306a36Sopenharmony_ci * any related book-keeping. 301762306a36Sopenharmony_ci */ 301862306a36Sopenharmony_cistatic inline void wp_page_reuse(struct vm_fault *vmf) 301962306a36Sopenharmony_ci __releases(vmf->ptl) 302062306a36Sopenharmony_ci{ 302162306a36Sopenharmony_ci struct vm_area_struct *vma = vmf->vma; 302262306a36Sopenharmony_ci struct page *page = vmf->page; 302362306a36Sopenharmony_ci pte_t entry; 302462306a36Sopenharmony_ci 302562306a36Sopenharmony_ci VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); 302662306a36Sopenharmony_ci VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page)); 302762306a36Sopenharmony_ci 302862306a36Sopenharmony_ci /* 302962306a36Sopenharmony_ci * Clear the pages cpupid information as the existing 303062306a36Sopenharmony_ci * information potentially belongs to a now completely 303162306a36Sopenharmony_ci * unrelated process. 303262306a36Sopenharmony_ci */ 303362306a36Sopenharmony_ci if (page) 303462306a36Sopenharmony_ci page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); 303562306a36Sopenharmony_ci 303662306a36Sopenharmony_ci flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); 303762306a36Sopenharmony_ci entry = pte_mkyoung(vmf->orig_pte); 303862306a36Sopenharmony_ci entry = maybe_mkwrite(pte_mkdirty(entry), vma); 303962306a36Sopenharmony_ci if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) 304062306a36Sopenharmony_ci update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); 304162306a36Sopenharmony_ci pte_unmap_unlock(vmf->pte, vmf->ptl); 304262306a36Sopenharmony_ci count_vm_event(PGREUSE); 304362306a36Sopenharmony_ci} 304462306a36Sopenharmony_ci 304562306a36Sopenharmony_ci/* 304662306a36Sopenharmony_ci * Handle the case of a page which we actually need to copy to a new page, 304762306a36Sopenharmony_ci * either due to COW or unsharing. 304862306a36Sopenharmony_ci * 304962306a36Sopenharmony_ci * Called with mmap_lock locked and the old page referenced, but 305062306a36Sopenharmony_ci * without the ptl held. 
305162306a36Sopenharmony_ci * 305262306a36Sopenharmony_ci * High level logic flow: 305362306a36Sopenharmony_ci * 305462306a36Sopenharmony_ci * - Allocate a page, copy the content of the old page to the new one. 305562306a36Sopenharmony_ci * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc. 305662306a36Sopenharmony_ci * - Take the PTL. If the pte changed, bail out and release the allocated page 305762306a36Sopenharmony_ci * - If the pte is still the way we remember it, update the page table and all 305862306a36Sopenharmony_ci * relevant references. This includes dropping the reference the page-table 305962306a36Sopenharmony_ci * held to the old page, as well as updating the rmap. 306062306a36Sopenharmony_ci * - In any case, unlock the PTL and drop the reference we took to the old page. 306162306a36Sopenharmony_ci */ 306262306a36Sopenharmony_cistatic vm_fault_t wp_page_copy(struct vm_fault *vmf) 306362306a36Sopenharmony_ci{ 306462306a36Sopenharmony_ci const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; 306562306a36Sopenharmony_ci struct vm_area_struct *vma = vmf->vma; 306662306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 306762306a36Sopenharmony_ci struct folio *old_folio = NULL; 306862306a36Sopenharmony_ci struct folio *new_folio = NULL; 306962306a36Sopenharmony_ci pte_t entry; 307062306a36Sopenharmony_ci int page_copied = 0; 307162306a36Sopenharmony_ci struct mmu_notifier_range range; 307262306a36Sopenharmony_ci int ret; 307362306a36Sopenharmony_ci 307462306a36Sopenharmony_ci delayacct_wpcopy_start(); 307562306a36Sopenharmony_ci 307662306a36Sopenharmony_ci if (vmf->page) 307762306a36Sopenharmony_ci old_folio = page_folio(vmf->page); 307862306a36Sopenharmony_ci if (unlikely(anon_vma_prepare(vma))) 307962306a36Sopenharmony_ci goto oom; 308062306a36Sopenharmony_ci 308162306a36Sopenharmony_ci if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { 308262306a36Sopenharmony_ci new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); 308362306a36Sopenharmony_ci if (!new_folio) 308462306a36Sopenharmony_ci goto oom; 308562306a36Sopenharmony_ci } else { 308662306a36Sopenharmony_ci new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, 308762306a36Sopenharmony_ci vmf->address, false); 308862306a36Sopenharmony_ci if (!new_folio) 308962306a36Sopenharmony_ci goto oom; 309062306a36Sopenharmony_ci 309162306a36Sopenharmony_ci ret = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); 309262306a36Sopenharmony_ci if (ret) { 309362306a36Sopenharmony_ci /* 309462306a36Sopenharmony_ci * COW failed, if the fault was solved by other, 309562306a36Sopenharmony_ci * it's fine. If not, userspace would re-fault on 309662306a36Sopenharmony_ci * the same address and we will handle the fault 309762306a36Sopenharmony_ci * from the second attempt. 309862306a36Sopenharmony_ci * The -EHWPOISON case will not be retried. 309962306a36Sopenharmony_ci */ 310062306a36Sopenharmony_ci folio_put(new_folio); 310162306a36Sopenharmony_ci if (old_folio) 310262306a36Sopenharmony_ci folio_put(old_folio); 310362306a36Sopenharmony_ci 310462306a36Sopenharmony_ci delayacct_wpcopy_end(); 310562306a36Sopenharmony_ci return ret == -EHWPOISON ? 
VM_FAULT_HWPOISON : 0; 310662306a36Sopenharmony_ci } 310762306a36Sopenharmony_ci kmsan_copy_page_meta(&new_folio->page, vmf->page); 310862306a36Sopenharmony_ci } 310962306a36Sopenharmony_ci 311062306a36Sopenharmony_ci if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL)) 311162306a36Sopenharmony_ci goto oom_free_new; 311262306a36Sopenharmony_ci folio_throttle_swaprate(new_folio, GFP_KERNEL); 311362306a36Sopenharmony_ci 311462306a36Sopenharmony_ci __folio_mark_uptodate(new_folio); 311562306a36Sopenharmony_ci 311662306a36Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 311762306a36Sopenharmony_ci vmf->address & PAGE_MASK, 311862306a36Sopenharmony_ci (vmf->address & PAGE_MASK) + PAGE_SIZE); 311962306a36Sopenharmony_ci mmu_notifier_invalidate_range_start(&range); 312062306a36Sopenharmony_ci 312162306a36Sopenharmony_ci /* 312262306a36Sopenharmony_ci * Re-check the pte - we dropped the lock 312362306a36Sopenharmony_ci */ 312462306a36Sopenharmony_ci vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); 312562306a36Sopenharmony_ci if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { 312662306a36Sopenharmony_ci if (old_folio) { 312762306a36Sopenharmony_ci if (!folio_test_anon(old_folio)) { 312862306a36Sopenharmony_ci dec_mm_counter(mm, mm_counter_file(&old_folio->page)); 312962306a36Sopenharmony_ci inc_mm_counter(mm, MM_ANONPAGES); 313062306a36Sopenharmony_ci } 313162306a36Sopenharmony_ci } else { 313262306a36Sopenharmony_ci ksm_might_unmap_zero_page(mm, vmf->orig_pte); 313362306a36Sopenharmony_ci inc_mm_counter(mm, MM_ANONPAGES); 313462306a36Sopenharmony_ci } 313562306a36Sopenharmony_ci flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); 313662306a36Sopenharmony_ci entry = mk_pte(&new_folio->page, vma->vm_page_prot); 313762306a36Sopenharmony_ci entry = pte_sw_mkyoung(entry); 313862306a36Sopenharmony_ci if (unlikely(unshare)) { 313962306a36Sopenharmony_ci if (pte_soft_dirty(vmf->orig_pte)) 314062306a36Sopenharmony_ci entry = pte_mksoft_dirty(entry); 314162306a36Sopenharmony_ci if (pte_uffd_wp(vmf->orig_pte)) 314262306a36Sopenharmony_ci entry = pte_mkuffd_wp(entry); 314362306a36Sopenharmony_ci } else { 314462306a36Sopenharmony_ci entry = maybe_mkwrite(pte_mkdirty(entry), vma); 314562306a36Sopenharmony_ci } 314662306a36Sopenharmony_ci 314762306a36Sopenharmony_ci /* 314862306a36Sopenharmony_ci * Clear the pte entry and flush it first, before updating the 314962306a36Sopenharmony_ci * pte with the new entry, to keep TLBs on different CPUs in 315062306a36Sopenharmony_ci * sync. This code used to set the new PTE then flush TLBs, but 315162306a36Sopenharmony_ci * that left a window where the new PTE could be loaded into 315262306a36Sopenharmony_ci * some TLBs while the old PTE remains in others. 
315362306a36Sopenharmony_ci */ 315462306a36Sopenharmony_ci ptep_clear_flush(vma, vmf->address, vmf->pte); 315562306a36Sopenharmony_ci folio_add_new_anon_rmap(new_folio, vma, vmf->address); 315662306a36Sopenharmony_ci#ifdef CONFIG_MEM_PURGEABLE 315762306a36Sopenharmony_ci if (vma->vm_flags & VM_PURGEABLE) { 315862306a36Sopenharmony_ci pr_info("set wp new folio %lx purgeable\n", folio_pfn(new_folio)); 315962306a36Sopenharmony_ci folio_set_purgeable(new_folio); 316062306a36Sopenharmony_ci uxpte_set_present(vma, vmf->address); 316162306a36Sopenharmony_ci } 316262306a36Sopenharmony_ci#endif 316362306a36Sopenharmony_ci folio_add_lru_vma(new_folio, vma); 316462306a36Sopenharmony_ci /* 316562306a36Sopenharmony_ci * We call the notify macro here because, when using secondary 316662306a36Sopenharmony_ci * mmu page tables (such as kvm shadow page tables), we want the 316762306a36Sopenharmony_ci * new page to be mapped directly into the secondary page table. 316862306a36Sopenharmony_ci */ 316962306a36Sopenharmony_ci BUG_ON(unshare && pte_write(entry)); 317062306a36Sopenharmony_ci set_pte_at_notify(mm, vmf->address, vmf->pte, entry); 317162306a36Sopenharmony_ci update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); 317262306a36Sopenharmony_ci if (old_folio) { 317362306a36Sopenharmony_ci /* 317462306a36Sopenharmony_ci * Only after switching the pte to the new page may 317562306a36Sopenharmony_ci * we remove the mapcount here. Otherwise another 317662306a36Sopenharmony_ci * process may come and find the rmap count decremented 317762306a36Sopenharmony_ci * before the pte is switched to the new page, and 317862306a36Sopenharmony_ci * "reuse" the old page writing into it while our pte 317962306a36Sopenharmony_ci * here still points into it and can be read by other 318062306a36Sopenharmony_ci * threads. 318162306a36Sopenharmony_ci * 318262306a36Sopenharmony_ci * The critical issue is to order this 318362306a36Sopenharmony_ci * page_remove_rmap with the ptp_clear_flush above. 318462306a36Sopenharmony_ci * Those stores are ordered by (if nothing else,) 318562306a36Sopenharmony_ci * the barrier present in the atomic_add_negative 318662306a36Sopenharmony_ci * in page_remove_rmap. 318762306a36Sopenharmony_ci * 318862306a36Sopenharmony_ci * Then the TLB flush in ptep_clear_flush ensures that 318962306a36Sopenharmony_ci * no process can access the old page before the 319062306a36Sopenharmony_ci * decremented mapcount is visible. And the old page 319162306a36Sopenharmony_ci * cannot be reused until after the decremented 319262306a36Sopenharmony_ci * mapcount is visible. So transitively, TLBs to 319362306a36Sopenharmony_ci * old page will be flushed before it can be reused. 319462306a36Sopenharmony_ci */ 319562306a36Sopenharmony_ci page_remove_rmap(vmf->page, vma, false); 319662306a36Sopenharmony_ci } 319762306a36Sopenharmony_ci 319862306a36Sopenharmony_ci /* Free the old page.. 
		new_folio = old_folio;
		page_copied = 1;
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	} else if (vmf->pte) {
		update_mmu_tlb(vma, vmf->address, vmf->pte);
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	}

	mmu_notifier_invalidate_range_end(&range);

	if (new_folio)
		folio_put(new_folio);
	if (old_folio) {
		if (page_copied)
			free_swap_cache(&old_folio->page);
		folio_put(old_folio);
	}

	delayacct_wpcopy_end();
	return 0;
oom_free_new:
	folio_put(new_folio);
oom:
	if (old_folio)
		folio_put(old_folio);

	delayacct_wpcopy_end();
	return VM_FAULT_OOM;
}
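
/*
 * Illustrative sketch (not compiled): the ordering wp_page_copy() relies on
 * when it replaces a mapped PTE, as discussed in the comments above.  The
 * old translation is cleared and flushed from all TLBs *before* the new
 * entry becomes visible, so no CPU can keep writing through a stale
 * translation of the old page.  This is a simplified skeleton, assuming the
 * PTL is held and the PTE has already been re-validated; it is not a helper
 * used by this file.
 */
#if 0
static void example_replace_pte(struct vm_area_struct *vma,
				unsigned long addr, pte_t *ptep, pte_t newpte)
{
	/* 1. Remove the old entry and shoot it down from every TLB. */
	ptep_clear_flush(vma, addr, ptep);
	/* 2. Only now publish the new entry (notifying secondary MMUs). */
	set_pte_at_notify(vma->vm_mm, addr, ptep, newpte);
	/* 3. Let the architecture update its MMU caches for this address. */
	update_mmu_cache(vma, addr, ptep);
}
#endif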

/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *			  writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
 * It handles locking of PTE and modifying it.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 *
 * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
 * we acquired PTE lock.
 */
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);
	if (!vmf->pte)
		return VM_FAULT_NOPAGE;
	/*
	 * We might have raced with another page fault while we released the
	 * pte_offset_map_lock.
	 */
	if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return VM_FAULT_NOPAGE;
	}
	wp_page_reuse(vmf);
	return 0;
}

/*
 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
 * mapping
 */
static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
		vm_fault_t ret;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
			vma_end_read(vmf->vma);
			return VM_FAULT_RETRY;
		}

		vmf->flags |= FAULT_FLAG_MKWRITE;
		ret = vma->vm_ops->pfn_mkwrite(vmf);
		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
			return ret;
		return finish_mkwrite_fault(vmf);
	}
	wp_page_reuse(vmf);
	return 0;
}

static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret = 0;

	folio_get(folio);

	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
		vm_fault_t tmp;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
			folio_put(folio);
			vma_end_read(vmf->vma);
			return VM_FAULT_RETRY;
		}

		tmp = do_page_mkwrite(vmf, folio);
		if (unlikely(!tmp || (tmp &
				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			folio_put(folio);
			return tmp;
		}
		tmp = finish_mkwrite_fault(vmf);
		if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
			folio_unlock(folio);
			folio_put(folio);
			return tmp;
		}
	} else {
		wp_page_reuse(vmf);
		folio_lock(folio);
	}
	ret |= fault_dirty_shared_page(vmf);
	folio_put(folio);

	return ret;
}
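
/*
 * Illustrative sketch (not compiled): how a filesystem or driver typically
 * plugs into the shared-mapping write-fault path handled by wp_page_shared()
 * above.  ->page_mkwrite() runs before the PTE is made writable so the owner
 * can start dirty tracking, reserve space, etc.  The "example_" names are
 * hypothetical; the callback and return-value conventions are the ones
 * consumed by do_page_mkwrite() and finish_mkwrite_fault().
 */
#if 0
static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
{
	struct folio *folio = page_folio(vmf->page);

	folio_lock(folio);
	/* ... reserve blocks / begin dirty tracking for this folio ... */
	folio_mark_dirty(folio);
	/* VM_FAULT_LOCKED tells the caller the folio is returned locked. */
	return VM_FAULT_LOCKED;
}

static const struct vm_operations_struct example_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= example_page_mkwrite,
};
#endif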

/*
 * This routine handles present pages, when
 * * users try to write to a shared page (FAULT_FLAG_WRITE)
 * * GUP wants to take a R/O pin on a possibly shared anonymous page
 *   (FAULT_FLAG_UNSHARE)
 *
 * It is done by copying the page to a new address and decrementing the
 * shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
 * done any necessary COW.
 *
 * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
 * though the page will change only once the write actually happens. This
 * avoids a few races, and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio = NULL;

	if (likely(!unshare)) {
		if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			return handle_userfault(vmf, VM_UFFD_WP);
		}

		/*
		 * Userfaultfd write-protect can defer flushes. Ensure the TLB
		 * is flushed in this case before copying.
		 */
		if (unlikely(userfaultfd_wp(vmf->vma) &&
			     mm_tlb_flush_pending(vmf->vma->vm_mm)))
			flush_tlb_page(vmf->vma, vmf->address);
	}

	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);

	if (vmf->page)
		folio = page_folio(vmf->page);

	/*
	 * Shared mapping: we are guaranteed to have VM_WRITE and
	 * FAULT_FLAG_WRITE set at this point.
	 */
	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
		 * VM_PFNMAP VMA.
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
		if (!vmf->page)
			return wp_pfn_shared(vmf);
		return wp_page_shared(vmf, folio);
	}

	/*
	 * Private mapping: create an exclusive anonymous page copy if reuse
	 * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
	 */
	if (folio && folio_test_anon(folio)) {
		/*
		 * If the page is exclusive to this process we must reuse the
		 * page without further checks.
		 */
		if (PageAnonExclusive(vmf->page))
			goto reuse;

		/*
		 * We have to verify under folio lock: these early checks are
		 * just an optimization to avoid locking the folio and freeing
		 * the swapcache if there is little hope that we can reuse.
		 *
		 * KSM doesn't necessarily raise the folio refcount.
		 */
		if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
			goto copy;
		if (!folio_test_lru(folio))
			/*
			 * We cannot easily detect+handle references from
			 * remote LRU caches or references to LRU folios.
			 */
			lru_add_drain();
		if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
			goto copy;
		if (!folio_trylock(folio))
			goto copy;
		if (folio_test_swapcache(folio))
			folio_free_swap(folio);
		if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
			folio_unlock(folio);
			goto copy;
		}
		/*
		 * Ok, we've got the only folio reference from our mapping
		 * and the folio is locked, it's dark out, and we're wearing
		 * sunglasses. Hit it.
		 */
		page_move_anon_rmap(vmf->page, vma);
		folio_unlock(folio);
reuse:
		if (unlikely(unshare)) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			return 0;
		}
		wp_page_reuse(vmf);
		return 0;
	}
copy:
	if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		vma_end_read(vmf->vma);
		return VM_FAULT_RETRY;
	}

	/*
	 * Ok, we need to copy. Oh, well..
	 */
	if (folio)
		folio_get(folio);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
#ifdef CONFIG_KSM
	if (folio && folio_test_ksm(folio))
		count_vm_event(COW_KSM);
#endif
	return wp_page_copy(vmf);
}

static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}

static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
					    pgoff_t first_index,
					    pgoff_t last_index,
					    struct zap_details *details)
{
	struct vm_area_struct *vma;
	pgoff_t vba, vea, zba, zea;

	vma_interval_tree_foreach(vma, root, first_index, last_index) {
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma) - 1;
		zba = max(first_index, vba);
		zea = min(last_index, vea);

		unmap_mapping_range_vma(vma,
			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
				details);
	}
}
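
/*
 * Illustrative sketch (not compiled): the index-to-address arithmetic used
 * by unmap_mapping_range_tree() above.  A VMA maps file page indices
 * [vm_pgoff, vm_pgoff + vma_pages()) at [vm_start, vm_end), so a file index
 * 'idx' inside that window lives at vm_start + ((idx - vm_pgoff) << PAGE_SHIFT).
 * With hypothetical numbers: vm_pgoff = 16 and a zap of indices [20, 24)
 * translates to [vm_start + 4 pages, vm_start + 8 pages).
 */
#if 0
static unsigned long example_index_to_address(struct vm_area_struct *vma,
					      pgoff_t idx)
{
	/* Caller guarantees vm_pgoff <= idx < vm_pgoff + vma_pages(vma). */
	return vma->vm_start + ((idx - vma->vm_pgoff) << PAGE_SHIFT);
}
#endif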

/**
 * unmap_mapping_folio() - Unmap single folio from processes.
 * @folio: The locked folio to be unmapped.
 *
 * Unmap this folio from any userspace process which still has it mmaped.
 * Typically, for efficiency, the range of nearby pages has already been
 * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
 * truncation or invalidation holds the lock on a folio, it may find that
 * the page has been remapped again: it then uses unmap_mapping_folio()
 * to finally unmap it.
 */
void unmap_mapping_folio(struct folio *folio)
{
	struct address_space *mapping = folio->mapping;
	struct zap_details details = { };
	pgoff_t first_index;
	pgoff_t last_index;

	VM_BUG_ON(!folio_test_locked(folio));

	first_index = folio->index;
	last_index = folio_next_index(folio) - 1;

	details.even_cows = false;
	details.single_folio = folio;
	details.zap_flags = ZAP_FLAG_DROP_MARKER;

	i_mmap_lock_read(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
					 last_index, &details);
	i_mmap_unlock_read(mapping);
}

/**
 * unmap_mapping_pages() - Unmap pages from processes.
 * @mapping: The address space containing pages to be unmapped.
 * @start: Index of first page to be unmapped.
 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
 * @even_cows: Whether to unmap even private COWed pages.
 *
 * Unmap the pages in this address space from any userspace process which
 * has them mmaped.  Generally, you want to remove COWed pages as well when
 * a file is being truncated, but not when invalidating pages from the page
 * cache.
 */
void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
		pgoff_t nr, bool even_cows)
{
	struct zap_details details = { };
	pgoff_t first_index = start;
	pgoff_t last_index = start + nr - 1;

	details.even_cows = even_cows;
	if (last_index < first_index)
		last_index = ULONG_MAX;

	i_mmap_lock_read(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
					 last_index, &details);
	i_mmap_unlock_read(mapping);
}
EXPORT_SYMBOL_GPL(unmap_mapping_pages);
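
/*
 * Illustrative sketch (not compiled): a typical caller of the helpers here.
 * A filesystem punching out the byte range [start, start + len) first tears
 * down every user mapping of that range (even_cows = 1, the file contents
 * are going away for private mappings too) and then drops the pagecache.
 * The function name is hypothetical; unmap_mapping_range() and its rounding
 * rules, documented below, are the real interface.
 */
#if 0
static void example_punch_hole(struct inode *inode, loff_t start, loff_t len)
{
	/* holebegin is rounded down, holelen rounded up to page boundaries. */
	unmap_mapping_range(inode->i_mapping, start, len, 1);
	truncate_inode_pages_range(inode->i_mapping, start, start + len - 1);
}
#endif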

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen, int even_cows)
{
	pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
	pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT;

	/* Check for overflow. */
	if (sizeof(holelen) > sizeof(hlen)) {
		long long holeend =
			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (holeend & ~(long long)ULONG_MAX)
			hlen = ULONG_MAX - hba + 1;
	}

	unmap_mapping_pages(mapping, hba, hlen, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);

/*
 * Restore a potential device exclusive pte to a working pte entry
 */
static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
{
	struct folio *folio = page_folio(vmf->page);
	struct vm_area_struct *vma = vmf->vma;
	struct mmu_notifier_range range;
	vm_fault_t ret;

	/*
	 * We need a reference to lock the folio because we don't hold
	 * the PTL so a racing thread can remove the device-exclusive
	 * entry and unmap it. If the folio is free the entry must
	 * have been removed already. If it happens to have already
	 * been re-allocated after being freed all we do is lock and
	 * unlock it.
	 */
	if (!folio_try_get(folio))
		return 0;

	ret = folio_lock_or_retry(folio, vmf);
	if (ret) {
		folio_put(folio);
		return ret;
	}
	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
				vma->vm_mm, vmf->address & PAGE_MASK,
				(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
	mmu_notifier_invalidate_range_start(&range);

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
				&vmf->ptl);
	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
		restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);

	if (vmf->pte)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	folio_unlock(folio);
	folio_put(folio);

	mmu_notifier_invalidate_range_end(&range);
	return 0;
}
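
/*
 * Illustrative sketch (not compiled): the mmu_notifier bracket used by
 * remove_device_exclusive_entry() above, reduced to its skeleton.  Any code
 * that rewrites PTEs must announce the affected range to secondary MMUs
 * (KVM, device drivers, ...) before the change and complete the invalidation
 * afterwards.  Minimal pattern only; the PTE rewrite itself is elided.
 */
#if 0
static void example_update_one_pte(struct vm_area_struct *vma,
				   unsigned long addr)
{
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				addr & PAGE_MASK,
				(addr & PAGE_MASK) + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	/* ... take the PTL, re-validate and rewrite the PTE here ... */

	mmu_notifier_invalidate_range_end(&range);
}
#endif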

static inline bool should_try_to_free_swap(struct folio *folio,
					   struct vm_area_struct *vma,
					   unsigned int fault_flags)
{
	if (!folio_test_swapcache(folio))
		return false;
	if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) ||
	    folio_test_mlocked(folio))
		return true;
	/*
	 * If we want to map a page that's in the swapcache writable, we
	 * have to detect via the refcount if we're really the exclusive
	 * user. Try freeing the swapcache to get rid of the swapcache
	 * reference only in case it's likely that we'll be the exclusive user.
	 */
	return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
		folio_ref_count(folio) == 2;
}

static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
{
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
				       vmf->address, &vmf->ptl);
	if (!vmf->pte)
		return 0;
	/*
	 * Be careful so that we will only recover a special uffd-wp pte into a
	 * none pte.  Otherwise it means the pte could have changed, so retry.
	 *
	 * This should also cover the case where e.g. the pte changed
	 * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
	 * So is_pte_marker() check is not enough to safely drop the pte.
	 */
	if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
		pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;
}

static vm_fault_t do_pte_missing(struct vm_fault *vmf)
{
	if (vma_is_anonymous(vmf->vma))
		return do_anonymous_page(vmf);
	else
		return do_fault(vmf);
}

/*
 * This is actually a page-missing access, but with uffd-wp special pte
 * installed.  It means this pte was wr-protected before being unmapped.
 */
static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
{
	/*
	 * Just in case there're leftover special ptes even after the region
	 * got unregistered - we can simply clear them.
	 */
	if (unlikely(!userfaultfd_wp(vmf->vma)))
		return pte_marker_clear(vmf);

	return do_pte_missing(vmf);
}

static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
{
	swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
	unsigned long marker = pte_marker_get(entry);

	/*
	 * PTE markers should never be empty.  If anything weird happened,
	 * the best thing to do is to kill the process along with its mm.
	 */
	if (WARN_ON_ONCE(!marker))
		return VM_FAULT_SIGBUS;

	/* Higher priority than uffd-wp when data corrupted */
	if (marker & PTE_MARKER_POISONED)
		return VM_FAULT_HWPOISON;

	if (pte_marker_entry_uffd_wp(entry))
		return pte_marker_handle_uffd_wp(vmf);

	/* This is an unknown pte marker */
	return VM_FAULT_SIGBUS;
}
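
/*
 * Illustrative sketch (not compiled): how the marker entries dispatched by
 * handle_pte_marker() above are built and recognised elsewhere.  A marker is
 * a non-present swap-style entry whose "offset" is a bitmask of PTE_MARKER_*
 * flags; make_pte_marker(), is_pte_marker_entry() and pte_marker_get() are
 * the real helpers, while the function names below are hypothetical.
 */
#if 0
static pte_t example_make_uffd_wp_marker(void)
{
	/* A marker-only PTE remembering that this page was write-protected. */
	return make_pte_marker(PTE_MARKER_UFFD_WP);
}

static bool example_is_poison_marker(pte_t pte)
{
	/* Only meaningful for a non-present pte. */
	swp_entry_t entry = pte_to_swp_entry(pte);

	return is_pte_marker_entry(entry) &&
	       (pte_marker_get(entry) & PTE_MARKER_POISONED);
}
#endif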

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_lock locked or unlocked in the same cases
 * as does filemap_fault().
 */
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *swapcache, *folio = NULL;
	struct page *page;
	struct swap_info_struct *si = NULL;
	rmap_t rmap_flags = RMAP_NONE;
	bool need_clear_cache = false;
	bool exclusive = false;
	swp_entry_t entry;
	pte_t pte;
	vm_fault_t ret = 0;
	void *shadow = NULL;

	if (!pte_unmap_same(vmf))
		goto out;

	entry = pte_to_swp_entry(vmf->orig_pte);
	if (unlikely(non_swap_entry(entry))) {
		if (is_migration_entry(entry)) {
			migration_entry_wait(vma->vm_mm, vmf->pmd,
					     vmf->address);
		} else if (is_device_exclusive_entry(entry)) {
			vmf->page = pfn_swap_entry_to_page(entry);
			ret = remove_device_exclusive_entry(vmf);
		} else if (is_device_private_entry(entry)) {
			if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
				/*
				 * migrate_to_ram is not yet ready to operate
				 * under VMA lock.
				 */
				vma_end_read(vma);
				ret = VM_FAULT_RETRY;
				goto out;
			}

			vmf->page = pfn_swap_entry_to_page(entry);
			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
					vmf->address, &vmf->ptl);
			if (unlikely(!vmf->pte ||
				     !pte_same(ptep_get(vmf->pte),
							vmf->orig_pte)))
				goto unlock;

			/*
			 * Get a page reference while we know the page can't be
			 * freed.
			 */
			get_page(vmf->page);
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
			put_page(vmf->page);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else if (is_pte_marker_entry(entry)) {
			ret = handle_pte_marker(vmf);
		} else {
			print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		}
		goto out;
	}

	/* Prevent swapoff from happening to us. */
	si = get_swap_device(entry);
	if (unlikely(!si))
		goto out;

	folio = swap_cache_get_folio(entry, vma, vmf->address);
	if (folio)
		page = folio_file_page(folio, swp_offset(entry));
	swapcache = folio;

	if (!folio) {
		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
		    __swap_count(entry) == 1) {
			/*
			 * Prevent parallel swapin from proceeding with
			 * the cache flag. Otherwise, another thread may
			 * finish swapin first, free the entry, and swapout
			 * reusing the same entry. It's undetectable as
			 * pte_same() returns true due to entry reuse.
			 */
			if (swapcache_prepare(entry)) {
				/* Relax a bit to prevent rapid repeated page faults */
				schedule_timeout_uninterruptible(1);
				goto out;
			}
			need_clear_cache = true;

			/* skip swapcache */
			folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
						vma, vmf->address, false);
			page = &folio->page;
			if (folio) {
				__folio_set_locked(folio);
				__folio_set_swapbacked(folio);

				if (mem_cgroup_swapin_charge_folio(folio,
							vma->vm_mm, GFP_KERNEL,
							entry)) {
					ret = VM_FAULT_OOM;
					goto out_page;
				}
				mem_cgroup_swapin_uncharge_swap(entry);

				shadow = get_shadow_from_swap_cache(entry);
				if (shadow)
					workingset_refault(folio, shadow);

				folio_add_lru(folio);

				/* To provide entry to swap_readpage() */
				folio->swap = entry;
				swap_readpage(page, true, NULL);
				folio->private = NULL;
			}
		} else {
			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
						vmf);
			if (page)
				folio = page_folio(page);
			swapcache = folio;
		}

		if (!folio) {
			/*
			 * Back out if somebody else faulted in this pte
			 * while we released the pte lock.
			 */
			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
					vmf->address, &vmf->ptl);
			if (likely(vmf->pte &&
				   pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
				ret = VM_FAULT_OOM;
			goto unlock;
		}

		/* Had to read the page from swap area: Major fault */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
	} else if (PageHWPoison(page)) {
		/*
		 * hwpoisoned dirty swapcache pages are kept for killing
		 * owner processes (which may be unknown at hwpoison time)
		 */
		ret = VM_FAULT_HWPOISON;
		goto out_release;
	}

	ret |= folio_lock_or_retry(folio, vmf);
	if (ret & VM_FAULT_RETRY)
		goto out_release;

	if (swapcache) {
		/*
		 * Make sure folio_free_swap() or swapoff did not release the
		 * swapcache from under us.  The page pin, and pte_same test
		 * below, are not enough to exclude that.  Even if it is still
		 * swapcache, we need to check that the page's swap has not
		 * changed.
		 */
		if (unlikely(!folio_test_swapcache(folio) ||
			     page_swap_entry(page).val != entry.val))
			goto out_page;

		/*
		 * KSM sometimes has to copy on read faults, for example, if
		 * page->index of !PageKSM() pages would be nonlinear inside the
		 * anon VMA -- PageKSM() is lost on actual swapout.
		 */
		page = ksm_might_need_to_copy(page, vma, vmf->address);
		if (unlikely(!page)) {
			ret = VM_FAULT_OOM;
			goto out_page;
		} else if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
			ret = VM_FAULT_HWPOISON;
			goto out_page;
		}
		folio = page_folio(page);

		/*
		 * If we want to map a page that's in the swapcache writable, we
		 * have to detect via the refcount if we're really the exclusive
		 * owner. Try removing the extra reference from the local LRU
		 * caches if required.
		 */
		if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
		    !folio_test_ksm(folio) && !folio_test_lru(folio))
			lru_add_drain();
	}

	folio_throttle_swaprate(folio, GFP_KERNEL);

	/*
	 * Back out if somebody else already faulted in this pte.
	 */
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
		goto out_nomap;

	if (unlikely(!folio_test_uptodate(folio))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/*
	 * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
	 * must never point at an anonymous page in the swapcache that is
	 * PG_anon_exclusive. Sanity check that this holds and especially, that
	 * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
	 * check after taking the PT lock and making sure that nobody
	 * concurrently faulted in this page and set PG_anon_exclusive.
	 */
	BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
	BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));

	/*
	 * Check under PT lock (to protect against concurrent fork() sharing
	 * the swap entry concurrently) for certainly exclusive pages.
	 */
	if (!folio_test_ksm(folio)) {
		exclusive = pte_swp_exclusive(vmf->orig_pte);
		if (folio != swapcache) {
			/*
			 * We have a fresh page that is not exposed to the
			 * swapcache -> certainly exclusive.
			 */
			exclusive = true;
		} else if (exclusive && folio_test_writeback(folio) &&
			  data_race(si->flags & SWP_STABLE_WRITES)) {
			/*
			 * This is tricky: not all swap backends support
			 * concurrent page modifications while under writeback.
			 *
			 * So if we stumble over such a page in the swapcache
			 * we must not set the page exclusive, otherwise we can
			 * map it writable without further checks and modify it
			 * while still under writeback.
			 *
			 * For these problematic swap backends, simply drop the
			 * exclusive marker: this is perfectly fine as we start
			 * writeback only if we fully unmapped the page and
			 * there are no unexpected references on the page after
			 * unmapping succeeded. After fully unmapped, no
			 * further GUP references (FOLL_GET and FOLL_PIN) can
			 * appear, so dropping the exclusive marker and mapping
			 * it only R/O is fine.
			 */
			exclusive = false;
		}
	}

	/*
	 * Some architectures may have to restore extra metadata to the page
	 * when reading from swap. This metadata may be indexed by swap entry
	 * so this must be called before swap_free().
	 */
	arch_swap_restore(entry, folio);

	/*
	 * Remove the swap entry and conditionally try to free up the swapcache.
	 * We're already holding a reference on the page but haven't mapped it
	 * yet.
	 */
	swap_free(entry);
	if (should_try_to_free_swap(folio, vma, vmf->flags))
		folio_free_swap(folio);

	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
	pte = mk_pte(page, vma->vm_page_prot);

	/*
	 * Same logic as in do_wp_page(); however, optimize for pages that are
	 * certainly not shared either because we just allocated them without
	 * exposing them to the swapcache or because the swap entry indicates
	 * exclusivity.
	 */
	if (!folio_test_ksm(folio) &&
	    (exclusive || folio_ref_count(folio) == 1)) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			pte = maybe_mkwrite(pte_mkdirty(pte), vma);
			vmf->flags &= ~FAULT_FLAG_WRITE;
		}
		rmap_flags |= RMAP_EXCLUSIVE;
	}
	flush_icache_page(vma, page);
	if (pte_swp_soft_dirty(vmf->orig_pte))
		pte = pte_mksoft_dirty(pte);
	if (pte_swp_uffd_wp(vmf->orig_pte))
		pte = pte_mkuffd_wp(pte);
	vmf->orig_pte = pte;

	/* ksm created a completely new copy */
	if (unlikely(folio != swapcache && swapcache)) {
		page_add_new_anon_rmap(page, vma, vmf->address);
		folio_add_lru_vma(folio, vma);
	} else {
		page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
	}

	VM_BUG_ON(!folio_test_anon(folio) ||
		  (pte_write(pte) && !PageAnonExclusive(page)));
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);

	folio_unlock(folio);
	if (folio != swapcache && swapcache) {
		/*
		 * Hold the lock to avoid the swap entry being reused
		 * until we take the PT lock for the pte_same() check
		 * (to avoid false positives from pte_same). For
		 * further safety release the lock after the swap_free
		 * so that the swap count won't change under a
		 * parallel locked swapcache.
		 */
		folio_unlock(swapcache);
		folio_put(swapcache);
	}

	if (vmf->flags & FAULT_FLAG_WRITE) {
		ret |= do_wp_page(vmf);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	/* No need to invalidate - it was non-present before */
	update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
unlock:
	if (vmf->pte)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
	/* Clear the swap cache pin for direct swapin after PTL unlock */
	if (need_clear_cache)
		swapcache_clear(si, entry);
	if (si)
		put_swap_device(si);
	return ret;
out_nomap:
	if (vmf->pte)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
	folio_unlock(folio);
out_release:
	folio_put(folio);
	if (folio != swapcache && swapcache) {
		folio_unlock(swapcache);
		folio_put(swapcache);
	}
	if (need_clear_cache)
		swapcache_clear(si, entry);
	if (si)
		put_swap_device(si);
	return ret;
}
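
/*
 * Illustrative sketch (not compiled): the "pin the swap entry" pattern that
 * the SWP_SYNCHRONOUS_IO branch of do_swap_page() above relies on.  Marking
 * the entry with swapcache_prepare() before reading it in keeps a parallel
 * fault on the same entry from completing first and recycling the entry;
 * swapcache_clear() drops the pin once the PTE has been installed (or the
 * attempt abandoned).  Simplified skeleton with error handling elided.
 */
#if 0
static int example_synchronous_swapin(struct swap_info_struct *si,
				      swp_entry_t entry)
{
	if (swapcache_prepare(entry))
		return -EBUSY;	/* somebody else is swapping this entry in */

	/* ... allocate a folio, swap_readpage(), install the PTE ... */

	swapcache_clear(si, entry);
	return 0;
}
#endif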

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	vm_fault_t ret = 0;
	pte_t entry;

	/* File mapping without ->vm_ops ? */
	if (vma->vm_flags & VM_SHARED)
		return VM_FAULT_SIGBUS;

	/*
	 * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
	 * be distinguished from a transient failure of pte_offset_map().
	 */
	if (pte_alloc(vma->vm_mm, vmf->pmd))
		return VM_FAULT_OOM;

#ifdef CONFIG_MEM_PURGEABLE
	/* use extra page table for userexpte */
	if (vma->vm_flags & VM_USEREXPTE) {
		if (do_uxpte_page_fault(vmf, &entry))
			goto oom;
		else
			goto got_page;
	}
#endif
	/* Use the zero-page for reads */
	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm)) {
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
						vma->vm_page_prot));
#ifdef CONFIG_MEM_PURGEABLE
got_page:
#endif
		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
				vmf->address, &vmf->ptl);
		if (!vmf->pte)
			goto unlock;
		if (vmf_pte_changed(vmf)) {
			update_mmu_tlb(vma, vmf->address, vmf->pte);
			goto unlock;
		}
		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock;
		/* Deliver the page fault to userland, check inside PT lock */
		if (userfaultfd_missing(vma)) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			return handle_userfault(vmf, VM_UFFD_MISSING);
		}
		goto setpte;
	}

	/* Allocate our own private page. */
	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
	folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
	if (!folio)
		goto oom;

	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
		goto oom_free_page;
	folio_throttle_swaprate(folio, GFP_KERNEL);

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__folio_mark_uptodate(folio);

	entry = mk_pte(&folio->page, vma->vm_page_prot);
	entry = pte_sw_mkyoung(entry);
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry), vma);

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	if (!vmf->pte)
		goto release;
	if (vmf_pte_changed(vmf)) {
		update_mmu_tlb(vma, vmf->address, vmf->pte);
		goto release;
	}

	ret = check_stable_address_space(vma->vm_mm);
	if (ret)
		goto release;

	/* Deliver the page fault to userland, check inside PT lock */
	if (userfaultfd_missing(vma)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		folio_put(folio);
		return handle_userfault(vmf, VM_UFFD_MISSING);
	}

	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
	folio_add_new_anon_rmap(folio, vma, vmf->address);
#ifdef CONFIG_MEM_PURGEABLE
	if (vma->vm_flags & VM_PURGEABLE)
		folio_set_purgeable(folio);
#endif
	folio_add_lru_vma(folio, vma);
setpte:
#ifdef CONFIG_MEM_PURGEABLE
	if (vma->vm_flags & VM_PURGEABLE)
		uxpte_set_present(vma, vmf->address);
#endif
	if (uffd_wp)
		entry = pte_mkuffd_wp(entry);
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
unlock:
	if (vmf->pte)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;
release:
	folio_put(folio);
	goto unlock;
oom_free_page:
	folio_put(folio);
oom:
	return VM_FAULT_OOM;
}
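
/*
 * Illustrative sketch (not compiled, userspace): what do_anonymous_page()
 * above means for a process.  The first read of an untouched anonymous page
 * is served by mapping the shared zero page read-only; only the first write
 * faults again and allocates a private zeroed page.
 */
#if 0
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	char c = p[0];		/* read fault: zero page mapped, c == 0 */
	p[0] = 1;		/* write fault: private zeroed page allocated */

	printf("%d %d\n", c, p[0]);	/* prints "0 1" */
	return 0;
}
#endif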
423662306a36Sopenharmony_ci */ 423762306a36Sopenharmony_cistatic vm_fault_t __do_fault(struct vm_fault *vmf) 423862306a36Sopenharmony_ci{ 423962306a36Sopenharmony_ci struct vm_area_struct *vma = vmf->vma; 424062306a36Sopenharmony_ci vm_fault_t ret; 424162306a36Sopenharmony_ci 424262306a36Sopenharmony_ci /* 424362306a36Sopenharmony_ci * Preallocate pte before we take page_lock because this might lead to 424462306a36Sopenharmony_ci * deadlocks for memcg reclaim which waits for pages under writeback: 424562306a36Sopenharmony_ci * lock_page(A) 424662306a36Sopenharmony_ci * SetPageWriteback(A) 424762306a36Sopenharmony_ci * unlock_page(A) 424862306a36Sopenharmony_ci * lock_page(B) 424962306a36Sopenharmony_ci * lock_page(B) 425062306a36Sopenharmony_ci * pte_alloc_one 425162306a36Sopenharmony_ci * shrink_page_list 425262306a36Sopenharmony_ci * wait_on_page_writeback(A) 425362306a36Sopenharmony_ci * SetPageWriteback(B) 425462306a36Sopenharmony_ci * unlock_page(B) 425562306a36Sopenharmony_ci * # flush A, B to clear the writeback 425662306a36Sopenharmony_ci */ 425762306a36Sopenharmony_ci if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { 425862306a36Sopenharmony_ci vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); 425962306a36Sopenharmony_ci if (!vmf->prealloc_pte) 426062306a36Sopenharmony_ci return VM_FAULT_OOM; 426162306a36Sopenharmony_ci } 426262306a36Sopenharmony_ci 426362306a36Sopenharmony_ci ret = vma->vm_ops->fault(vmf); 426462306a36Sopenharmony_ci if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | 426562306a36Sopenharmony_ci VM_FAULT_DONE_COW))) 426662306a36Sopenharmony_ci return ret; 426762306a36Sopenharmony_ci 426862306a36Sopenharmony_ci if (unlikely(PageHWPoison(vmf->page))) { 426962306a36Sopenharmony_ci struct page *page = vmf->page; 427062306a36Sopenharmony_ci vm_fault_t poisonret = VM_FAULT_HWPOISON; 427162306a36Sopenharmony_ci if (ret & VM_FAULT_LOCKED) { 427262306a36Sopenharmony_ci if (page_mapped(page)) 427362306a36Sopenharmony_ci unmap_mapping_pages(page_mapping(page), 427462306a36Sopenharmony_ci page->index, 1, false); 427562306a36Sopenharmony_ci /* Retry if a clean page was removed from the cache. */ 427662306a36Sopenharmony_ci if (invalidate_inode_page(page)) 427762306a36Sopenharmony_ci poisonret = VM_FAULT_NOPAGE; 427862306a36Sopenharmony_ci unlock_page(page); 427962306a36Sopenharmony_ci } 428062306a36Sopenharmony_ci put_page(page); 428162306a36Sopenharmony_ci vmf->page = NULL; 428262306a36Sopenharmony_ci return poisonret; 428362306a36Sopenharmony_ci } 428462306a36Sopenharmony_ci 428562306a36Sopenharmony_ci if (unlikely(!(ret & VM_FAULT_LOCKED))) 428662306a36Sopenharmony_ci lock_page(vmf->page); 428762306a36Sopenharmony_ci else 428862306a36Sopenharmony_ci VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page); 428962306a36Sopenharmony_ci 429062306a36Sopenharmony_ci return ret; 429162306a36Sopenharmony_ci} 429262306a36Sopenharmony_ci 429362306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 429462306a36Sopenharmony_cistatic void deposit_prealloc_pte(struct vm_fault *vmf) 429562306a36Sopenharmony_ci{ 429662306a36Sopenharmony_ci struct vm_area_struct *vma = vmf->vma; 429762306a36Sopenharmony_ci 429862306a36Sopenharmony_ci pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); 429962306a36Sopenharmony_ci /* 430062306a36Sopenharmony_ci * We are going to consume the prealloc table, 430162306a36Sopenharmony_ci * count that as nr_ptes. 
430262306a36Sopenharmony_ci	 */
430362306a36Sopenharmony_ci	mm_inc_nr_ptes(vma->vm_mm);
430462306a36Sopenharmony_ci	vmf->prealloc_pte = NULL;
430562306a36Sopenharmony_ci}
430662306a36Sopenharmony_ci
430762306a36Sopenharmony_civm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
430862306a36Sopenharmony_ci{
430962306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
431062306a36Sopenharmony_ci	bool write = vmf->flags & FAULT_FLAG_WRITE;
431162306a36Sopenharmony_ci	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
431262306a36Sopenharmony_ci	pmd_t entry;
431362306a36Sopenharmony_ci	vm_fault_t ret = VM_FAULT_FALLBACK;
431462306a36Sopenharmony_ci
431562306a36Sopenharmony_ci	if (!transhuge_vma_suitable(vma, haddr))
431662306a36Sopenharmony_ci		return ret;
431762306a36Sopenharmony_ci
431862306a36Sopenharmony_ci	page = compound_head(page);
431962306a36Sopenharmony_ci	if (compound_order(page) != HPAGE_PMD_ORDER)
432062306a36Sopenharmony_ci		return ret;
432162306a36Sopenharmony_ci
432262306a36Sopenharmony_ci	/*
432362306a36Sopenharmony_ci	 * Just back off if any subpage of a THP is corrupted; otherwise
432462306a36Sopenharmony_ci	 * the corrupted page may be mapped by the PMD silently and escape
432562306a36Sopenharmony_ci	 * the check.  This kind of THP can only be PTE-mapped.  Access to
432662306a36Sopenharmony_ci	 * the corrupted subpage should trigger SIGBUS as expected.
432762306a36Sopenharmony_ci	 */
432862306a36Sopenharmony_ci	if (unlikely(PageHasHWPoisoned(page)))
432962306a36Sopenharmony_ci		return ret;
433062306a36Sopenharmony_ci
433162306a36Sopenharmony_ci	/*
433262306a36Sopenharmony_ci	 * Archs like ppc64 need additional space to store information
433362306a36Sopenharmony_ci	 * related to pte entry. Use the preallocated table for that.
433462306a36Sopenharmony_ci	 */
433562306a36Sopenharmony_ci	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
433662306a36Sopenharmony_ci		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
433762306a36Sopenharmony_ci		if (!vmf->prealloc_pte)
433862306a36Sopenharmony_ci			return VM_FAULT_OOM;
433962306a36Sopenharmony_ci	}
434062306a36Sopenharmony_ci
434162306a36Sopenharmony_ci	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
434262306a36Sopenharmony_ci	if (unlikely(!pmd_none(*vmf->pmd)))
434362306a36Sopenharmony_ci		goto out;
434462306a36Sopenharmony_ci
434562306a36Sopenharmony_ci	flush_icache_pages(vma, page, HPAGE_PMD_NR);
434662306a36Sopenharmony_ci
434762306a36Sopenharmony_ci	entry = mk_huge_pmd(page, vma->vm_page_prot);
434862306a36Sopenharmony_ci	if (write)
434962306a36Sopenharmony_ci		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
435062306a36Sopenharmony_ci
435162306a36Sopenharmony_ci	add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
435262306a36Sopenharmony_ci	page_add_file_rmap(page, vma, true);
435362306a36Sopenharmony_ci
435462306a36Sopenharmony_ci	/*
435562306a36Sopenharmony_ci	 * deposit and withdraw with pmd lock held
435662306a36Sopenharmony_ci	 */
435762306a36Sopenharmony_ci	if (arch_needs_pgtable_deposit())
435862306a36Sopenharmony_ci		deposit_prealloc_pte(vmf);
435962306a36Sopenharmony_ci
436062306a36Sopenharmony_ci	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
436162306a36Sopenharmony_ci
436262306a36Sopenharmony_ci	update_mmu_cache_pmd(vma, haddr, vmf->pmd);
436362306a36Sopenharmony_ci
436462306a36Sopenharmony_ci	/* fault is handled */
436562306a36Sopenharmony_ci	ret = 0;
436662306a36Sopenharmony_ci	count_vm_event(THP_FILE_MAPPED);
436762306a36Sopenharmony_ciout:
436862306a36Sopenharmony_ci	spin_unlock(vmf->ptl);
436962306a36Sopenharmony_ci	return ret;
437062306a36Sopenharmony_ci}
437162306a36Sopenharmony_ci#else
437262306a36Sopenharmony_civm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
437362306a36Sopenharmony_ci{
437462306a36Sopenharmony_ci	return VM_FAULT_FALLBACK;
437562306a36Sopenharmony_ci}
437662306a36Sopenharmony_ci#endif
437762306a36Sopenharmony_ci
437862306a36Sopenharmony_ci/**
437962306a36Sopenharmony_ci * set_pte_range - Set a range of PTEs to point to pages in a folio.
438062306a36Sopenharmony_ci * @vmf: Fault description.
438162306a36Sopenharmony_ci * @folio: The folio that contains @page.
438262306a36Sopenharmony_ci * @page: The first page to create a PTE for.
438362306a36Sopenharmony_ci * @nr: The number of PTEs to create.
438462306a36Sopenharmony_ci * @addr: The first address to create a PTE for.
438562306a36Sopenharmony_ci */
438662306a36Sopenharmony_civoid set_pte_range(struct vm_fault *vmf, struct folio *folio,
438762306a36Sopenharmony_ci		struct page *page, unsigned int nr, unsigned long addr)
438862306a36Sopenharmony_ci{
438962306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
439062306a36Sopenharmony_ci	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
439162306a36Sopenharmony_ci	bool write = vmf->flags & FAULT_FLAG_WRITE;
439262306a36Sopenharmony_ci	bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE);
439362306a36Sopenharmony_ci	pte_t entry;
439462306a36Sopenharmony_ci
439562306a36Sopenharmony_ci	flush_icache_pages(vma, page, nr);
439662306a36Sopenharmony_ci	entry = mk_pte(page, vma->vm_page_prot);
439762306a36Sopenharmony_ci
439862306a36Sopenharmony_ci	if (prefault && arch_wants_old_prefaulted_pte())
439962306a36Sopenharmony_ci		entry = pte_mkold(entry);
440062306a36Sopenharmony_ci	else
440162306a36Sopenharmony_ci		entry = pte_sw_mkyoung(entry);
440262306a36Sopenharmony_ci
440362306a36Sopenharmony_ci	if (write)
440462306a36Sopenharmony_ci		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
440562306a36Sopenharmony_ci	if (unlikely(uffd_wp))
440662306a36Sopenharmony_ci		entry = pte_mkuffd_wp(entry);
440762306a36Sopenharmony_ci	/* copy-on-write page */
440862306a36Sopenharmony_ci	if (write && !(vma->vm_flags & VM_SHARED)) {
440962306a36Sopenharmony_ci		add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
441062306a36Sopenharmony_ci		VM_BUG_ON_FOLIO(nr != 1, folio);
441162306a36Sopenharmony_ci		folio_add_new_anon_rmap(folio, vma, addr);
441262306a36Sopenharmony_ci		folio_add_lru_vma(folio, vma);
441362306a36Sopenharmony_ci	} else {
441462306a36Sopenharmony_ci		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
441562306a36Sopenharmony_ci		folio_add_file_rmap_range(folio, page, nr, vma, false);
441662306a36Sopenharmony_ci	}
441762306a36Sopenharmony_ci	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
441862306a36Sopenharmony_ci
441962306a36Sopenharmony_ci	/* no need to invalidate: a not-present page won't be cached */
442062306a36Sopenharmony_ci	update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr);
442162306a36Sopenharmony_ci}
442262306a36Sopenharmony_ci
442362306a36Sopenharmony_cistatic bool vmf_pte_changed(struct vm_fault *vmf)
442462306a36Sopenharmony_ci{
442562306a36Sopenharmony_ci	if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
442662306a36Sopenharmony_ci		return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);
442762306a36Sopenharmony_ci
442862306a36Sopenharmony_ci	return !pte_none(ptep_get(vmf->pte));
442962306a36Sopenharmony_ci}
443062306a36Sopenharmony_ci
443162306a36Sopenharmony_ci/**
443262306a36Sopenharmony_ci * finish_fault - finish page fault once we have prepared
the page to fault 443362306a36Sopenharmony_ci * 443462306a36Sopenharmony_ci * @vmf: structure describing the fault 443562306a36Sopenharmony_ci * 443662306a36Sopenharmony_ci * This function handles all that is needed to finish a page fault once the 443762306a36Sopenharmony_ci * page to fault in is prepared. It handles locking of PTEs, inserts PTE for 443862306a36Sopenharmony_ci * given page, adds reverse page mapping, handles memcg charges and LRU 443962306a36Sopenharmony_ci * addition. 444062306a36Sopenharmony_ci * 444162306a36Sopenharmony_ci * The function expects the page to be locked and on success it consumes a 444262306a36Sopenharmony_ci * reference of a page being mapped (for the PTE which maps it). 444362306a36Sopenharmony_ci * 444462306a36Sopenharmony_ci * Return: %0 on success, %VM_FAULT_ code in case of error. 444562306a36Sopenharmony_ci */ 444662306a36Sopenharmony_civm_fault_t finish_fault(struct vm_fault *vmf) 444762306a36Sopenharmony_ci{ 444862306a36Sopenharmony_ci struct vm_area_struct *vma = vmf->vma; 444962306a36Sopenharmony_ci struct page *page; 445062306a36Sopenharmony_ci vm_fault_t ret; 445162306a36Sopenharmony_ci 445262306a36Sopenharmony_ci /* Did we COW the page? */ 445362306a36Sopenharmony_ci if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) 445462306a36Sopenharmony_ci page = vmf->cow_page; 445562306a36Sopenharmony_ci else 445662306a36Sopenharmony_ci page = vmf->page; 445762306a36Sopenharmony_ci 445862306a36Sopenharmony_ci /* 445962306a36Sopenharmony_ci * check even for read faults because we might have lost our CoWed 446062306a36Sopenharmony_ci * page 446162306a36Sopenharmony_ci */ 446262306a36Sopenharmony_ci if (!(vma->vm_flags & VM_SHARED)) { 446362306a36Sopenharmony_ci ret = check_stable_address_space(vma->vm_mm); 446462306a36Sopenharmony_ci if (ret) 446562306a36Sopenharmony_ci return ret; 446662306a36Sopenharmony_ci } 446762306a36Sopenharmony_ci 446862306a36Sopenharmony_ci if (pmd_none(*vmf->pmd)) { 446962306a36Sopenharmony_ci if (PageTransCompound(page)) { 447062306a36Sopenharmony_ci ret = do_set_pmd(vmf, page); 447162306a36Sopenharmony_ci if (ret != VM_FAULT_FALLBACK) 447262306a36Sopenharmony_ci return ret; 447362306a36Sopenharmony_ci } 447462306a36Sopenharmony_ci 447562306a36Sopenharmony_ci if (vmf->prealloc_pte) 447662306a36Sopenharmony_ci pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte); 447762306a36Sopenharmony_ci else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) 447862306a36Sopenharmony_ci return VM_FAULT_OOM; 447962306a36Sopenharmony_ci } 448062306a36Sopenharmony_ci 448162306a36Sopenharmony_ci vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, 448262306a36Sopenharmony_ci vmf->address, &vmf->ptl); 448362306a36Sopenharmony_ci if (!vmf->pte) 448462306a36Sopenharmony_ci return VM_FAULT_NOPAGE; 448562306a36Sopenharmony_ci 448662306a36Sopenharmony_ci /* Re-check under ptl */ 448762306a36Sopenharmony_ci if (likely(!vmf_pte_changed(vmf))) { 448862306a36Sopenharmony_ci struct folio *folio = page_folio(page); 448962306a36Sopenharmony_ci 449062306a36Sopenharmony_ci set_pte_range(vmf, folio, page, 1, vmf->address); 449162306a36Sopenharmony_ci ret = 0; 449262306a36Sopenharmony_ci } else { 449362306a36Sopenharmony_ci update_mmu_tlb(vma, vmf->address, vmf->pte); 449462306a36Sopenharmony_ci ret = VM_FAULT_NOPAGE; 449562306a36Sopenharmony_ci } 449662306a36Sopenharmony_ci 449762306a36Sopenharmony_ci pte_unmap_unlock(vmf->pte, vmf->ptl); 449862306a36Sopenharmony_ci return ret; 449962306a36Sopenharmony_ci} 450062306a36Sopenharmony_ci 
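/*
 * Illustrative call sequence (a sketch added for orientation, not part of
 * the original file): a read fault on a file mapping typically reaches
 * finish_fault() the way do_read_fault() further below does it:
 *
 *	ret = __do_fault(vmf);			// ->fault() returns a locked vmf->page
 *	ret |= finish_fault(vmf);		// map it; consumes one page reference
 *	folio_unlock(page_folio(vmf->page));	// caller drops the page lock
 */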
450162306a36Sopenharmony_cistatic unsigned long fault_around_pages __read_mostly = 450262306a36Sopenharmony_ci 65536 >> PAGE_SHIFT; 450362306a36Sopenharmony_ci 450462306a36Sopenharmony_ci#ifdef CONFIG_DEBUG_FS 450562306a36Sopenharmony_cistatic int fault_around_bytes_get(void *data, u64 *val) 450662306a36Sopenharmony_ci{ 450762306a36Sopenharmony_ci *val = fault_around_pages << PAGE_SHIFT; 450862306a36Sopenharmony_ci return 0; 450962306a36Sopenharmony_ci} 451062306a36Sopenharmony_ci 451162306a36Sopenharmony_ci/* 451262306a36Sopenharmony_ci * fault_around_bytes must be rounded down to the nearest page order as it's 451362306a36Sopenharmony_ci * what do_fault_around() expects to see. 451462306a36Sopenharmony_ci */ 451562306a36Sopenharmony_cistatic int fault_around_bytes_set(void *data, u64 val) 451662306a36Sopenharmony_ci{ 451762306a36Sopenharmony_ci if (val / PAGE_SIZE > PTRS_PER_PTE) 451862306a36Sopenharmony_ci return -EINVAL; 451962306a36Sopenharmony_ci 452062306a36Sopenharmony_ci /* 452162306a36Sopenharmony_ci * The minimum value is 1 page, however this results in no fault-around 452262306a36Sopenharmony_ci * at all. See should_fault_around(). 452362306a36Sopenharmony_ci */ 452462306a36Sopenharmony_ci fault_around_pages = max(rounddown_pow_of_two(val) >> PAGE_SHIFT, 1UL); 452562306a36Sopenharmony_ci 452662306a36Sopenharmony_ci return 0; 452762306a36Sopenharmony_ci} 452862306a36Sopenharmony_ciDEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, 452962306a36Sopenharmony_ci fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); 453062306a36Sopenharmony_ci 453162306a36Sopenharmony_cistatic int __init fault_around_debugfs(void) 453262306a36Sopenharmony_ci{ 453362306a36Sopenharmony_ci debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, 453462306a36Sopenharmony_ci &fault_around_bytes_fops); 453562306a36Sopenharmony_ci return 0; 453662306a36Sopenharmony_ci} 453762306a36Sopenharmony_cilate_initcall(fault_around_debugfs); 453862306a36Sopenharmony_ci#endif 453962306a36Sopenharmony_ci 454062306a36Sopenharmony_ci/* 454162306a36Sopenharmony_ci * do_fault_around() tries to map few pages around the fault address. The hope 454262306a36Sopenharmony_ci * is that the pages will be needed soon and this will lower the number of 454362306a36Sopenharmony_ci * faults to handle. 454462306a36Sopenharmony_ci * 454562306a36Sopenharmony_ci * It uses vm_ops->map_pages() to map the pages, which skips the page if it's 454662306a36Sopenharmony_ci * not ready to be mapped: not up-to-date, locked, etc. 454762306a36Sopenharmony_ci * 454862306a36Sopenharmony_ci * This function doesn't cross VMA or page table boundaries, in order to call 454962306a36Sopenharmony_ci * map_pages() and acquire a PTE lock only once. 455062306a36Sopenharmony_ci * 455162306a36Sopenharmony_ci * fault_around_pages defines how many pages we'll try to map. 455262306a36Sopenharmony_ci * do_fault_around() expects it to be set to a power of two less than or equal 455362306a36Sopenharmony_ci * to PTRS_PER_PTE. 455462306a36Sopenharmony_ci * 455562306a36Sopenharmony_ci * The virtual address of the area that we map is naturally aligned to 455662306a36Sopenharmony_ci * fault_around_pages * PAGE_SIZE rounded down to the machine page size 455762306a36Sopenharmony_ci * (and therefore to page order). This way it's easier to guarantee 455862306a36Sopenharmony_ci * that we don't cross page table boundaries. 
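 *
 * Worked example (illustrative numbers, assuming 4K pages and
 * PTRS_PER_PTE == 512): the default of 65536 bytes above gives
 * fault_around_pages == 16.  For a fault with pte_off == 100,
 * vma_off == 5 and vma_pages() == 8, the clamping below yields
 *	from_pte = max(ALIGN_DOWN(100, 16), 100 - min(100, 5)) = 96
 *	to_pte   = min3(96 + 16, 512, 100 + 8 - 5) - 1 = 102
 * so ->map_pages() is asked to map only the file offsets backing
 * PTE slots 96..102.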
455962306a36Sopenharmony_ci */ 456062306a36Sopenharmony_cistatic vm_fault_t do_fault_around(struct vm_fault *vmf) 456162306a36Sopenharmony_ci{ 456262306a36Sopenharmony_ci pgoff_t nr_pages = READ_ONCE(fault_around_pages); 456362306a36Sopenharmony_ci pgoff_t pte_off = pte_index(vmf->address); 456462306a36Sopenharmony_ci /* The page offset of vmf->address within the VMA. */ 456562306a36Sopenharmony_ci pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; 456662306a36Sopenharmony_ci pgoff_t from_pte, to_pte; 456762306a36Sopenharmony_ci vm_fault_t ret; 456862306a36Sopenharmony_ci 456962306a36Sopenharmony_ci /* The PTE offset of the start address, clamped to the VMA. */ 457062306a36Sopenharmony_ci from_pte = max(ALIGN_DOWN(pte_off, nr_pages), 457162306a36Sopenharmony_ci pte_off - min(pte_off, vma_off)); 457262306a36Sopenharmony_ci 457362306a36Sopenharmony_ci /* The PTE offset of the end address, clamped to the VMA and PTE. */ 457462306a36Sopenharmony_ci to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE, 457562306a36Sopenharmony_ci pte_off + vma_pages(vmf->vma) - vma_off) - 1; 457662306a36Sopenharmony_ci 457762306a36Sopenharmony_ci if (pmd_none(*vmf->pmd)) { 457862306a36Sopenharmony_ci vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); 457962306a36Sopenharmony_ci if (!vmf->prealloc_pte) 458062306a36Sopenharmony_ci return VM_FAULT_OOM; 458162306a36Sopenharmony_ci } 458262306a36Sopenharmony_ci 458362306a36Sopenharmony_ci rcu_read_lock(); 458462306a36Sopenharmony_ci ret = vmf->vma->vm_ops->map_pages(vmf, 458562306a36Sopenharmony_ci vmf->pgoff + from_pte - pte_off, 458662306a36Sopenharmony_ci vmf->pgoff + to_pte - pte_off); 458762306a36Sopenharmony_ci rcu_read_unlock(); 458862306a36Sopenharmony_ci 458962306a36Sopenharmony_ci return ret; 459062306a36Sopenharmony_ci} 459162306a36Sopenharmony_ci 459262306a36Sopenharmony_ci/* Return true if we should do read fault-around, false otherwise */ 459362306a36Sopenharmony_cistatic inline bool should_fault_around(struct vm_fault *vmf) 459462306a36Sopenharmony_ci{ 459562306a36Sopenharmony_ci /* No ->map_pages? No way to fault around... */ 459662306a36Sopenharmony_ci if (!vmf->vma->vm_ops->map_pages) 459762306a36Sopenharmony_ci return false; 459862306a36Sopenharmony_ci 459962306a36Sopenharmony_ci if (uffd_disable_fault_around(vmf->vma)) 460062306a36Sopenharmony_ci return false; 460162306a36Sopenharmony_ci 460262306a36Sopenharmony_ci /* A single page implies no faulting 'around' at all. */ 460362306a36Sopenharmony_ci return fault_around_pages > 1; 460462306a36Sopenharmony_ci} 460562306a36Sopenharmony_ci 460662306a36Sopenharmony_cistatic vm_fault_t do_read_fault(struct vm_fault *vmf) 460762306a36Sopenharmony_ci{ 460862306a36Sopenharmony_ci vm_fault_t ret = 0; 460962306a36Sopenharmony_ci struct folio *folio; 461062306a36Sopenharmony_ci 461162306a36Sopenharmony_ci /* 461262306a36Sopenharmony_ci * Let's call ->map_pages() first and use ->fault() as fallback 461362306a36Sopenharmony_ci * if page by the offset is not ready to be mapped (cold cache or 461462306a36Sopenharmony_ci * something). 
461562306a36Sopenharmony_ci */ 461662306a36Sopenharmony_ci if (should_fault_around(vmf)) { 461762306a36Sopenharmony_ci ret = do_fault_around(vmf); 461862306a36Sopenharmony_ci if (ret) 461962306a36Sopenharmony_ci return ret; 462062306a36Sopenharmony_ci } 462162306a36Sopenharmony_ci 462262306a36Sopenharmony_ci if (vmf->flags & FAULT_FLAG_VMA_LOCK) { 462362306a36Sopenharmony_ci vma_end_read(vmf->vma); 462462306a36Sopenharmony_ci return VM_FAULT_RETRY; 462562306a36Sopenharmony_ci } 462662306a36Sopenharmony_ci 462762306a36Sopenharmony_ci ret = __do_fault(vmf); 462862306a36Sopenharmony_ci if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 462962306a36Sopenharmony_ci return ret; 463062306a36Sopenharmony_ci 463162306a36Sopenharmony_ci ret |= finish_fault(vmf); 463262306a36Sopenharmony_ci folio = page_folio(vmf->page); 463362306a36Sopenharmony_ci folio_unlock(folio); 463462306a36Sopenharmony_ci if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 463562306a36Sopenharmony_ci folio_put(folio); 463662306a36Sopenharmony_ci return ret; 463762306a36Sopenharmony_ci} 463862306a36Sopenharmony_ci 463962306a36Sopenharmony_cistatic vm_fault_t do_cow_fault(struct vm_fault *vmf) 464062306a36Sopenharmony_ci{ 464162306a36Sopenharmony_ci struct vm_area_struct *vma = vmf->vma; 464262306a36Sopenharmony_ci vm_fault_t ret; 464362306a36Sopenharmony_ci 464462306a36Sopenharmony_ci if (vmf->flags & FAULT_FLAG_VMA_LOCK) { 464562306a36Sopenharmony_ci vma_end_read(vma); 464662306a36Sopenharmony_ci return VM_FAULT_RETRY; 464762306a36Sopenharmony_ci } 464862306a36Sopenharmony_ci 464962306a36Sopenharmony_ci if (unlikely(anon_vma_prepare(vma))) 465062306a36Sopenharmony_ci return VM_FAULT_OOM; 465162306a36Sopenharmony_ci 465262306a36Sopenharmony_ci vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); 465362306a36Sopenharmony_ci if (!vmf->cow_page) 465462306a36Sopenharmony_ci return VM_FAULT_OOM; 465562306a36Sopenharmony_ci 465662306a36Sopenharmony_ci if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm, 465762306a36Sopenharmony_ci GFP_KERNEL)) { 465862306a36Sopenharmony_ci put_page(vmf->cow_page); 465962306a36Sopenharmony_ci return VM_FAULT_OOM; 466062306a36Sopenharmony_ci } 466162306a36Sopenharmony_ci folio_throttle_swaprate(page_folio(vmf->cow_page), GFP_KERNEL); 466262306a36Sopenharmony_ci 466362306a36Sopenharmony_ci ret = __do_fault(vmf); 466462306a36Sopenharmony_ci if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 466562306a36Sopenharmony_ci goto uncharge_out; 466662306a36Sopenharmony_ci if (ret & VM_FAULT_DONE_COW) 466762306a36Sopenharmony_ci return ret; 466862306a36Sopenharmony_ci 466962306a36Sopenharmony_ci copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); 467062306a36Sopenharmony_ci __SetPageUptodate(vmf->cow_page); 467162306a36Sopenharmony_ci 467262306a36Sopenharmony_ci ret |= finish_fault(vmf); 467362306a36Sopenharmony_ci unlock_page(vmf->page); 467462306a36Sopenharmony_ci put_page(vmf->page); 467562306a36Sopenharmony_ci if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 467662306a36Sopenharmony_ci goto uncharge_out; 467762306a36Sopenharmony_ci return ret; 467862306a36Sopenharmony_ciuncharge_out: 467962306a36Sopenharmony_ci put_page(vmf->cow_page); 468062306a36Sopenharmony_ci return ret; 468162306a36Sopenharmony_ci} 468262306a36Sopenharmony_ci 468362306a36Sopenharmony_cistatic vm_fault_t do_shared_fault(struct vm_fault *vmf) 468462306a36Sopenharmony_ci{ 468562306a36Sopenharmony_ci struct 
vm_area_struct *vma = vmf->vma; 468662306a36Sopenharmony_ci vm_fault_t ret, tmp; 468762306a36Sopenharmony_ci struct folio *folio; 468862306a36Sopenharmony_ci 468962306a36Sopenharmony_ci if (vmf->flags & FAULT_FLAG_VMA_LOCK) { 469062306a36Sopenharmony_ci vma_end_read(vma); 469162306a36Sopenharmony_ci return VM_FAULT_RETRY; 469262306a36Sopenharmony_ci } 469362306a36Sopenharmony_ci 469462306a36Sopenharmony_ci ret = __do_fault(vmf); 469562306a36Sopenharmony_ci if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 469662306a36Sopenharmony_ci return ret; 469762306a36Sopenharmony_ci 469862306a36Sopenharmony_ci folio = page_folio(vmf->page); 469962306a36Sopenharmony_ci 470062306a36Sopenharmony_ci /* 470162306a36Sopenharmony_ci * Check if the backing address space wants to know that the page is 470262306a36Sopenharmony_ci * about to become writable 470362306a36Sopenharmony_ci */ 470462306a36Sopenharmony_ci if (vma->vm_ops->page_mkwrite) { 470562306a36Sopenharmony_ci folio_unlock(folio); 470662306a36Sopenharmony_ci tmp = do_page_mkwrite(vmf, folio); 470762306a36Sopenharmony_ci if (unlikely(!tmp || 470862306a36Sopenharmony_ci (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { 470962306a36Sopenharmony_ci folio_put(folio); 471062306a36Sopenharmony_ci return tmp; 471162306a36Sopenharmony_ci } 471262306a36Sopenharmony_ci } 471362306a36Sopenharmony_ci 471462306a36Sopenharmony_ci ret |= finish_fault(vmf); 471562306a36Sopenharmony_ci if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | 471662306a36Sopenharmony_ci VM_FAULT_RETRY))) { 471762306a36Sopenharmony_ci folio_unlock(folio); 471862306a36Sopenharmony_ci folio_put(folio); 471962306a36Sopenharmony_ci return ret; 472062306a36Sopenharmony_ci } 472162306a36Sopenharmony_ci 472262306a36Sopenharmony_ci ret |= fault_dirty_shared_page(vmf); 472362306a36Sopenharmony_ci return ret; 472462306a36Sopenharmony_ci} 472562306a36Sopenharmony_ci 472662306a36Sopenharmony_ci/* 472762306a36Sopenharmony_ci * We enter with non-exclusive mmap_lock (to exclude vma changes, 472862306a36Sopenharmony_ci * but allow concurrent faults). 472962306a36Sopenharmony_ci * The mmap_lock may have been released depending on flags and our 473062306a36Sopenharmony_ci * return value. See filemap_fault() and __folio_lock_or_retry(). 473162306a36Sopenharmony_ci * If mmap_lock is released, vma may become invalid (for example 473262306a36Sopenharmony_ci * by other thread calling munmap()). 473362306a36Sopenharmony_ci */ 473462306a36Sopenharmony_cistatic vm_fault_t do_fault(struct vm_fault *vmf) 473562306a36Sopenharmony_ci{ 473662306a36Sopenharmony_ci struct vm_area_struct *vma = vmf->vma; 473762306a36Sopenharmony_ci struct mm_struct *vm_mm = vma->vm_mm; 473862306a36Sopenharmony_ci vm_fault_t ret; 473962306a36Sopenharmony_ci 474062306a36Sopenharmony_ci /* 474162306a36Sopenharmony_ci * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND 474262306a36Sopenharmony_ci */ 474362306a36Sopenharmony_ci if (!vma->vm_ops->fault) { 474462306a36Sopenharmony_ci vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, 474562306a36Sopenharmony_ci vmf->address, &vmf->ptl); 474662306a36Sopenharmony_ci if (unlikely(!vmf->pte)) 474762306a36Sopenharmony_ci ret = VM_FAULT_SIGBUS; 474862306a36Sopenharmony_ci else { 474962306a36Sopenharmony_ci /* 475062306a36Sopenharmony_ci * Make sure this is not a temporary clearing of pte 475162306a36Sopenharmony_ci * by holding ptl and checking again. 
A R/M/W update
475262306a36Sopenharmony_ci			 * of the pte involves taking the ptl, clearing the pte so
475362306a36Sopenharmony_ci			 * that we don't race with concurrent modification by
475462306a36Sopenharmony_ci			 * hardware, and then writing the update.
475562306a36Sopenharmony_ci			 */
475662306a36Sopenharmony_ci			if (unlikely(pte_none(ptep_get(vmf->pte))))
475762306a36Sopenharmony_ci				ret = VM_FAULT_SIGBUS;
475862306a36Sopenharmony_ci			else
475962306a36Sopenharmony_ci				ret = VM_FAULT_NOPAGE;
476062306a36Sopenharmony_ci
476162306a36Sopenharmony_ci			pte_unmap_unlock(vmf->pte, vmf->ptl);
476262306a36Sopenharmony_ci		}
476362306a36Sopenharmony_ci	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
476462306a36Sopenharmony_ci		ret = do_read_fault(vmf);
476562306a36Sopenharmony_ci	else if (!(vma->vm_flags & VM_SHARED))
476662306a36Sopenharmony_ci		ret = do_cow_fault(vmf);
476762306a36Sopenharmony_ci	else
476862306a36Sopenharmony_ci		ret = do_shared_fault(vmf);
476962306a36Sopenharmony_ci
477062306a36Sopenharmony_ci	/* preallocated pagetable is unused: free it */
477162306a36Sopenharmony_ci	if (vmf->prealloc_pte) {
477262306a36Sopenharmony_ci		pte_free(vm_mm, vmf->prealloc_pte);
477362306a36Sopenharmony_ci		vmf->prealloc_pte = NULL;
477462306a36Sopenharmony_ci	}
477562306a36Sopenharmony_ci	return ret;
477662306a36Sopenharmony_ci}
477762306a36Sopenharmony_ci
477862306a36Sopenharmony_ciint numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
477962306a36Sopenharmony_ci		      unsigned long addr, int page_nid, int *flags)
478062306a36Sopenharmony_ci{
478162306a36Sopenharmony_ci	get_page(page);
478262306a36Sopenharmony_ci
478362306a36Sopenharmony_ci	/* Record the current PID accessing the VMA */
478462306a36Sopenharmony_ci	vma_set_access_pid_bit(vma);
478562306a36Sopenharmony_ci
478662306a36Sopenharmony_ci	count_vm_numa_event(NUMA_HINT_FAULTS);
478762306a36Sopenharmony_ci	if (page_nid == numa_node_id()) {
478862306a36Sopenharmony_ci		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
478962306a36Sopenharmony_ci		*flags |= TNF_FAULT_LOCAL;
479062306a36Sopenharmony_ci	}
479162306a36Sopenharmony_ci
479262306a36Sopenharmony_ci	return mpol_misplaced(page, vma, addr);
479362306a36Sopenharmony_ci}
479462306a36Sopenharmony_ci
479562306a36Sopenharmony_cistatic vm_fault_t do_numa_page(struct vm_fault *vmf)
479662306a36Sopenharmony_ci{
479762306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
479862306a36Sopenharmony_ci	struct page *page = NULL;
479962306a36Sopenharmony_ci	int page_nid = NUMA_NO_NODE;
480062306a36Sopenharmony_ci	bool writable = false;
480162306a36Sopenharmony_ci	int last_cpupid;
480262306a36Sopenharmony_ci	int target_nid;
480362306a36Sopenharmony_ci	pte_t pte, old_pte;
480462306a36Sopenharmony_ci	int flags = 0;
480562306a36Sopenharmony_ci
480662306a36Sopenharmony_ci	/*
480762306a36Sopenharmony_ci	 * The "pte" at this point cannot be used safely without
480862306a36Sopenharmony_ci	 * validation through pte_unmap_same(). It's of NUMA type but
480962306a36Sopenharmony_ci	 * the pfn may be screwed if the read is non atomic.
481062306a36Sopenharmony_ci */ 481162306a36Sopenharmony_ci spin_lock(vmf->ptl); 481262306a36Sopenharmony_ci if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { 481362306a36Sopenharmony_ci pte_unmap_unlock(vmf->pte, vmf->ptl); 481462306a36Sopenharmony_ci goto out; 481562306a36Sopenharmony_ci } 481662306a36Sopenharmony_ci 481762306a36Sopenharmony_ci /* Get the normal PTE */ 481862306a36Sopenharmony_ci old_pte = ptep_get(vmf->pte); 481962306a36Sopenharmony_ci pte = pte_modify(old_pte, vma->vm_page_prot); 482062306a36Sopenharmony_ci 482162306a36Sopenharmony_ci /* 482262306a36Sopenharmony_ci * Detect now whether the PTE could be writable; this information 482362306a36Sopenharmony_ci * is only valid while holding the PT lock. 482462306a36Sopenharmony_ci */ 482562306a36Sopenharmony_ci writable = pte_write(pte); 482662306a36Sopenharmony_ci if (!writable && vma_wants_manual_pte_write_upgrade(vma) && 482762306a36Sopenharmony_ci can_change_pte_writable(vma, vmf->address, pte)) 482862306a36Sopenharmony_ci writable = true; 482962306a36Sopenharmony_ci 483062306a36Sopenharmony_ci page = vm_normal_page(vma, vmf->address, pte); 483162306a36Sopenharmony_ci if (!page || is_zone_device_page(page)) 483262306a36Sopenharmony_ci goto out_map; 483362306a36Sopenharmony_ci 483462306a36Sopenharmony_ci /* TODO: handle PTE-mapped THP */ 483562306a36Sopenharmony_ci if (PageCompound(page)) 483662306a36Sopenharmony_ci goto out_map; 483762306a36Sopenharmony_ci 483862306a36Sopenharmony_ci /* 483962306a36Sopenharmony_ci * Avoid grouping on RO pages in general. RO pages shouldn't hurt as 484062306a36Sopenharmony_ci * much anyway since they can be in shared cache state. This misses 484162306a36Sopenharmony_ci * the case where a mapping is writable but the process never writes 484262306a36Sopenharmony_ci * to it but pte_write gets cleared during protection updates and 484362306a36Sopenharmony_ci * pte_dirty has unpredictable behaviour between PTE scan updates, 484462306a36Sopenharmony_ci * background writeback, dirty balancing and application behaviour. 484562306a36Sopenharmony_ci */ 484662306a36Sopenharmony_ci if (!writable) 484762306a36Sopenharmony_ci flags |= TNF_NO_GROUP; 484862306a36Sopenharmony_ci 484962306a36Sopenharmony_ci /* 485062306a36Sopenharmony_ci * Flag if the page is shared between multiple address spaces. This 485162306a36Sopenharmony_ci * is later used when determining whether to group tasks together 485262306a36Sopenharmony_ci */ 485362306a36Sopenharmony_ci if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) 485462306a36Sopenharmony_ci flags |= TNF_SHARED; 485562306a36Sopenharmony_ci 485662306a36Sopenharmony_ci page_nid = page_to_nid(page); 485762306a36Sopenharmony_ci /* 485862306a36Sopenharmony_ci * For memory tiering mode, cpupid of slow memory page is used 485962306a36Sopenharmony_ci * to record page access time. So use default value. 
486062306a36Sopenharmony_ci */ 486162306a36Sopenharmony_ci if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && 486262306a36Sopenharmony_ci !node_is_toptier(page_nid)) 486362306a36Sopenharmony_ci last_cpupid = (-1 & LAST_CPUPID_MASK); 486462306a36Sopenharmony_ci else 486562306a36Sopenharmony_ci last_cpupid = page_cpupid_last(page); 486662306a36Sopenharmony_ci target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, 486762306a36Sopenharmony_ci &flags); 486862306a36Sopenharmony_ci if (target_nid == NUMA_NO_NODE) { 486962306a36Sopenharmony_ci put_page(page); 487062306a36Sopenharmony_ci goto out_map; 487162306a36Sopenharmony_ci } 487262306a36Sopenharmony_ci pte_unmap_unlock(vmf->pte, vmf->ptl); 487362306a36Sopenharmony_ci writable = false; 487462306a36Sopenharmony_ci 487562306a36Sopenharmony_ci /* Migrate to the requested node */ 487662306a36Sopenharmony_ci if (migrate_misplaced_page(page, vma, target_nid)) { 487762306a36Sopenharmony_ci page_nid = target_nid; 487862306a36Sopenharmony_ci flags |= TNF_MIGRATED; 487962306a36Sopenharmony_ci } else { 488062306a36Sopenharmony_ci flags |= TNF_MIGRATE_FAIL; 488162306a36Sopenharmony_ci vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, 488262306a36Sopenharmony_ci vmf->address, &vmf->ptl); 488362306a36Sopenharmony_ci if (unlikely(!vmf->pte)) 488462306a36Sopenharmony_ci goto out; 488562306a36Sopenharmony_ci if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { 488662306a36Sopenharmony_ci pte_unmap_unlock(vmf->pte, vmf->ptl); 488762306a36Sopenharmony_ci goto out; 488862306a36Sopenharmony_ci } 488962306a36Sopenharmony_ci goto out_map; 489062306a36Sopenharmony_ci } 489162306a36Sopenharmony_ci 489262306a36Sopenharmony_ciout: 489362306a36Sopenharmony_ci if (page_nid != NUMA_NO_NODE) 489462306a36Sopenharmony_ci task_numa_fault(last_cpupid, page_nid, 1, flags); 489562306a36Sopenharmony_ci return 0; 489662306a36Sopenharmony_ciout_map: 489762306a36Sopenharmony_ci /* 489862306a36Sopenharmony_ci * Make it present again, depending on how arch implements 489962306a36Sopenharmony_ci * non-accessible ptes, some can allow access by kernel mode. 
490062306a36Sopenharmony_ci */ 490162306a36Sopenharmony_ci old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); 490262306a36Sopenharmony_ci pte = pte_modify(old_pte, vma->vm_page_prot); 490362306a36Sopenharmony_ci pte = pte_mkyoung(pte); 490462306a36Sopenharmony_ci if (writable) 490562306a36Sopenharmony_ci pte = pte_mkwrite(pte, vma); 490662306a36Sopenharmony_ci ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); 490762306a36Sopenharmony_ci update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); 490862306a36Sopenharmony_ci pte_unmap_unlock(vmf->pte, vmf->ptl); 490962306a36Sopenharmony_ci goto out; 491062306a36Sopenharmony_ci} 491162306a36Sopenharmony_ci 491262306a36Sopenharmony_cistatic inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) 491362306a36Sopenharmony_ci{ 491462306a36Sopenharmony_ci struct vm_area_struct *vma = vmf->vma; 491562306a36Sopenharmony_ci if (vma_is_anonymous(vma)) 491662306a36Sopenharmony_ci return do_huge_pmd_anonymous_page(vmf); 491762306a36Sopenharmony_ci if (vma->vm_ops->huge_fault) 491862306a36Sopenharmony_ci return vma->vm_ops->huge_fault(vmf, PMD_ORDER); 491962306a36Sopenharmony_ci return VM_FAULT_FALLBACK; 492062306a36Sopenharmony_ci} 492162306a36Sopenharmony_ci 492262306a36Sopenharmony_ci/* `inline' is required to avoid gcc 4.1.2 build error */ 492362306a36Sopenharmony_cistatic inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) 492462306a36Sopenharmony_ci{ 492562306a36Sopenharmony_ci struct vm_area_struct *vma = vmf->vma; 492662306a36Sopenharmony_ci const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; 492762306a36Sopenharmony_ci vm_fault_t ret; 492862306a36Sopenharmony_ci 492962306a36Sopenharmony_ci if (vma_is_anonymous(vma)) { 493062306a36Sopenharmony_ci if (likely(!unshare) && 493162306a36Sopenharmony_ci userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) 493262306a36Sopenharmony_ci return handle_userfault(vmf, VM_UFFD_WP); 493362306a36Sopenharmony_ci return do_huge_pmd_wp_page(vmf); 493462306a36Sopenharmony_ci } 493562306a36Sopenharmony_ci 493662306a36Sopenharmony_ci if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { 493762306a36Sopenharmony_ci if (vma->vm_ops->huge_fault) { 493862306a36Sopenharmony_ci ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER); 493962306a36Sopenharmony_ci if (!(ret & VM_FAULT_FALLBACK)) 494062306a36Sopenharmony_ci return ret; 494162306a36Sopenharmony_ci } 494262306a36Sopenharmony_ci } 494362306a36Sopenharmony_ci 494462306a36Sopenharmony_ci /* COW or write-notify handled on pte level: split pmd. 
*/ 494562306a36Sopenharmony_ci __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); 494662306a36Sopenharmony_ci 494762306a36Sopenharmony_ci return VM_FAULT_FALLBACK; 494862306a36Sopenharmony_ci} 494962306a36Sopenharmony_ci 495062306a36Sopenharmony_cistatic vm_fault_t create_huge_pud(struct vm_fault *vmf) 495162306a36Sopenharmony_ci{ 495262306a36Sopenharmony_ci#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ 495362306a36Sopenharmony_ci defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) 495462306a36Sopenharmony_ci struct vm_area_struct *vma = vmf->vma; 495562306a36Sopenharmony_ci /* No support for anonymous transparent PUD pages yet */ 495662306a36Sopenharmony_ci if (vma_is_anonymous(vma)) 495762306a36Sopenharmony_ci return VM_FAULT_FALLBACK; 495862306a36Sopenharmony_ci if (vma->vm_ops->huge_fault) 495962306a36Sopenharmony_ci return vma->vm_ops->huge_fault(vmf, PUD_ORDER); 496062306a36Sopenharmony_ci#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 496162306a36Sopenharmony_ci return VM_FAULT_FALLBACK; 496262306a36Sopenharmony_ci} 496362306a36Sopenharmony_ci 496462306a36Sopenharmony_cistatic vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) 496562306a36Sopenharmony_ci{ 496662306a36Sopenharmony_ci#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ 496762306a36Sopenharmony_ci defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) 496862306a36Sopenharmony_ci struct vm_area_struct *vma = vmf->vma; 496962306a36Sopenharmony_ci vm_fault_t ret; 497062306a36Sopenharmony_ci 497162306a36Sopenharmony_ci /* No support for anonymous transparent PUD pages yet */ 497262306a36Sopenharmony_ci if (vma_is_anonymous(vma)) 497362306a36Sopenharmony_ci goto split; 497462306a36Sopenharmony_ci if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { 497562306a36Sopenharmony_ci if (vma->vm_ops->huge_fault) { 497662306a36Sopenharmony_ci ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER); 497762306a36Sopenharmony_ci if (!(ret & VM_FAULT_FALLBACK)) 497862306a36Sopenharmony_ci return ret; 497962306a36Sopenharmony_ci } 498062306a36Sopenharmony_ci } 498162306a36Sopenharmony_cisplit: 498262306a36Sopenharmony_ci /* COW or write-notify not handled on PUD level: split pud.*/ 498362306a36Sopenharmony_ci __split_huge_pud(vma, vmf->pud, vmf->address); 498462306a36Sopenharmony_ci#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 498562306a36Sopenharmony_ci return VM_FAULT_FALLBACK; 498662306a36Sopenharmony_ci} 498762306a36Sopenharmony_ci 498862306a36Sopenharmony_ci/* 498962306a36Sopenharmony_ci * These routines also need to handle stuff like marking pages dirty 499062306a36Sopenharmony_ci * and/or accessed for architectures that don't do it in hardware (most 499162306a36Sopenharmony_ci * RISC architectures). The early dirtying is also good on the i386. 499262306a36Sopenharmony_ci * 499362306a36Sopenharmony_ci * There is also a hook called "update_mmu_cache()" that architectures 499462306a36Sopenharmony_ci * with external mmu caches can use to update those (ie the Sparc or 499562306a36Sopenharmony_ci * PowerPC hashed page tables that act as extended TLBs). 499662306a36Sopenharmony_ci * 499762306a36Sopenharmony_ci * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow 499862306a36Sopenharmony_ci * concurrent faults). 499962306a36Sopenharmony_ci * 500062306a36Sopenharmony_ci * The mmap_lock may have been released depending on flags and our return value. 500162306a36Sopenharmony_ci * See filemap_fault() and __folio_lock_or_retry(). 
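 *
 * Informal dispatch summary (added for orientation, not in the original
 * comment): handle_pte_fault() below sends a none PTE to do_pte_missing(),
 * a non-present PTE to do_swap_page(), a PROT_NONE PTE in an accessible
 * VMA to do_numa_page(), and a write or unshare fault on a non-writable
 * PTE to do_wp_page(); anything else is just an access/dirty-bit update
 * done under the PT lock.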
500262306a36Sopenharmony_ci */ 500362306a36Sopenharmony_cistatic vm_fault_t handle_pte_fault(struct vm_fault *vmf) 500462306a36Sopenharmony_ci{ 500562306a36Sopenharmony_ci pte_t entry; 500662306a36Sopenharmony_ci 500762306a36Sopenharmony_ci if (unlikely(pmd_none(*vmf->pmd))) { 500862306a36Sopenharmony_ci /* 500962306a36Sopenharmony_ci * Leave __pte_alloc() until later: because vm_ops->fault may 501062306a36Sopenharmony_ci * want to allocate huge page, and if we expose page table 501162306a36Sopenharmony_ci * for an instant, it will be difficult to retract from 501262306a36Sopenharmony_ci * concurrent faults and from rmap lookups. 501362306a36Sopenharmony_ci */ 501462306a36Sopenharmony_ci vmf->pte = NULL; 501562306a36Sopenharmony_ci vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID; 501662306a36Sopenharmony_ci } else { 501762306a36Sopenharmony_ci /* 501862306a36Sopenharmony_ci * A regular pmd is established and it can't morph into a huge 501962306a36Sopenharmony_ci * pmd by anon khugepaged, since that takes mmap_lock in write 502062306a36Sopenharmony_ci * mode; but shmem or file collapse to THP could still morph 502162306a36Sopenharmony_ci * it into a huge pmd: just retry later if so. 502262306a36Sopenharmony_ci */ 502362306a36Sopenharmony_ci vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd, 502462306a36Sopenharmony_ci vmf->address, &vmf->ptl); 502562306a36Sopenharmony_ci if (unlikely(!vmf->pte)) 502662306a36Sopenharmony_ci return 0; 502762306a36Sopenharmony_ci vmf->orig_pte = ptep_get_lockless(vmf->pte); 502862306a36Sopenharmony_ci vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID; 502962306a36Sopenharmony_ci 503062306a36Sopenharmony_ci if (pte_none(vmf->orig_pte)) { 503162306a36Sopenharmony_ci pte_unmap(vmf->pte); 503262306a36Sopenharmony_ci vmf->pte = NULL; 503362306a36Sopenharmony_ci } 503462306a36Sopenharmony_ci } 503562306a36Sopenharmony_ci 503662306a36Sopenharmony_ci if (!vmf->pte) 503762306a36Sopenharmony_ci return do_pte_missing(vmf); 503862306a36Sopenharmony_ci 503962306a36Sopenharmony_ci if (!pte_present(vmf->orig_pte)) 504062306a36Sopenharmony_ci return do_swap_page(vmf); 504162306a36Sopenharmony_ci 504262306a36Sopenharmony_ci if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) 504362306a36Sopenharmony_ci return do_numa_page(vmf); 504462306a36Sopenharmony_ci 504562306a36Sopenharmony_ci spin_lock(vmf->ptl); 504662306a36Sopenharmony_ci entry = vmf->orig_pte; 504762306a36Sopenharmony_ci if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { 504862306a36Sopenharmony_ci update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); 504962306a36Sopenharmony_ci goto unlock; 505062306a36Sopenharmony_ci } 505162306a36Sopenharmony_ci if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { 505262306a36Sopenharmony_ci if (!pte_write(entry)) 505362306a36Sopenharmony_ci return do_wp_page(vmf); 505462306a36Sopenharmony_ci else if (likely(vmf->flags & FAULT_FLAG_WRITE)) 505562306a36Sopenharmony_ci entry = pte_mkdirty(entry); 505662306a36Sopenharmony_ci } 505762306a36Sopenharmony_ci entry = pte_mkyoung(entry); 505862306a36Sopenharmony_ci if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, 505962306a36Sopenharmony_ci vmf->flags & FAULT_FLAG_WRITE)) { 506062306a36Sopenharmony_ci update_mmu_cache_range(vmf, vmf->vma, vmf->address, 506162306a36Sopenharmony_ci vmf->pte, 1); 506262306a36Sopenharmony_ci } else { 506362306a36Sopenharmony_ci /* Skip spurious TLB flush for retried page fault */ 506462306a36Sopenharmony_ci if (vmf->flags & FAULT_FLAG_TRIED) 506562306a36Sopenharmony_ci goto unlock; 
506662306a36Sopenharmony_ci /* 506762306a36Sopenharmony_ci * This is needed only for protection faults but the arch code 506862306a36Sopenharmony_ci * is not yet telling us if this is a protection fault or not. 506962306a36Sopenharmony_ci * This still avoids useless tlb flushes for .text page faults 507062306a36Sopenharmony_ci * with threads. 507162306a36Sopenharmony_ci */ 507262306a36Sopenharmony_ci if (vmf->flags & FAULT_FLAG_WRITE) 507362306a36Sopenharmony_ci flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, 507462306a36Sopenharmony_ci vmf->pte); 507562306a36Sopenharmony_ci } 507662306a36Sopenharmony_ciunlock: 507762306a36Sopenharmony_ci pte_unmap_unlock(vmf->pte, vmf->ptl); 507862306a36Sopenharmony_ci return 0; 507962306a36Sopenharmony_ci} 508062306a36Sopenharmony_ci 508162306a36Sopenharmony_ci/* 508262306a36Sopenharmony_ci * On entry, we hold either the VMA lock or the mmap_lock 508362306a36Sopenharmony_ci * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in 508462306a36Sopenharmony_ci * the result, the mmap_lock is not held on exit. See filemap_fault() 508562306a36Sopenharmony_ci * and __folio_lock_or_retry(). 508662306a36Sopenharmony_ci */ 508762306a36Sopenharmony_cistatic vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, 508862306a36Sopenharmony_ci unsigned long address, unsigned int flags) 508962306a36Sopenharmony_ci{ 509062306a36Sopenharmony_ci struct vm_fault vmf = { 509162306a36Sopenharmony_ci .vma = vma, 509262306a36Sopenharmony_ci .address = address & PAGE_MASK, 509362306a36Sopenharmony_ci .real_address = address, 509462306a36Sopenharmony_ci .flags = flags, 509562306a36Sopenharmony_ci .pgoff = linear_page_index(vma, address), 509662306a36Sopenharmony_ci .gfp_mask = __get_fault_gfp_mask(vma), 509762306a36Sopenharmony_ci }; 509862306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 509962306a36Sopenharmony_ci unsigned long vm_flags = vma->vm_flags; 510062306a36Sopenharmony_ci pgd_t *pgd; 510162306a36Sopenharmony_ci p4d_t *p4d; 510262306a36Sopenharmony_ci vm_fault_t ret; 510362306a36Sopenharmony_ci 510462306a36Sopenharmony_ci pgd = pgd_offset(mm, address); 510562306a36Sopenharmony_ci p4d = p4d_alloc(mm, pgd, address); 510662306a36Sopenharmony_ci if (!p4d) 510762306a36Sopenharmony_ci return VM_FAULT_OOM; 510862306a36Sopenharmony_ci 510962306a36Sopenharmony_ci vmf.pud = pud_alloc(mm, p4d, address); 511062306a36Sopenharmony_ci if (!vmf.pud) 511162306a36Sopenharmony_ci return VM_FAULT_OOM; 511262306a36Sopenharmony_ciretry_pud: 511362306a36Sopenharmony_ci if (pud_none(*vmf.pud) && 511462306a36Sopenharmony_ci hugepage_vma_check(vma, vm_flags, false, true, true)) { 511562306a36Sopenharmony_ci ret = create_huge_pud(&vmf); 511662306a36Sopenharmony_ci if (!(ret & VM_FAULT_FALLBACK)) 511762306a36Sopenharmony_ci return ret; 511862306a36Sopenharmony_ci } else { 511962306a36Sopenharmony_ci pud_t orig_pud = *vmf.pud; 512062306a36Sopenharmony_ci 512162306a36Sopenharmony_ci barrier(); 512262306a36Sopenharmony_ci if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { 512362306a36Sopenharmony_ci 512462306a36Sopenharmony_ci /* 512562306a36Sopenharmony_ci * TODO once we support anonymous PUDs: NUMA case and 512662306a36Sopenharmony_ci * FAULT_FLAG_UNSHARE handling. 
512762306a36Sopenharmony_ci */ 512862306a36Sopenharmony_ci if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) { 512962306a36Sopenharmony_ci ret = wp_huge_pud(&vmf, orig_pud); 513062306a36Sopenharmony_ci if (!(ret & VM_FAULT_FALLBACK)) 513162306a36Sopenharmony_ci return ret; 513262306a36Sopenharmony_ci } else { 513362306a36Sopenharmony_ci huge_pud_set_accessed(&vmf, orig_pud); 513462306a36Sopenharmony_ci return 0; 513562306a36Sopenharmony_ci } 513662306a36Sopenharmony_ci } 513762306a36Sopenharmony_ci } 513862306a36Sopenharmony_ci 513962306a36Sopenharmony_ci vmf.pmd = pmd_alloc(mm, vmf.pud, address); 514062306a36Sopenharmony_ci if (!vmf.pmd) 514162306a36Sopenharmony_ci return VM_FAULT_OOM; 514262306a36Sopenharmony_ci 514362306a36Sopenharmony_ci /* Huge pud page fault raced with pmd_alloc? */ 514462306a36Sopenharmony_ci if (pud_trans_unstable(vmf.pud)) 514562306a36Sopenharmony_ci goto retry_pud; 514662306a36Sopenharmony_ci 514762306a36Sopenharmony_ci if (pmd_none(*vmf.pmd) && 514862306a36Sopenharmony_ci hugepage_vma_check(vma, vm_flags, false, true, true)) { 514962306a36Sopenharmony_ci ret = create_huge_pmd(&vmf); 515062306a36Sopenharmony_ci if (!(ret & VM_FAULT_FALLBACK)) 515162306a36Sopenharmony_ci return ret; 515262306a36Sopenharmony_ci } else { 515362306a36Sopenharmony_ci vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); 515462306a36Sopenharmony_ci 515562306a36Sopenharmony_ci if (unlikely(is_swap_pmd(vmf.orig_pmd))) { 515662306a36Sopenharmony_ci VM_BUG_ON(thp_migration_supported() && 515762306a36Sopenharmony_ci !is_pmd_migration_entry(vmf.orig_pmd)); 515862306a36Sopenharmony_ci if (is_pmd_migration_entry(vmf.orig_pmd)) 515962306a36Sopenharmony_ci pmd_migration_entry_wait(mm, vmf.pmd); 516062306a36Sopenharmony_ci return 0; 516162306a36Sopenharmony_ci } 516262306a36Sopenharmony_ci if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) { 516362306a36Sopenharmony_ci if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) 516462306a36Sopenharmony_ci return do_huge_pmd_numa_page(&vmf); 516562306a36Sopenharmony_ci 516662306a36Sopenharmony_ci if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && 516762306a36Sopenharmony_ci !pmd_write(vmf.orig_pmd)) { 516862306a36Sopenharmony_ci ret = wp_huge_pmd(&vmf); 516962306a36Sopenharmony_ci if (!(ret & VM_FAULT_FALLBACK)) 517062306a36Sopenharmony_ci return ret; 517162306a36Sopenharmony_ci } else { 517262306a36Sopenharmony_ci huge_pmd_set_accessed(&vmf); 517362306a36Sopenharmony_ci return 0; 517462306a36Sopenharmony_ci } 517562306a36Sopenharmony_ci } 517662306a36Sopenharmony_ci } 517762306a36Sopenharmony_ci 517862306a36Sopenharmony_ci return handle_pte_fault(&vmf); 517962306a36Sopenharmony_ci} 518062306a36Sopenharmony_ci 518162306a36Sopenharmony_ci/** 518262306a36Sopenharmony_ci * mm_account_fault - Do page fault accounting 518362306a36Sopenharmony_ci * @mm: mm from which memcg should be extracted. It can be NULL. 518462306a36Sopenharmony_ci * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting 518562306a36Sopenharmony_ci * of perf event counters, but we'll still do the per-task accounting to 518662306a36Sopenharmony_ci * the task who triggered this page fault. 518762306a36Sopenharmony_ci * @address: the faulted address. 518862306a36Sopenharmony_ci * @flags: the fault flags. 518962306a36Sopenharmony_ci * @ret: the fault retcode. 519062306a36Sopenharmony_ci * 519162306a36Sopenharmony_ci * This will take care of most of the page fault accounting. 
Meanwhile, it 519262306a36Sopenharmony_ci * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter 519362306a36Sopenharmony_ci * updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should 519462306a36Sopenharmony_ci * still be in per-arch page fault handlers at the entry of page fault. 519562306a36Sopenharmony_ci */ 519662306a36Sopenharmony_cistatic inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs, 519762306a36Sopenharmony_ci unsigned long address, unsigned int flags, 519862306a36Sopenharmony_ci vm_fault_t ret) 519962306a36Sopenharmony_ci{ 520062306a36Sopenharmony_ci bool major; 520162306a36Sopenharmony_ci 520262306a36Sopenharmony_ci /* Incomplete faults will be accounted upon completion. */ 520362306a36Sopenharmony_ci if (ret & VM_FAULT_RETRY) 520462306a36Sopenharmony_ci return; 520562306a36Sopenharmony_ci 520662306a36Sopenharmony_ci /* 520762306a36Sopenharmony_ci * To preserve the behavior of older kernels, PGFAULT counters record 520862306a36Sopenharmony_ci * both successful and failed faults, as opposed to perf counters, 520962306a36Sopenharmony_ci * which ignore failed cases. 521062306a36Sopenharmony_ci */ 521162306a36Sopenharmony_ci count_vm_event(PGFAULT); 521262306a36Sopenharmony_ci count_memcg_event_mm(mm, PGFAULT); 521362306a36Sopenharmony_ci 521462306a36Sopenharmony_ci /* 521562306a36Sopenharmony_ci * Do not account for unsuccessful faults (e.g. when the address wasn't 521662306a36Sopenharmony_ci * valid). That includes arch_vma_access_permitted() failing before 521762306a36Sopenharmony_ci * reaching here. So this is not a "this many hardware page faults" 521862306a36Sopenharmony_ci * counter. We should use the hw profiling for that. 521962306a36Sopenharmony_ci */ 522062306a36Sopenharmony_ci if (ret & VM_FAULT_ERROR) 522162306a36Sopenharmony_ci return; 522262306a36Sopenharmony_ci 522362306a36Sopenharmony_ci /* 522462306a36Sopenharmony_ci * We define the fault as a major fault when the final successful fault 522562306a36Sopenharmony_ci * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't 522662306a36Sopenharmony_ci * handle it immediately previously). 522762306a36Sopenharmony_ci */ 522862306a36Sopenharmony_ci major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED); 522962306a36Sopenharmony_ci 523062306a36Sopenharmony_ci if (major) 523162306a36Sopenharmony_ci current->maj_flt++; 523262306a36Sopenharmony_ci else 523362306a36Sopenharmony_ci current->min_flt++; 523462306a36Sopenharmony_ci 523562306a36Sopenharmony_ci /* 523662306a36Sopenharmony_ci * If the fault is done for GUP, regs will be NULL. We only do the 523762306a36Sopenharmony_ci * accounting for the per thread fault counters who triggered the 523862306a36Sopenharmony_ci * fault, and we skip the perf event updates. 
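	 *
	 * (Example, for illustration only: a fault that first returned
	 * VM_FAULT_RETRY and is then re-issued with FAULT_FLAG_TRIED is
	 * accounted as a major fault above, even if the final attempt
	 * completes without VM_FAULT_MAJOR.)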
523962306a36Sopenharmony_ci */ 524062306a36Sopenharmony_ci if (!regs) 524162306a36Sopenharmony_ci return; 524262306a36Sopenharmony_ci 524362306a36Sopenharmony_ci if (major) 524462306a36Sopenharmony_ci perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); 524562306a36Sopenharmony_ci else 524662306a36Sopenharmony_ci perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); 524762306a36Sopenharmony_ci} 524862306a36Sopenharmony_ci 524962306a36Sopenharmony_ci#ifdef CONFIG_LRU_GEN 525062306a36Sopenharmony_cistatic void lru_gen_enter_fault(struct vm_area_struct *vma) 525162306a36Sopenharmony_ci{ 525262306a36Sopenharmony_ci /* the LRU algorithm only applies to accesses with recency */ 525362306a36Sopenharmony_ci current->in_lru_fault = vma_has_recency(vma); 525462306a36Sopenharmony_ci} 525562306a36Sopenharmony_ci 525662306a36Sopenharmony_cistatic void lru_gen_exit_fault(void) 525762306a36Sopenharmony_ci{ 525862306a36Sopenharmony_ci current->in_lru_fault = false; 525962306a36Sopenharmony_ci} 526062306a36Sopenharmony_ci#else 526162306a36Sopenharmony_cistatic void lru_gen_enter_fault(struct vm_area_struct *vma) 526262306a36Sopenharmony_ci{ 526362306a36Sopenharmony_ci} 526462306a36Sopenharmony_ci 526562306a36Sopenharmony_cistatic void lru_gen_exit_fault(void) 526662306a36Sopenharmony_ci{ 526762306a36Sopenharmony_ci} 526862306a36Sopenharmony_ci#endif /* CONFIG_LRU_GEN */ 526962306a36Sopenharmony_ci 527062306a36Sopenharmony_cistatic vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, 527162306a36Sopenharmony_ci unsigned int *flags) 527262306a36Sopenharmony_ci{ 527362306a36Sopenharmony_ci if (unlikely(*flags & FAULT_FLAG_UNSHARE)) { 527462306a36Sopenharmony_ci if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE)) 527562306a36Sopenharmony_ci return VM_FAULT_SIGSEGV; 527662306a36Sopenharmony_ci /* 527762306a36Sopenharmony_ci * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's 527862306a36Sopenharmony_ci * just treat it like an ordinary read-fault otherwise. 527962306a36Sopenharmony_ci */ 528062306a36Sopenharmony_ci if (!is_cow_mapping(vma->vm_flags)) 528162306a36Sopenharmony_ci *flags &= ~FAULT_FLAG_UNSHARE; 528262306a36Sopenharmony_ci } else if (*flags & FAULT_FLAG_WRITE) { 528362306a36Sopenharmony_ci /* Write faults on read-only mappings are impossible ... */ 528462306a36Sopenharmony_ci if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE))) 528562306a36Sopenharmony_ci return VM_FAULT_SIGSEGV; 528662306a36Sopenharmony_ci /* ... and FOLL_FORCE only applies to COW mappings. */ 528762306a36Sopenharmony_ci if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) && 528862306a36Sopenharmony_ci !is_cow_mapping(vma->vm_flags))) 528962306a36Sopenharmony_ci return VM_FAULT_SIGSEGV; 529062306a36Sopenharmony_ci } 529162306a36Sopenharmony_ci#ifdef CONFIG_PER_VMA_LOCK 529262306a36Sopenharmony_ci /* 529362306a36Sopenharmony_ci * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of 529462306a36Sopenharmony_ci * the assumption that lock is dropped on VM_FAULT_RETRY. 
529562306a36Sopenharmony_ci */ 529662306a36Sopenharmony_ci if (WARN_ON_ONCE((*flags & 529762306a36Sopenharmony_ci (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) == 529862306a36Sopenharmony_ci (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT))) 529962306a36Sopenharmony_ci return VM_FAULT_SIGSEGV; 530062306a36Sopenharmony_ci#endif 530162306a36Sopenharmony_ci 530262306a36Sopenharmony_ci return 0; 530362306a36Sopenharmony_ci} 530462306a36Sopenharmony_ci 530562306a36Sopenharmony_ci/* 530662306a36Sopenharmony_ci * By the time we get here, we already hold the mm semaphore 530762306a36Sopenharmony_ci * 530862306a36Sopenharmony_ci * The mmap_lock may have been released depending on flags and our 530962306a36Sopenharmony_ci * return value. See filemap_fault() and __folio_lock_or_retry(). 531062306a36Sopenharmony_ci */ 531162306a36Sopenharmony_civm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, 531262306a36Sopenharmony_ci unsigned int flags, struct pt_regs *regs) 531362306a36Sopenharmony_ci{ 531462306a36Sopenharmony_ci /* If the fault handler drops the mmap_lock, vma may be freed */ 531562306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 531662306a36Sopenharmony_ci vm_fault_t ret; 531762306a36Sopenharmony_ci 531862306a36Sopenharmony_ci __set_current_state(TASK_RUNNING); 531962306a36Sopenharmony_ci 532062306a36Sopenharmony_ci ret = sanitize_fault_flags(vma, &flags); 532162306a36Sopenharmony_ci if (ret) 532262306a36Sopenharmony_ci goto out; 532362306a36Sopenharmony_ci 532462306a36Sopenharmony_ci if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, 532562306a36Sopenharmony_ci flags & FAULT_FLAG_INSTRUCTION, 532662306a36Sopenharmony_ci flags & FAULT_FLAG_REMOTE)) { 532762306a36Sopenharmony_ci ret = VM_FAULT_SIGSEGV; 532862306a36Sopenharmony_ci goto out; 532962306a36Sopenharmony_ci } 533062306a36Sopenharmony_ci 533162306a36Sopenharmony_ci /* 533262306a36Sopenharmony_ci * Enable the memcg OOM handling for faults triggered in user 533362306a36Sopenharmony_ci * space. Kernel faults are handled more gracefully. 533462306a36Sopenharmony_ci */ 533562306a36Sopenharmony_ci if (flags & FAULT_FLAG_USER) 533662306a36Sopenharmony_ci mem_cgroup_enter_user_fault(); 533762306a36Sopenharmony_ci 533862306a36Sopenharmony_ci lru_gen_enter_fault(vma); 533962306a36Sopenharmony_ci 534062306a36Sopenharmony_ci if (unlikely(is_vm_hugetlb_page(vma))) 534162306a36Sopenharmony_ci ret = hugetlb_fault(vma->vm_mm, vma, address, flags); 534262306a36Sopenharmony_ci else 534362306a36Sopenharmony_ci ret = __handle_mm_fault(vma, address, flags); 534462306a36Sopenharmony_ci 534562306a36Sopenharmony_ci lru_gen_exit_fault(); 534662306a36Sopenharmony_ci 534762306a36Sopenharmony_ci if (flags & FAULT_FLAG_USER) { 534862306a36Sopenharmony_ci mem_cgroup_exit_user_fault(); 534962306a36Sopenharmony_ci /* 535062306a36Sopenharmony_ci * The task may have entered a memcg OOM situation but 535162306a36Sopenharmony_ci * if the allocation error was handled gracefully (no 535262306a36Sopenharmony_ci * VM_FAULT_OOM), there is no need to kill anything. 535362306a36Sopenharmony_ci * Just clean up the OOM state peacefully. 
535462306a36Sopenharmony_ci	 */
535562306a36Sopenharmony_ci		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
535662306a36Sopenharmony_ci			mem_cgroup_oom_synchronize(false);
535762306a36Sopenharmony_ci	}
535862306a36Sopenharmony_ciout:
535962306a36Sopenharmony_ci	mm_account_fault(mm, regs, address, flags, ret);
536062306a36Sopenharmony_ci
536162306a36Sopenharmony_ci	return ret;
536262306a36Sopenharmony_ci}
536362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(handle_mm_fault);
536462306a36Sopenharmony_ci
536562306a36Sopenharmony_ci#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
536662306a36Sopenharmony_ci#include <linux/extable.h>
536762306a36Sopenharmony_ci
536862306a36Sopenharmony_cistatic inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
536962306a36Sopenharmony_ci{
537062306a36Sopenharmony_ci	if (likely(mmap_read_trylock(mm)))
537162306a36Sopenharmony_ci		return true;
537262306a36Sopenharmony_ci
537362306a36Sopenharmony_ci	if (regs && !user_mode(regs)) {
537462306a36Sopenharmony_ci		unsigned long ip = exception_ip(regs);
537562306a36Sopenharmony_ci		if (!search_exception_tables(ip))
537662306a36Sopenharmony_ci			return false;
537762306a36Sopenharmony_ci	}
537862306a36Sopenharmony_ci
537962306a36Sopenharmony_ci	return !mmap_read_lock_killable(mm);
538062306a36Sopenharmony_ci}
538162306a36Sopenharmony_ci
538262306a36Sopenharmony_cistatic inline bool mmap_upgrade_trylock(struct mm_struct *mm)
538362306a36Sopenharmony_ci{
538462306a36Sopenharmony_ci	/*
538562306a36Sopenharmony_ci	 * We don't have this operation yet.
538662306a36Sopenharmony_ci	 *
538762306a36Sopenharmony_ci	 * It should be easy enough to do: it's basically an
538862306a36Sopenharmony_ci	 * atomic_long_try_cmpxchg_acquire()
538962306a36Sopenharmony_ci	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
539062306a36Sopenharmony_ci	 * it also needs the proper lockdep magic etc.
539162306a36Sopenharmony_ci	 */
539262306a36Sopenharmony_ci	return false;
539362306a36Sopenharmony_ci}
539462306a36Sopenharmony_ci
539562306a36Sopenharmony_cistatic inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
539662306a36Sopenharmony_ci{
539762306a36Sopenharmony_ci	mmap_read_unlock(mm);
539862306a36Sopenharmony_ci	if (regs && !user_mode(regs)) {
539962306a36Sopenharmony_ci		unsigned long ip = exception_ip(regs);
540062306a36Sopenharmony_ci		if (!search_exception_tables(ip))
540162306a36Sopenharmony_ci			return false;
540262306a36Sopenharmony_ci	}
540362306a36Sopenharmony_ci	return !mmap_write_lock_killable(mm);
540462306a36Sopenharmony_ci}
540562306a36Sopenharmony_ci
540662306a36Sopenharmony_ci/*
540762306a36Sopenharmony_ci * Helper for page fault handling.
540862306a36Sopenharmony_ci *
540962306a36Sopenharmony_ci * This is kind of equivalent to "mmap_read_lock()" followed
541062306a36Sopenharmony_ci * by "find_extend_vma()", except it's a lot more careful about
541162306a36Sopenharmony_ci * the locking (and will drop the lock on failure).
541262306a36Sopenharmony_ci *
541362306a36Sopenharmony_ci * For example, if we have a kernel bug that causes a page
541462306a36Sopenharmony_ci * fault, we don't want to just use mmap_read_lock() to get
541562306a36Sopenharmony_ci * the mm lock, because that would deadlock if the bug were
541662306a36Sopenharmony_ci * to happen while we're holding the mm lock for writing.
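 *
 * (Illustration of that deadlock, editor's sketch only; bad_pointer is a
 * made-up name:
 *
 *	mmap_write_lock(mm);
 *	*(int *)bad_pointer = 0;	buggy access faults here
 *
 * an unconditional mmap_read_lock() in the fault path would now block
 * forever on the rwsem we already hold for writing.)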
541762306a36Sopenharmony_ci * 541862306a36Sopenharmony_ci * So this checks the exception tables on kernel faults in 541962306a36Sopenharmony_ci * order to only do this all for instructions that are actually 542062306a36Sopenharmony_ci * expected to fault. 542162306a36Sopenharmony_ci * 542262306a36Sopenharmony_ci * We can also actually take the mm lock for writing if we 542362306a36Sopenharmony_ci * need to extend the vma, which helps the VM layer a lot. 542462306a36Sopenharmony_ci */ 542562306a36Sopenharmony_cistruct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, 542662306a36Sopenharmony_ci unsigned long addr, struct pt_regs *regs) 542762306a36Sopenharmony_ci{ 542862306a36Sopenharmony_ci struct vm_area_struct *vma; 542962306a36Sopenharmony_ci 543062306a36Sopenharmony_ci if (!get_mmap_lock_carefully(mm, regs)) 543162306a36Sopenharmony_ci return NULL; 543262306a36Sopenharmony_ci 543362306a36Sopenharmony_ci vma = find_vma(mm, addr); 543462306a36Sopenharmony_ci if (likely(vma && (vma->vm_start <= addr))) 543562306a36Sopenharmony_ci return vma; 543662306a36Sopenharmony_ci 543762306a36Sopenharmony_ci /* 543862306a36Sopenharmony_ci * Well, dang. We might still be successful, but only 543962306a36Sopenharmony_ci * if we can extend a vma to do so. 544062306a36Sopenharmony_ci */ 544162306a36Sopenharmony_ci if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { 544262306a36Sopenharmony_ci mmap_read_unlock(mm); 544362306a36Sopenharmony_ci return NULL; 544462306a36Sopenharmony_ci } 544562306a36Sopenharmony_ci 544662306a36Sopenharmony_ci /* 544762306a36Sopenharmony_ci * We can try to upgrade the mmap lock atomically, 544862306a36Sopenharmony_ci * in which case we can continue to use the vma 544962306a36Sopenharmony_ci * we already looked up. 545062306a36Sopenharmony_ci * 545162306a36Sopenharmony_ci * Otherwise we'll have to drop the mmap lock and 545262306a36Sopenharmony_ci * re-take it, and also look up the vma again, 545362306a36Sopenharmony_ci * re-checking it. 545462306a36Sopenharmony_ci */ 545562306a36Sopenharmony_ci if (!mmap_upgrade_trylock(mm)) { 545662306a36Sopenharmony_ci if (!upgrade_mmap_lock_carefully(mm, regs)) 545762306a36Sopenharmony_ci return NULL; 545862306a36Sopenharmony_ci 545962306a36Sopenharmony_ci vma = find_vma(mm, addr); 546062306a36Sopenharmony_ci if (!vma) 546162306a36Sopenharmony_ci goto fail; 546262306a36Sopenharmony_ci if (vma->vm_start <= addr) 546362306a36Sopenharmony_ci goto success; 546462306a36Sopenharmony_ci if (!(vma->vm_flags & VM_GROWSDOWN)) 546562306a36Sopenharmony_ci goto fail; 546662306a36Sopenharmony_ci } 546762306a36Sopenharmony_ci 546862306a36Sopenharmony_ci if (expand_stack_locked(vma, addr)) 546962306a36Sopenharmony_ci goto fail; 547062306a36Sopenharmony_ci 547162306a36Sopenharmony_cisuccess: 547262306a36Sopenharmony_ci mmap_write_downgrade(mm); 547362306a36Sopenharmony_ci return vma; 547462306a36Sopenharmony_ci 547562306a36Sopenharmony_cifail: 547662306a36Sopenharmony_ci mmap_write_unlock(mm); 547762306a36Sopenharmony_ci return NULL; 547862306a36Sopenharmony_ci} 547962306a36Sopenharmony_ci#endif 548062306a36Sopenharmony_ci 548162306a36Sopenharmony_ci#ifdef CONFIG_PER_VMA_LOCK 548262306a36Sopenharmony_ci/* 548362306a36Sopenharmony_ci * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be 548462306a36Sopenharmony_ci * stable and not isolated. If the VMA is not found or is being modified the 548562306a36Sopenharmony_ci * function returns NULL. 
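 *
 * Sketch of the intended calling pattern (editor's illustration only; the
 * real arch fault handlers differ in detail):
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (vma) {
 *		fault = handle_mm_fault(vma, address,
 *					flags | FAULT_FLAG_VMA_LOCK, regs);
 *		if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 *			vma_end_read(vma);
 *	} else {
 *		vma = lock_mm_and_find_vma(mm, address, regs);
 *		...				fall back to the mmap_lock path
 *	}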
548662306a36Sopenharmony_ci */ 548762306a36Sopenharmony_cistruct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, 548862306a36Sopenharmony_ci unsigned long address) 548962306a36Sopenharmony_ci{ 549062306a36Sopenharmony_ci MA_STATE(mas, &mm->mm_mt, address, address); 549162306a36Sopenharmony_ci struct vm_area_struct *vma; 549262306a36Sopenharmony_ci 549362306a36Sopenharmony_ci rcu_read_lock(); 549462306a36Sopenharmony_ciretry: 549562306a36Sopenharmony_ci vma = mas_walk(&mas); 549662306a36Sopenharmony_ci if (!vma) 549762306a36Sopenharmony_ci goto inval; 549862306a36Sopenharmony_ci 549962306a36Sopenharmony_ci if (!vma_start_read(vma)) 550062306a36Sopenharmony_ci goto inval; 550162306a36Sopenharmony_ci 550262306a36Sopenharmony_ci /* 550362306a36Sopenharmony_ci * find_mergeable_anon_vma uses adjacent vmas which are not locked. 550462306a36Sopenharmony_ci * This check must happen after vma_start_read(); otherwise, a 550562306a36Sopenharmony_ci * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA 550662306a36Sopenharmony_ci * from its anon_vma. 550762306a36Sopenharmony_ci */ 550862306a36Sopenharmony_ci if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) 550962306a36Sopenharmony_ci goto inval_end_read; 551062306a36Sopenharmony_ci 551162306a36Sopenharmony_ci /* Check since vm_start/vm_end might change before we lock the VMA */ 551262306a36Sopenharmony_ci if (unlikely(address < vma->vm_start || address >= vma->vm_end)) 551362306a36Sopenharmony_ci goto inval_end_read; 551462306a36Sopenharmony_ci 551562306a36Sopenharmony_ci /* Check if the VMA got isolated after we found it */ 551662306a36Sopenharmony_ci if (vma->detached) { 551762306a36Sopenharmony_ci vma_end_read(vma); 551862306a36Sopenharmony_ci count_vm_vma_lock_event(VMA_LOCK_MISS); 551962306a36Sopenharmony_ci /* The area was replaced with another one */ 552062306a36Sopenharmony_ci goto retry; 552162306a36Sopenharmony_ci } 552262306a36Sopenharmony_ci 552362306a36Sopenharmony_ci rcu_read_unlock(); 552462306a36Sopenharmony_ci return vma; 552562306a36Sopenharmony_ci 552662306a36Sopenharmony_ciinval_end_read: 552762306a36Sopenharmony_ci vma_end_read(vma); 552862306a36Sopenharmony_ciinval: 552962306a36Sopenharmony_ci rcu_read_unlock(); 553062306a36Sopenharmony_ci count_vm_vma_lock_event(VMA_LOCK_ABORT); 553162306a36Sopenharmony_ci return NULL; 553262306a36Sopenharmony_ci} 553362306a36Sopenharmony_ci#endif /* CONFIG_PER_VMA_LOCK */ 553462306a36Sopenharmony_ci 553562306a36Sopenharmony_ci#ifndef __PAGETABLE_P4D_FOLDED 553662306a36Sopenharmony_ci/* 553762306a36Sopenharmony_ci * Allocate p4d page table. 553862306a36Sopenharmony_ci * We've already handled the fast-path in-line. 
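 *
 * (Editor's note, sketch only: the fast path is the p4d_alloc() inline in
 * <linux/mm.h>, which roughly does
 *
 *	return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ?
 *		NULL : p4d_offset(pgd, address);
 *
 * so this slow path only runs when the pgd entry is still empty.  The same
 * allocate outside the lock, recheck under page_table_lock pattern repeats
 * in __pud_alloc() and __pmd_alloc() below.)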
553962306a36Sopenharmony_ci */ 554062306a36Sopenharmony_ciint __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 554162306a36Sopenharmony_ci{ 554262306a36Sopenharmony_ci p4d_t *new = p4d_alloc_one(mm, address); 554362306a36Sopenharmony_ci if (!new) 554462306a36Sopenharmony_ci return -ENOMEM; 554562306a36Sopenharmony_ci 554662306a36Sopenharmony_ci spin_lock(&mm->page_table_lock); 554762306a36Sopenharmony_ci if (pgd_present(*pgd)) { /* Another has populated it */ 554862306a36Sopenharmony_ci p4d_free(mm, new); 554962306a36Sopenharmony_ci } else { 555062306a36Sopenharmony_ci smp_wmb(); /* See comment in pmd_install() */ 555162306a36Sopenharmony_ci pgd_populate(mm, pgd, new); 555262306a36Sopenharmony_ci } 555362306a36Sopenharmony_ci spin_unlock(&mm->page_table_lock); 555462306a36Sopenharmony_ci return 0; 555562306a36Sopenharmony_ci} 555662306a36Sopenharmony_ci#endif /* __PAGETABLE_P4D_FOLDED */ 555762306a36Sopenharmony_ci 555862306a36Sopenharmony_ci#ifndef __PAGETABLE_PUD_FOLDED 555962306a36Sopenharmony_ci/* 556062306a36Sopenharmony_ci * Allocate page upper directory. 556162306a36Sopenharmony_ci * We've already handled the fast-path in-line. 556262306a36Sopenharmony_ci */ 556362306a36Sopenharmony_ciint __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) 556462306a36Sopenharmony_ci{ 556562306a36Sopenharmony_ci pud_t *new = pud_alloc_one(mm, address); 556662306a36Sopenharmony_ci if (!new) 556762306a36Sopenharmony_ci return -ENOMEM; 556862306a36Sopenharmony_ci 556962306a36Sopenharmony_ci spin_lock(&mm->page_table_lock); 557062306a36Sopenharmony_ci if (!p4d_present(*p4d)) { 557162306a36Sopenharmony_ci mm_inc_nr_puds(mm); 557262306a36Sopenharmony_ci smp_wmb(); /* See comment in pmd_install() */ 557362306a36Sopenharmony_ci p4d_populate(mm, p4d, new); 557462306a36Sopenharmony_ci } else /* Another has populated it */ 557562306a36Sopenharmony_ci pud_free(mm, new); 557662306a36Sopenharmony_ci spin_unlock(&mm->page_table_lock); 557762306a36Sopenharmony_ci return 0; 557862306a36Sopenharmony_ci} 557962306a36Sopenharmony_ci#endif /* __PAGETABLE_PUD_FOLDED */ 558062306a36Sopenharmony_ci 558162306a36Sopenharmony_ci#ifndef __PAGETABLE_PMD_FOLDED 558262306a36Sopenharmony_ci/* 558362306a36Sopenharmony_ci * Allocate page middle directory. 558462306a36Sopenharmony_ci * We've already handled the fast-path in-line. 
558562306a36Sopenharmony_ci */ 558662306a36Sopenharmony_ciint __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 558762306a36Sopenharmony_ci{ 558862306a36Sopenharmony_ci spinlock_t *ptl; 558962306a36Sopenharmony_ci pmd_t *new = pmd_alloc_one(mm, address); 559062306a36Sopenharmony_ci if (!new) 559162306a36Sopenharmony_ci return -ENOMEM; 559262306a36Sopenharmony_ci 559362306a36Sopenharmony_ci ptl = pud_lock(mm, pud); 559462306a36Sopenharmony_ci if (!pud_present(*pud)) { 559562306a36Sopenharmony_ci mm_inc_nr_pmds(mm); 559662306a36Sopenharmony_ci smp_wmb(); /* See comment in pmd_install() */ 559762306a36Sopenharmony_ci pud_populate(mm, pud, new); 559862306a36Sopenharmony_ci } else { /* Another has populated it */ 559962306a36Sopenharmony_ci pmd_free(mm, new); 560062306a36Sopenharmony_ci } 560162306a36Sopenharmony_ci spin_unlock(ptl); 560262306a36Sopenharmony_ci return 0; 560362306a36Sopenharmony_ci} 560462306a36Sopenharmony_ci#endif /* __PAGETABLE_PMD_FOLDED */ 560562306a36Sopenharmony_ci 560662306a36Sopenharmony_ci/** 560762306a36Sopenharmony_ci * follow_pte - look up PTE at a user virtual address 560862306a36Sopenharmony_ci * @mm: the mm_struct of the target address space 560962306a36Sopenharmony_ci * @address: user virtual address 561062306a36Sopenharmony_ci * @ptepp: location to store found PTE 561162306a36Sopenharmony_ci * @ptlp: location to store the lock for the PTE 561262306a36Sopenharmony_ci * 561362306a36Sopenharmony_ci * On a successful return, the pointer to the PTE is stored in @ptepp; 561462306a36Sopenharmony_ci * the corresponding lock is taken and its location is stored in @ptlp. 561562306a36Sopenharmony_ci * The contents of the PTE are only stable until @ptlp is released; 561662306a36Sopenharmony_ci * any further use, if any, must be protected against invalidation 561762306a36Sopenharmony_ci * with MMU notifiers. 561862306a36Sopenharmony_ci * 561962306a36Sopenharmony_ci * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore 562062306a36Sopenharmony_ci * should be taken for read. 562162306a36Sopenharmony_ci * 562262306a36Sopenharmony_ci * KVM uses this function. While it is arguably less bad than ``follow_pfn``, 562362306a36Sopenharmony_ci * it is not a good general-purpose API. 562462306a36Sopenharmony_ci * 562562306a36Sopenharmony_ci * Return: zero on success, -ve otherwise. 
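 *
 * Example (editor's sketch of a typical caller, error handling elided):
 *
 *	spinlock_t *ptl;
 *	pte_t *ptep;
 *	unsigned long pfn;
 *
 *	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
 *		return -EFAULT;
 *	pfn = pte_pfn(ptep_get(ptep));
 *	pte_unmap_unlock(ptep, ptl);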
562662306a36Sopenharmony_ci */ 562762306a36Sopenharmony_ciint follow_pte(struct mm_struct *mm, unsigned long address, 562862306a36Sopenharmony_ci pte_t **ptepp, spinlock_t **ptlp) 562962306a36Sopenharmony_ci{ 563062306a36Sopenharmony_ci pgd_t *pgd; 563162306a36Sopenharmony_ci p4d_t *p4d; 563262306a36Sopenharmony_ci pud_t *pud; 563362306a36Sopenharmony_ci pmd_t *pmd; 563462306a36Sopenharmony_ci pte_t *ptep; 563562306a36Sopenharmony_ci 563662306a36Sopenharmony_ci pgd = pgd_offset(mm, address); 563762306a36Sopenharmony_ci if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 563862306a36Sopenharmony_ci goto out; 563962306a36Sopenharmony_ci 564062306a36Sopenharmony_ci p4d = p4d_offset(pgd, address); 564162306a36Sopenharmony_ci if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d))) 564262306a36Sopenharmony_ci goto out; 564362306a36Sopenharmony_ci 564462306a36Sopenharmony_ci pud = pud_offset(p4d, address); 564562306a36Sopenharmony_ci if (pud_none(*pud) || unlikely(pud_bad(*pud))) 564662306a36Sopenharmony_ci goto out; 564762306a36Sopenharmony_ci 564862306a36Sopenharmony_ci pmd = pmd_offset(pud, address); 564962306a36Sopenharmony_ci VM_BUG_ON(pmd_trans_huge(*pmd)); 565062306a36Sopenharmony_ci 565162306a36Sopenharmony_ci ptep = pte_offset_map_lock(mm, pmd, address, ptlp); 565262306a36Sopenharmony_ci if (!ptep) 565362306a36Sopenharmony_ci goto out; 565462306a36Sopenharmony_ci if (!pte_present(ptep_get(ptep))) 565562306a36Sopenharmony_ci goto unlock; 565662306a36Sopenharmony_ci *ptepp = ptep; 565762306a36Sopenharmony_ci return 0; 565862306a36Sopenharmony_ciunlock: 565962306a36Sopenharmony_ci pte_unmap_unlock(ptep, *ptlp); 566062306a36Sopenharmony_ciout: 566162306a36Sopenharmony_ci return -EINVAL; 566262306a36Sopenharmony_ci} 566362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(follow_pte); 566462306a36Sopenharmony_ci 566562306a36Sopenharmony_ci/** 566662306a36Sopenharmony_ci * follow_pfn - look up PFN at a user virtual address 566762306a36Sopenharmony_ci * @vma: memory mapping 566862306a36Sopenharmony_ci * @address: user virtual address 566962306a36Sopenharmony_ci * @pfn: location to store found PFN 567062306a36Sopenharmony_ci * 567162306a36Sopenharmony_ci * Only IO mappings and raw PFN mappings are allowed. 567262306a36Sopenharmony_ci * 567362306a36Sopenharmony_ci * This function does not allow the caller to read the permissions 567462306a36Sopenharmony_ci * of the PTE. Do not use it. 567562306a36Sopenharmony_ci * 567662306a36Sopenharmony_ci * Return: zero and the pfn at @pfn on success, -ve otherwise. 
567762306a36Sopenharmony_ci */ 567862306a36Sopenharmony_ciint follow_pfn(struct vm_area_struct *vma, unsigned long address, 567962306a36Sopenharmony_ci unsigned long *pfn) 568062306a36Sopenharmony_ci{ 568162306a36Sopenharmony_ci int ret = -EINVAL; 568262306a36Sopenharmony_ci spinlock_t *ptl; 568362306a36Sopenharmony_ci pte_t *ptep; 568462306a36Sopenharmony_ci 568562306a36Sopenharmony_ci if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 568662306a36Sopenharmony_ci return ret; 568762306a36Sopenharmony_ci 568862306a36Sopenharmony_ci ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); 568962306a36Sopenharmony_ci if (ret) 569062306a36Sopenharmony_ci return ret; 569162306a36Sopenharmony_ci *pfn = pte_pfn(ptep_get(ptep)); 569262306a36Sopenharmony_ci pte_unmap_unlock(ptep, ptl); 569362306a36Sopenharmony_ci return 0; 569462306a36Sopenharmony_ci} 569562306a36Sopenharmony_ciEXPORT_SYMBOL(follow_pfn); 569662306a36Sopenharmony_ci 569762306a36Sopenharmony_ci#ifdef CONFIG_HAVE_IOREMAP_PROT 569862306a36Sopenharmony_ciint follow_phys(struct vm_area_struct *vma, 569962306a36Sopenharmony_ci unsigned long address, unsigned int flags, 570062306a36Sopenharmony_ci unsigned long *prot, resource_size_t *phys) 570162306a36Sopenharmony_ci{ 570262306a36Sopenharmony_ci int ret = -EINVAL; 570362306a36Sopenharmony_ci pte_t *ptep, pte; 570462306a36Sopenharmony_ci spinlock_t *ptl; 570562306a36Sopenharmony_ci 570662306a36Sopenharmony_ci if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 570762306a36Sopenharmony_ci goto out; 570862306a36Sopenharmony_ci 570962306a36Sopenharmony_ci if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) 571062306a36Sopenharmony_ci goto out; 571162306a36Sopenharmony_ci pte = ptep_get(ptep); 571262306a36Sopenharmony_ci 571362306a36Sopenharmony_ci if ((flags & FOLL_WRITE) && !pte_write(pte)) 571462306a36Sopenharmony_ci goto unlock; 571562306a36Sopenharmony_ci 571662306a36Sopenharmony_ci *prot = pgprot_val(pte_pgprot(pte)); 571762306a36Sopenharmony_ci *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; 571862306a36Sopenharmony_ci 571962306a36Sopenharmony_ci ret = 0; 572062306a36Sopenharmony_ciunlock: 572162306a36Sopenharmony_ci pte_unmap_unlock(ptep, ptl); 572262306a36Sopenharmony_ciout: 572362306a36Sopenharmony_ci return ret; 572462306a36Sopenharmony_ci} 572562306a36Sopenharmony_ci 572662306a36Sopenharmony_ci/** 572762306a36Sopenharmony_ci * generic_access_phys - generic implementation for iomem mmap access 572862306a36Sopenharmony_ci * @vma: the vma to access 572962306a36Sopenharmony_ci * @addr: userspace address, not relative offset within @vma 573062306a36Sopenharmony_ci * @buf: buffer to read/write 573162306a36Sopenharmony_ci * @len: length of transfer 573262306a36Sopenharmony_ci * @write: set to FOLL_WRITE when writing, otherwise reading 573362306a36Sopenharmony_ci * 573462306a36Sopenharmony_ci * This is a generic implementation for &vm_operations_struct.access for an 573562306a36Sopenharmony_ci * iomem mapping. This callback is used by access_process_vm() when the @vma is 573662306a36Sopenharmony_ci * not page based. 
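 *
 * Example (editor's sketch): a driver that mmaps MMIO registers can make
 * them reachable from ptrace()/access_process_vm() by using this as the
 * access handler of its (hypothetical) vm_ops:
 *
 *	static const struct vm_operations_struct foo_mmio_vm_ops = {
 *		.access	= generic_access_phys,
 *	};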
573762306a36Sopenharmony_ci */ 573862306a36Sopenharmony_ciint generic_access_phys(struct vm_area_struct *vma, unsigned long addr, 573962306a36Sopenharmony_ci void *buf, int len, int write) 574062306a36Sopenharmony_ci{ 574162306a36Sopenharmony_ci resource_size_t phys_addr; 574262306a36Sopenharmony_ci unsigned long prot = 0; 574362306a36Sopenharmony_ci void __iomem *maddr; 574462306a36Sopenharmony_ci pte_t *ptep, pte; 574562306a36Sopenharmony_ci spinlock_t *ptl; 574662306a36Sopenharmony_ci int offset = offset_in_page(addr); 574762306a36Sopenharmony_ci int ret = -EINVAL; 574862306a36Sopenharmony_ci 574962306a36Sopenharmony_ci if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 575062306a36Sopenharmony_ci return -EINVAL; 575162306a36Sopenharmony_ci 575262306a36Sopenharmony_ciretry: 575362306a36Sopenharmony_ci if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) 575462306a36Sopenharmony_ci return -EINVAL; 575562306a36Sopenharmony_ci pte = ptep_get(ptep); 575662306a36Sopenharmony_ci pte_unmap_unlock(ptep, ptl); 575762306a36Sopenharmony_ci 575862306a36Sopenharmony_ci prot = pgprot_val(pte_pgprot(pte)); 575962306a36Sopenharmony_ci phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; 576062306a36Sopenharmony_ci 576162306a36Sopenharmony_ci if ((write & FOLL_WRITE) && !pte_write(pte)) 576262306a36Sopenharmony_ci return -EINVAL; 576362306a36Sopenharmony_ci 576462306a36Sopenharmony_ci maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); 576562306a36Sopenharmony_ci if (!maddr) 576662306a36Sopenharmony_ci return -ENOMEM; 576762306a36Sopenharmony_ci 576862306a36Sopenharmony_ci if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) 576962306a36Sopenharmony_ci goto out_unmap; 577062306a36Sopenharmony_ci 577162306a36Sopenharmony_ci if (!pte_same(pte, ptep_get(ptep))) { 577262306a36Sopenharmony_ci pte_unmap_unlock(ptep, ptl); 577362306a36Sopenharmony_ci iounmap(maddr); 577462306a36Sopenharmony_ci 577562306a36Sopenharmony_ci goto retry; 577662306a36Sopenharmony_ci } 577762306a36Sopenharmony_ci 577862306a36Sopenharmony_ci if (write) 577962306a36Sopenharmony_ci memcpy_toio(maddr + offset, buf, len); 578062306a36Sopenharmony_ci else 578162306a36Sopenharmony_ci memcpy_fromio(buf, maddr + offset, len); 578262306a36Sopenharmony_ci ret = len; 578362306a36Sopenharmony_ci pte_unmap_unlock(ptep, ptl); 578462306a36Sopenharmony_ciout_unmap: 578562306a36Sopenharmony_ci iounmap(maddr); 578662306a36Sopenharmony_ci 578762306a36Sopenharmony_ci return ret; 578862306a36Sopenharmony_ci} 578962306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(generic_access_phys); 579062306a36Sopenharmony_ci#endif 579162306a36Sopenharmony_ci 579262306a36Sopenharmony_ci/* 579362306a36Sopenharmony_ci * Access another process' address space as given in mm. 
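 *
 * Editor's sketch of a caller (assumes @mm was already pinned, e.g. via
 * get_task_mm()); the return value is the number of bytes actually
 * transferred and may be short, with 0 meaning nothing was accessible:
 *
 *	char kbuf[64];
 *	int copied = __access_remote_vm(mm, addr, kbuf, sizeof(kbuf), 0);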
579462306a36Sopenharmony_ci */ 579562306a36Sopenharmony_ciint __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, 579662306a36Sopenharmony_ci int len, unsigned int gup_flags) 579762306a36Sopenharmony_ci{ 579862306a36Sopenharmony_ci void *old_buf = buf; 579962306a36Sopenharmony_ci int write = gup_flags & FOLL_WRITE; 580062306a36Sopenharmony_ci 580162306a36Sopenharmony_ci if (mmap_read_lock_killable(mm)) 580262306a36Sopenharmony_ci return 0; 580362306a36Sopenharmony_ci 580462306a36Sopenharmony_ci /* Untag the address before looking up the VMA */ 580562306a36Sopenharmony_ci addr = untagged_addr_remote(mm, addr); 580662306a36Sopenharmony_ci 580762306a36Sopenharmony_ci /* Avoid triggering the temporary warning in __get_user_pages */ 580862306a36Sopenharmony_ci if (!vma_lookup(mm, addr) && !expand_stack(mm, addr)) 580962306a36Sopenharmony_ci return 0; 581062306a36Sopenharmony_ci 581162306a36Sopenharmony_ci /* ignore errors, just check how much was successfully transferred */ 581262306a36Sopenharmony_ci while (len) { 581362306a36Sopenharmony_ci int bytes, offset; 581462306a36Sopenharmony_ci void *maddr; 581562306a36Sopenharmony_ci struct vm_area_struct *vma = NULL; 581662306a36Sopenharmony_ci struct page *page = get_user_page_vma_remote(mm, addr, 581762306a36Sopenharmony_ci gup_flags, &vma); 581862306a36Sopenharmony_ci 581962306a36Sopenharmony_ci if (IS_ERR_OR_NULL(page)) { 582062306a36Sopenharmony_ci /* We might need to expand the stack to access it */ 582162306a36Sopenharmony_ci vma = vma_lookup(mm, addr); 582262306a36Sopenharmony_ci if (!vma) { 582362306a36Sopenharmony_ci vma = expand_stack(mm, addr); 582462306a36Sopenharmony_ci 582562306a36Sopenharmony_ci /* mmap_lock was dropped on failure */ 582662306a36Sopenharmony_ci if (!vma) 582762306a36Sopenharmony_ci return buf - old_buf; 582862306a36Sopenharmony_ci 582962306a36Sopenharmony_ci /* Try again if stack expansion worked */ 583062306a36Sopenharmony_ci continue; 583162306a36Sopenharmony_ci } 583262306a36Sopenharmony_ci 583362306a36Sopenharmony_ci 583462306a36Sopenharmony_ci /* 583562306a36Sopenharmony_ci * Check if this is a VM_IO | VM_PFNMAP VMA, which 583662306a36Sopenharmony_ci * we can access using slightly different code. 
583762306a36Sopenharmony_ci */ 583862306a36Sopenharmony_ci bytes = 0; 583962306a36Sopenharmony_ci#ifdef CONFIG_HAVE_IOREMAP_PROT 584062306a36Sopenharmony_ci if (vma->vm_ops && vma->vm_ops->access) 584162306a36Sopenharmony_ci bytes = vma->vm_ops->access(vma, addr, buf, 584262306a36Sopenharmony_ci len, write); 584362306a36Sopenharmony_ci#endif 584462306a36Sopenharmony_ci if (bytes <= 0) 584562306a36Sopenharmony_ci break; 584662306a36Sopenharmony_ci } else { 584762306a36Sopenharmony_ci bytes = len; 584862306a36Sopenharmony_ci offset = addr & (PAGE_SIZE-1); 584962306a36Sopenharmony_ci if (bytes > PAGE_SIZE-offset) 585062306a36Sopenharmony_ci bytes = PAGE_SIZE-offset; 585162306a36Sopenharmony_ci 585262306a36Sopenharmony_ci maddr = kmap(page); 585362306a36Sopenharmony_ci if (write) { 585462306a36Sopenharmony_ci copy_to_user_page(vma, page, addr, 585562306a36Sopenharmony_ci maddr + offset, buf, bytes); 585662306a36Sopenharmony_ci set_page_dirty_lock(page); 585762306a36Sopenharmony_ci } else { 585862306a36Sopenharmony_ci copy_from_user_page(vma, page, addr, 585962306a36Sopenharmony_ci buf, maddr + offset, bytes); 586062306a36Sopenharmony_ci } 586162306a36Sopenharmony_ci kunmap(page); 586262306a36Sopenharmony_ci put_page(page); 586362306a36Sopenharmony_ci } 586462306a36Sopenharmony_ci len -= bytes; 586562306a36Sopenharmony_ci buf += bytes; 586662306a36Sopenharmony_ci addr += bytes; 586762306a36Sopenharmony_ci } 586862306a36Sopenharmony_ci mmap_read_unlock(mm); 586962306a36Sopenharmony_ci 587062306a36Sopenharmony_ci return buf - old_buf; 587162306a36Sopenharmony_ci} 587262306a36Sopenharmony_ci 587362306a36Sopenharmony_ci/** 587462306a36Sopenharmony_ci * access_remote_vm - access another process' address space 587562306a36Sopenharmony_ci * @mm: the mm_struct of the target address space 587662306a36Sopenharmony_ci * @addr: start address to access 587762306a36Sopenharmony_ci * @buf: source or destination buffer 587862306a36Sopenharmony_ci * @len: number of bytes to transfer 587962306a36Sopenharmony_ci * @gup_flags: flags modifying lookup behaviour 588062306a36Sopenharmony_ci * 588162306a36Sopenharmony_ci * The caller must hold a reference on @mm. 588262306a36Sopenharmony_ci * 588362306a36Sopenharmony_ci * Return: number of bytes copied from source to destination. 588462306a36Sopenharmony_ci */ 588562306a36Sopenharmony_ciint access_remote_vm(struct mm_struct *mm, unsigned long addr, 588662306a36Sopenharmony_ci void *buf, int len, unsigned int gup_flags) 588762306a36Sopenharmony_ci{ 588862306a36Sopenharmony_ci return __access_remote_vm(mm, addr, buf, len, gup_flags); 588962306a36Sopenharmony_ci} 589062306a36Sopenharmony_ci 589162306a36Sopenharmony_ci/* 589262306a36Sopenharmony_ci * Access another process' address space. 
589362306a36Sopenharmony_ci * Source/target buffer must be kernel space,
589462306a36Sopenharmony_ci * Do not walk the page table directly, use get_user_pages
589562306a36Sopenharmony_ci */
589662306a36Sopenharmony_ciint access_process_vm(struct task_struct *tsk, unsigned long addr,
589762306a36Sopenharmony_ci		void *buf, int len, unsigned int gup_flags)
589862306a36Sopenharmony_ci{
589962306a36Sopenharmony_ci	struct mm_struct *mm;
590062306a36Sopenharmony_ci	int ret;
590162306a36Sopenharmony_ci
590262306a36Sopenharmony_ci	mm = get_task_mm(tsk);
590362306a36Sopenharmony_ci	if (!mm)
590462306a36Sopenharmony_ci		return 0;
590562306a36Sopenharmony_ci
590662306a36Sopenharmony_ci	ret = __access_remote_vm(mm, addr, buf, len, gup_flags);
590762306a36Sopenharmony_ci
590862306a36Sopenharmony_ci	mmput(mm);
590962306a36Sopenharmony_ci
591062306a36Sopenharmony_ci	return ret;
591162306a36Sopenharmony_ci}
591262306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(access_process_vm);
591362306a36Sopenharmony_ci
591462306a36Sopenharmony_ci/*
591562306a36Sopenharmony_ci * Print the name of a VMA.
591662306a36Sopenharmony_ci */
591762306a36Sopenharmony_civoid print_vma_addr(char *prefix, unsigned long ip)
591862306a36Sopenharmony_ci{
591962306a36Sopenharmony_ci	struct mm_struct *mm = current->mm;
592062306a36Sopenharmony_ci	struct vm_area_struct *vma;
592162306a36Sopenharmony_ci
592262306a36Sopenharmony_ci	/*
592362306a36Sopenharmony_ci	 * we might be running from an atomic context so we cannot sleep
592462306a36Sopenharmony_ci	 */
592562306a36Sopenharmony_ci	if (!mmap_read_trylock(mm))
592662306a36Sopenharmony_ci		return;
592762306a36Sopenharmony_ci
592862306a36Sopenharmony_ci	vma = find_vma(mm, ip);
592962306a36Sopenharmony_ci	if (vma && vma->vm_file) {
593062306a36Sopenharmony_ci		struct file *f = vma->vm_file;
593162306a36Sopenharmony_ci		char *buf = (char *)__get_free_page(GFP_NOWAIT);
593262306a36Sopenharmony_ci		if (buf) {
593362306a36Sopenharmony_ci			char *p;
593462306a36Sopenharmony_ci
593562306a36Sopenharmony_ci			p = file_path(f, buf, PAGE_SIZE);
593662306a36Sopenharmony_ci			if (IS_ERR(p))
593762306a36Sopenharmony_ci				p = "?";
593862306a36Sopenharmony_ci			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
593962306a36Sopenharmony_ci					vma->vm_start,
594062306a36Sopenharmony_ci					vma->vm_end - vma->vm_start);
594162306a36Sopenharmony_ci			free_page((unsigned long)buf);
594262306a36Sopenharmony_ci		}
594362306a36Sopenharmony_ci	}
594462306a36Sopenharmony_ci	mmap_read_unlock(mm);
594562306a36Sopenharmony_ci}
594662306a36Sopenharmony_ci
594762306a36Sopenharmony_ci#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
594862306a36Sopenharmony_civoid __might_fault(const char *file, int line)
594962306a36Sopenharmony_ci{
595062306a36Sopenharmony_ci	if (pagefault_disabled())
595162306a36Sopenharmony_ci		return;
595262306a36Sopenharmony_ci	__might_sleep(file, line);
595362306a36Sopenharmony_ci#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
595462306a36Sopenharmony_ci	if (current->mm)
595562306a36Sopenharmony_ci		might_lock_read(&current->mm->mmap_lock);
595662306a36Sopenharmony_ci#endif
595762306a36Sopenharmony_ci}
595862306a36Sopenharmony_ciEXPORT_SYMBOL(__might_fault);
595962306a36Sopenharmony_ci#endif
596062306a36Sopenharmony_ci
596162306a36Sopenharmony_ci#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
596262306a36Sopenharmony_ci/*
596362306a36Sopenharmony_ci * Process all subpages of the specified huge page with the specified
596462306a36Sopenharmony_ci * operation.
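 * For a worked example (editor's illustration): with pages_per_huge_page == 8
 * and addr_hint falling into subpage 2, the loops below visit the subpages
 * in the order 7, 6, 5, 4, then 0, 3, 1, 2.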
The target subpage will be processed last to keep its
596562306a36Sopenharmony_ci * cache lines hot.
596662306a36Sopenharmony_ci */
596762306a36Sopenharmony_cistatic inline int process_huge_page(
596862306a36Sopenharmony_ci	unsigned long addr_hint, unsigned int pages_per_huge_page,
596962306a36Sopenharmony_ci	int (*process_subpage)(unsigned long addr, int idx, void *arg),
597062306a36Sopenharmony_ci	void *arg)
597162306a36Sopenharmony_ci{
597262306a36Sopenharmony_ci	int i, n, base, l, ret;
597362306a36Sopenharmony_ci	unsigned long addr = addr_hint &
597462306a36Sopenharmony_ci		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
597562306a36Sopenharmony_ci
597662306a36Sopenharmony_ci	/* Process target subpage last to keep its cache lines hot */
597762306a36Sopenharmony_ci	might_sleep();
597862306a36Sopenharmony_ci	n = (addr_hint - addr) / PAGE_SIZE;
597962306a36Sopenharmony_ci	if (2 * n <= pages_per_huge_page) {
598062306a36Sopenharmony_ci		/* If target subpage in first half of huge page */
598162306a36Sopenharmony_ci		base = 0;
598262306a36Sopenharmony_ci		l = n;
598362306a36Sopenharmony_ci		/* Process subpages at the end of huge page */
598462306a36Sopenharmony_ci		for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
598562306a36Sopenharmony_ci			cond_resched();
598662306a36Sopenharmony_ci			ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
598762306a36Sopenharmony_ci			if (ret)
598862306a36Sopenharmony_ci				return ret;
598962306a36Sopenharmony_ci		}
599062306a36Sopenharmony_ci	} else {
599162306a36Sopenharmony_ci		/* If target subpage in second half of huge page */
599262306a36Sopenharmony_ci		base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
599362306a36Sopenharmony_ci		l = pages_per_huge_page - n;
599462306a36Sopenharmony_ci		/* Process subpages at the beginning of huge page */
599562306a36Sopenharmony_ci		for (i = 0; i < base; i++) {
599662306a36Sopenharmony_ci			cond_resched();
599762306a36Sopenharmony_ci			ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
599862306a36Sopenharmony_ci			if (ret)
599962306a36Sopenharmony_ci				return ret;
600062306a36Sopenharmony_ci		}
600162306a36Sopenharmony_ci	}
600262306a36Sopenharmony_ci	/*
600362306a36Sopenharmony_ci	 * Process remaining subpages in left-right-left-right pattern
600462306a36Sopenharmony_ci	 * towards the target subpage
600562306a36Sopenharmony_ci	 */
600662306a36Sopenharmony_ci	for (i = 0; i < l; i++) {
600762306a36Sopenharmony_ci		int left_idx = base + i;
600862306a36Sopenharmony_ci		int right_idx = base + 2 * l - 1 - i;
600962306a36Sopenharmony_ci
601062306a36Sopenharmony_ci		cond_resched();
601162306a36Sopenharmony_ci		ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
601262306a36Sopenharmony_ci		if (ret)
601362306a36Sopenharmony_ci			return ret;
601462306a36Sopenharmony_ci		cond_resched();
601562306a36Sopenharmony_ci		ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
601662306a36Sopenharmony_ci		if (ret)
601762306a36Sopenharmony_ci			return ret;
601862306a36Sopenharmony_ci	}
601962306a36Sopenharmony_ci	return 0;
602062306a36Sopenharmony_ci}
602162306a36Sopenharmony_ci
602262306a36Sopenharmony_cistatic void clear_gigantic_page(struct page *page,
602362306a36Sopenharmony_ci				unsigned long addr,
602462306a36Sopenharmony_ci				unsigned int pages_per_huge_page)
602562306a36Sopenharmony_ci{
602662306a36Sopenharmony_ci	int i;
602762306a36Sopenharmony_ci	struct page *p;
602862306a36Sopenharmony_ci
602962306a36Sopenharmony_ci	might_sleep();
603062306a36Sopenharmony_ci	for (i = 0; i < pages_per_huge_page; i++) {
603162306a36Sopenharmony_ci p = nth_page(page, i); 603262306a36Sopenharmony_ci cond_resched(); 603362306a36Sopenharmony_ci clear_user_highpage(p, addr + i * PAGE_SIZE); 603462306a36Sopenharmony_ci } 603562306a36Sopenharmony_ci} 603662306a36Sopenharmony_ci 603762306a36Sopenharmony_cistatic int clear_subpage(unsigned long addr, int idx, void *arg) 603862306a36Sopenharmony_ci{ 603962306a36Sopenharmony_ci struct page *page = arg; 604062306a36Sopenharmony_ci 604162306a36Sopenharmony_ci clear_user_highpage(page + idx, addr); 604262306a36Sopenharmony_ci return 0; 604362306a36Sopenharmony_ci} 604462306a36Sopenharmony_ci 604562306a36Sopenharmony_civoid clear_huge_page(struct page *page, 604662306a36Sopenharmony_ci unsigned long addr_hint, unsigned int pages_per_huge_page) 604762306a36Sopenharmony_ci{ 604862306a36Sopenharmony_ci unsigned long addr = addr_hint & 604962306a36Sopenharmony_ci ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); 605062306a36Sopenharmony_ci 605162306a36Sopenharmony_ci if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { 605262306a36Sopenharmony_ci clear_gigantic_page(page, addr, pages_per_huge_page); 605362306a36Sopenharmony_ci return; 605462306a36Sopenharmony_ci } 605562306a36Sopenharmony_ci 605662306a36Sopenharmony_ci process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page); 605762306a36Sopenharmony_ci} 605862306a36Sopenharmony_ci 605962306a36Sopenharmony_cistatic int copy_user_gigantic_page(struct folio *dst, struct folio *src, 606062306a36Sopenharmony_ci unsigned long addr, 606162306a36Sopenharmony_ci struct vm_area_struct *vma, 606262306a36Sopenharmony_ci unsigned int pages_per_huge_page) 606362306a36Sopenharmony_ci{ 606462306a36Sopenharmony_ci int i; 606562306a36Sopenharmony_ci struct page *dst_page; 606662306a36Sopenharmony_ci struct page *src_page; 606762306a36Sopenharmony_ci 606862306a36Sopenharmony_ci for (i = 0; i < pages_per_huge_page; i++) { 606962306a36Sopenharmony_ci dst_page = folio_page(dst, i); 607062306a36Sopenharmony_ci src_page = folio_page(src, i); 607162306a36Sopenharmony_ci 607262306a36Sopenharmony_ci cond_resched(); 607362306a36Sopenharmony_ci if (copy_mc_user_highpage(dst_page, src_page, 607462306a36Sopenharmony_ci addr + i*PAGE_SIZE, vma)) { 607562306a36Sopenharmony_ci memory_failure_queue(page_to_pfn(src_page), 0); 607662306a36Sopenharmony_ci return -EHWPOISON; 607762306a36Sopenharmony_ci } 607862306a36Sopenharmony_ci } 607962306a36Sopenharmony_ci return 0; 608062306a36Sopenharmony_ci} 608162306a36Sopenharmony_ci 608262306a36Sopenharmony_cistruct copy_subpage_arg { 608362306a36Sopenharmony_ci struct page *dst; 608462306a36Sopenharmony_ci struct page *src; 608562306a36Sopenharmony_ci struct vm_area_struct *vma; 608662306a36Sopenharmony_ci}; 608762306a36Sopenharmony_ci 608862306a36Sopenharmony_cistatic int copy_subpage(unsigned long addr, int idx, void *arg) 608962306a36Sopenharmony_ci{ 609062306a36Sopenharmony_ci struct copy_subpage_arg *copy_arg = arg; 609162306a36Sopenharmony_ci 609262306a36Sopenharmony_ci if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx, 609362306a36Sopenharmony_ci addr, copy_arg->vma)) { 609462306a36Sopenharmony_ci memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0); 609562306a36Sopenharmony_ci return -EHWPOISON; 609662306a36Sopenharmony_ci } 609762306a36Sopenharmony_ci return 0; 609862306a36Sopenharmony_ci} 609962306a36Sopenharmony_ci 610062306a36Sopenharmony_ciint copy_user_large_folio(struct folio *dst, struct folio *src, 610162306a36Sopenharmony_ci unsigned long addr_hint, 
struct vm_area_struct *vma) 610262306a36Sopenharmony_ci{ 610362306a36Sopenharmony_ci unsigned int pages_per_huge_page = folio_nr_pages(dst); 610462306a36Sopenharmony_ci unsigned long addr = addr_hint & 610562306a36Sopenharmony_ci ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); 610662306a36Sopenharmony_ci struct copy_subpage_arg arg = { 610762306a36Sopenharmony_ci .dst = &dst->page, 610862306a36Sopenharmony_ci .src = &src->page, 610962306a36Sopenharmony_ci .vma = vma, 611062306a36Sopenharmony_ci }; 611162306a36Sopenharmony_ci 611262306a36Sopenharmony_ci if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) 611362306a36Sopenharmony_ci return copy_user_gigantic_page(dst, src, addr, vma, 611462306a36Sopenharmony_ci pages_per_huge_page); 611562306a36Sopenharmony_ci 611662306a36Sopenharmony_ci return process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg); 611762306a36Sopenharmony_ci} 611862306a36Sopenharmony_ci 611962306a36Sopenharmony_cilong copy_folio_from_user(struct folio *dst_folio, 612062306a36Sopenharmony_ci const void __user *usr_src, 612162306a36Sopenharmony_ci bool allow_pagefault) 612262306a36Sopenharmony_ci{ 612362306a36Sopenharmony_ci void *kaddr; 612462306a36Sopenharmony_ci unsigned long i, rc = 0; 612562306a36Sopenharmony_ci unsigned int nr_pages = folio_nr_pages(dst_folio); 612662306a36Sopenharmony_ci unsigned long ret_val = nr_pages * PAGE_SIZE; 612762306a36Sopenharmony_ci struct page *subpage; 612862306a36Sopenharmony_ci 612962306a36Sopenharmony_ci for (i = 0; i < nr_pages; i++) { 613062306a36Sopenharmony_ci subpage = folio_page(dst_folio, i); 613162306a36Sopenharmony_ci kaddr = kmap_local_page(subpage); 613262306a36Sopenharmony_ci if (!allow_pagefault) 613362306a36Sopenharmony_ci pagefault_disable(); 613462306a36Sopenharmony_ci rc = copy_from_user(kaddr, usr_src + i * PAGE_SIZE, PAGE_SIZE); 613562306a36Sopenharmony_ci if (!allow_pagefault) 613662306a36Sopenharmony_ci pagefault_enable(); 613762306a36Sopenharmony_ci kunmap_local(kaddr); 613862306a36Sopenharmony_ci 613962306a36Sopenharmony_ci ret_val -= (PAGE_SIZE - rc); 614062306a36Sopenharmony_ci if (rc) 614162306a36Sopenharmony_ci break; 614262306a36Sopenharmony_ci 614362306a36Sopenharmony_ci flush_dcache_page(subpage); 614462306a36Sopenharmony_ci 614562306a36Sopenharmony_ci cond_resched(); 614662306a36Sopenharmony_ci } 614762306a36Sopenharmony_ci return ret_val; 614862306a36Sopenharmony_ci} 614962306a36Sopenharmony_ci#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 615062306a36Sopenharmony_ci 615162306a36Sopenharmony_ci#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS 615262306a36Sopenharmony_ci 615362306a36Sopenharmony_cistatic struct kmem_cache *page_ptl_cachep; 615462306a36Sopenharmony_ci 615562306a36Sopenharmony_civoid __init ptlock_cache_init(void) 615662306a36Sopenharmony_ci{ 615762306a36Sopenharmony_ci page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, 615862306a36Sopenharmony_ci SLAB_PANIC, NULL); 615962306a36Sopenharmony_ci} 616062306a36Sopenharmony_ci 616162306a36Sopenharmony_cibool ptlock_alloc(struct ptdesc *ptdesc) 616262306a36Sopenharmony_ci{ 616362306a36Sopenharmony_ci spinlock_t *ptl; 616462306a36Sopenharmony_ci 616562306a36Sopenharmony_ci ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); 616662306a36Sopenharmony_ci if (!ptl) 616762306a36Sopenharmony_ci return false; 616862306a36Sopenharmony_ci ptdesc->ptl = ptl; 616962306a36Sopenharmony_ci return true; 617062306a36Sopenharmony_ci} 617162306a36Sopenharmony_ci 617262306a36Sopenharmony_civoid 
ptlock_free(struct ptdesc *ptdesc) 617362306a36Sopenharmony_ci{ 617462306a36Sopenharmony_ci kmem_cache_free(page_ptl_cachep, ptdesc->ptl); 617562306a36Sopenharmony_ci} 617662306a36Sopenharmony_ci#endif 6177
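
/*
 * Editor's note (sketch, not part of the original file): ptlock_alloc() and
 * ptlock_free() above are only used when a spinlock_t is too big to be
 * embedded in the page table's struct ptdesc (ALLOC_SPLIT_PTLOCKS).  Their
 * caller lives in <linux/mm.h>; roughly:
 *
 *	static inline bool ptlock_init(struct ptdesc *ptdesc)
 *	{
 *		if (!ptlock_alloc(ptdesc))
 *			return false;
 *		spin_lock_init(ptlock_ptr(ptdesc));
 *		return true;
 *	}
 *
 * with ptlock_free() called from the corresponding page table destructor.
 */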