// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/kmsan.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/memory-tiers.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
#ifdef CONFIG_MEM_PURGEABLE
#include <linux/mm_purgeable.h>
#endif
#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"
#include "swap.h"

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NUMA
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

static vm_fault_t do_fault(struct vm_fault *vmf);
static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
static bool vmf_pte_changed(struct vm_fault *vmf);

/*
 * Return true if the original pte was a uffd-wp pte marker (so the pte was
 * wr-protected).
 */
static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
{
	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
		return false;

	return pte_marker_uffd_wp(vmf->orig_pte);
}

/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
	/*
	 * Transitioning a PTE from 'old' to 'young' can be expensive on
	 * some architectures, even if it's performed in hardware. By
	 * default, "false" means prefaulted entries will be 'young'.
	 */
	return false;
}
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
early_initcall(init_zero_pfn);

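/*
 * Out-of-line hook so callers of the inline mm counter helpers can hit the
 * rss_stat tracepoint without pulling the tracepoint header into mm.h.
 */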
void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
	trace_rss_stat(mm, member);
}

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
}

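/*
 * Free the pte-level tables in [addr, end) under this pud, then free the pmd
 * table itself if nothing in [floor, ceiling) still needs it.  The pud, p4d
 * and pgd variants below follow the same pattern one level up.
 */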
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= P4D_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= P4D_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(p4d, start);
	p4d_clear(p4d);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long start;

	start = addr;
	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
	} while (p4d++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	p4d = p4d_offset(pgd, start);
	pgd_clear(pgd);
	p4d_free_tlb(tlb, p4d, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;
	/*
	 * We add page table cache pages with PAGE_SIZE,
	 * (see pte_free_tlb()), flush the tlb if we need
	 */
	tlb_change_page_size(tlb, PAGE_SIZE);
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
		   struct vm_area_struct *vma, unsigned long floor,
		   unsigned long ceiling, bool mm_wr_locked)
{
	do {
		unsigned long addr = vma->vm_start;
		struct vm_area_struct *next;

		/*
		 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
		 * be 0.  This will underflow and is okay.
		 */
		next = mas_find(mas, ceiling - 1);

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		if (mm_wr_locked)
			vma_start_write(vma);
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = mas_find(mas, ceiling - 1);
				if (mm_wr_locked)
					vma_start_write(vma);
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		}
		vma = next;
	} while (vma);
}

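/*
 * Install a preallocated pte page into *pmd if it is still empty.  On
 * success the caller's reference is consumed and *pte is set to NULL;
 * otherwise the caller keeps ownership and must free the page itself.
 */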
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
{
	spinlock_t *ptl = pmd_lock(mm, pmd);

	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm_inc_nr_ptes(mm);
		/*
		 * Ensure all pte setup (eg. pte page lock and page clearing) are
		 * visible before the pte is made visible to other CPUs by being
		 * put into page tables.
		 *
		 * The other side of the story is the pointer chasing in the page
		 * table walking code (when walking the page table without locking;
		 * ie. most of the time). Fortunately, these data accesses consist
		 * of a chain of data-dependent loads, meaning most CPUs (alpha
		 * being the notable exception) will already guarantee loads are
		 * seen in-order. See the alpha page table accessors for the
		 * smp_rmb() barriers in page table walking code.
		 */
		smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
		pmd_populate(mm, pmd, *pte);
		*pte = NULL;
	}
	spin_unlock(ptl);
}

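/*
 * Allocate a pte page for a user pmd and try to install it.  If another
 * thread populated the pmd first, pmd_install() leaves 'new' untouched and
 * we free it again below.
 */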
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
	pgtable_t new = pte_alloc_one(mm);
	if (!new)
		return -ENOMEM;

	pmd_install(mm, pmd, &new);
	if (new)
		pte_free(mm, new);
	return 0;
}

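/* Same as __pte_alloc(), but for kernel page tables in init_mm. */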
int __pte_alloc_kernel(pmd_t *pmd)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm);
	if (!new)
		return -ENOMEM;

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		smp_wmb(); /* See comment in pmd_install() */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

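/*
 * RSS deltas are accumulated in a local vector (one slot per MM counter)
 * and folded into the mm's counters in one go by add_mm_rss_vec(), instead
 * of touching the per-mm counters for every single pte.
 */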
static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	p4d_t *p4d = p4d_offset(pgd, addr);
	pud_t *pud = pud_offset(p4d, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
				 nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	pr_alert("BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
		 current->comm,
		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 mapping ? mapping->a_ops->read_folio : NULL);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
			    pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (is_zero_pfn(pfn))
			return NULL;
		if (pte_devmap(pte))
		/*
		 * NOTE: New users of ZONE_DEVICE will not set pte_devmap()
		 * and will have refcounts incremented on their struct pages
		 * when they are inserted into PTEs, thus they are safe to
		 * return here. Legacy ZONE_DEVICE pages that set pte_devmap()
		 * do not have refcounts. Example of legacy ZONE_DEVICE is
		 * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
		 */
			return NULL;

		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;

check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}

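/*
 * Folio variant of vm_normal_page(): returns the folio backing the pte, or
 * NULL for special mappings.
 */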
struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
			    pte_t pte)
{
	struct page *page = vm_normal_page(vma, addr, pte);

	if (page)
		return page_folio(page);
	return NULL;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
				pmd_t pmd)
{
	unsigned long pfn = pmd_pfn(pmd);

	/*
	 * There is no pmd_special() but there may be special pmds, e.g.
	 * in a direct-access (dax) mapping, so let's just replicate the
	 * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
	 */
	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (pmd_devmap(pmd))
		return NULL;
	if (is_huge_zero_pmd(pmd))
		return NULL;
	if (unlikely(pfn > highest_memmap_pfn))
		return NULL;

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
#endif

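/*
 * Restore a present pte from a device-exclusive swap entry, mapping the
 * original page again while preserving the soft-dirty and uffd-wp bits.
 */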
static void restore_exclusive_pte(struct vm_area_struct *vma,
				  struct page *page, unsigned long address,
				  pte_t *ptep)
{
	pte_t orig_pte;
	pte_t pte;
	swp_entry_t entry;

	orig_pte = ptep_get(ptep);
	pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
	if (pte_swp_soft_dirty(orig_pte))
		pte = pte_mksoft_dirty(pte);

	entry = pte_to_swp_entry(orig_pte);
	if (pte_swp_uffd_wp(orig_pte))
		pte = pte_mkuffd_wp(pte);
	else if (is_writable_device_exclusive_entry(entry))
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);

	VM_BUG_ON(pte_write(pte) && !(PageAnon(page) && PageAnonExclusive(page)));

	/*
	 * No need to take a page reference as one was already
	 * created when the swap entry was made.
	 */
	if (PageAnon(page))
		page_add_anon_rmap(page, vma, address, RMAP_NONE);
	else
		/*
		 * Currently device exclusive access only supports anonymous
		 * memory so the entry shouldn't point to a filebacked page.
		 */
		WARN_ON_ONCE(1);

	set_pte_at(vma->vm_mm, address, ptep, pte);

	/*
	 * No need to invalidate - it was non-present before. However
	 * secondary CPUs may have mappings that need invalidating.
	 */
	update_mmu_cache(vma, address, ptep);
}

/*
 * Tries to restore an exclusive pte if the page lock can be acquired without
 * sleeping.
 */
static int
try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
			unsigned long addr)
{
	swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte));
	struct page *page = pfn_swap_entry_to_page(entry);

	if (trylock_page(page)) {
		restore_exclusive_pte(vma, page, addr, src_pte);
		unlock_page(page);
		return 0;
	}

	return -EBUSY;
}

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
	unsigned long vm_flags = dst_vma->vm_flags;
	pte_t orig_pte = ptep_get(src_pte);
	pte_t pte = orig_pte;
	struct page *page;
	swp_entry_t entry = pte_to_swp_entry(orig_pte);

	if (likely(!non_swap_entry(entry))) {
		if (swap_duplicate(entry) < 0)
			return -EIO;

		/* make sure dst_mm is on swapoff's mmlist. */
		if (unlikely(list_empty(&dst_mm->mmlist))) {
			spin_lock(&mmlist_lock);
			if (list_empty(&dst_mm->mmlist))
				list_add(&dst_mm->mmlist,
						&src_mm->mmlist);
			spin_unlock(&mmlist_lock);
		}
		/* Mark the swap entry as shared. */
		if (pte_swp_exclusive(orig_pte)) {
			pte = pte_swp_clear_exclusive(orig_pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
		rss[MM_SWAPENTS]++;
	} else if (is_migration_entry(entry)) {
		page = pfn_swap_entry_to_page(entry);

		rss[mm_counter(page)]++;

		if (!is_readable_migration_entry(entry) &&
				is_cow_mapping(vm_flags)) {
			/*
			 * COW mappings require pages in both parent and child
			 * to be set to read. A previously exclusive entry is
			 * now shared.
			 */
			entry = make_readable_migration_entry(
							swp_offset(entry));
			pte = swp_entry_to_pte(entry);
			if (pte_swp_soft_dirty(orig_pte))
				pte = pte_swp_mksoft_dirty(pte);
			if (pte_swp_uffd_wp(orig_pte))
				pte = pte_swp_mkuffd_wp(pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
	} else if (is_device_private_entry(entry)) {
		page = pfn_swap_entry_to_page(entry);

		/*
		 * Update rss count even for unaddressable pages, as
		 * they should be treated just like normal pages in this
		 * respect.
		 *
		 * We will likely want to have some new rss counters
		 * for unaddressable pages, at some point. But for now
		 * keep things as they are.
		 */
		get_page(page);
		rss[mm_counter(page)]++;
		/* Cannot fail as these pages cannot get pinned. */
		BUG_ON(page_try_dup_anon_rmap(page, false, src_vma));

		/*
		 * We do not preserve soft-dirty information, because so
		 * far, checkpoint/restore is the only feature that
		 * requires that. And checkpoint/restore does not work
		 * when a device driver is involved (you cannot easily
		 * save and restore device driver state).
		 */
		if (is_writable_device_private_entry(entry) &&
		    is_cow_mapping(vm_flags)) {
			entry = make_readable_device_private_entry(
							swp_offset(entry));
			pte = swp_entry_to_pte(entry);
			if (pte_swp_uffd_wp(orig_pte))
				pte = pte_swp_mkuffd_wp(pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
	} else if (is_device_exclusive_entry(entry)) {
		/*
		 * Make device exclusive entries present by restoring the
		 * original entry then copying as for a present pte. Device
		 * exclusive entries currently only support private writable
		 * (ie. COW) mappings.
		 */
		VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
		if (try_restore_exclusive_pte(src_pte, src_vma, addr))
			return -EBUSY;
		return -ENOENT;
	} else if (is_pte_marker_entry(entry)) {
		pte_marker marker = copy_pte_marker(entry, dst_vma);

		if (marker)
			set_pte_at(dst_mm, addr, dst_pte,
				   make_pte_marker(marker));
		return 0;
	}
	if (!userfaultfd_wp(dst_vma))
		pte = pte_swp_clear_uffd_wp(pte);
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}

/*
 * Copy a present and normal page.
 *
 * NOTE! The usual case is that this isn't required;
 * instead, the caller can just increase the page refcount
 * and re-use the pte the traditional way.
 *
 * And if we need a pre-allocated page but don't yet have
 * one, return a negative error to let the preallocation
 * code know so that it can do so outside the page table
 * lock.
 */
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
		  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
		  struct folio **prealloc, struct page *page)
{
	struct folio *new_folio;
	pte_t pte;

	new_folio = *prealloc;
	if (!new_folio)
		return -EAGAIN;

	/*
	 * We have a prealloc page, all good!  Take it
	 * over and copy the page & arm it.
	 */
	*prealloc = NULL;
	copy_user_highpage(&new_folio->page, page, addr, src_vma);
	__folio_mark_uptodate(new_folio);
	folio_add_new_anon_rmap(new_folio, dst_vma, addr);
	folio_add_lru_vma(new_folio, dst_vma);
	rss[MM_ANONPAGES]++;

	/* All done, just insert the new page copy in the child */
	pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
	pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
	if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
		/* Uffd-wp needs to be delivered to dest pte as well */
		pte = pte_mkuffd_wp(pte);
	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
	return 0;
}

/*
 * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
 * is required to copy this pte.
 */
static inline int
copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
		 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
		 struct folio **prealloc)
{
	struct mm_struct *src_mm = src_vma->vm_mm;
	unsigned long vm_flags = src_vma->vm_flags;
	pte_t pte = ptep_get(src_pte);
	struct page *page;
	struct folio *folio;

	page = vm_normal_page(src_vma, addr, pte);
	if (page)
		folio = page_folio(page);
	if (page && folio_test_anon(folio)) {
		/*
		 * If this page may have been pinned by the parent process,
		 * copy the page immediately for the child so that we'll always
		 * guarantee the pinned page won't be randomly replaced in the
		 * future.
		 */
		folio_get(folio);
		if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
			/* Page may be pinned, we have to copy. */
			folio_put(folio);
			return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
						 addr, rss, prealloc, page);
		}
		rss[MM_ANONPAGES]++;
	} else if (page) {
		folio_get(folio);
		page_dup_file_rmap(page, false);
		rss[mm_counter_file(page)]++;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags) && pte_write(pte)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}
	VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	if (!userfaultfd_wp(dst_vma))
		pte = pte_clear_uffd_wp(pte);

	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
	return 0;
}

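/*
 * Allocate and charge a folio outside the page table lock, to be handed to
 * copy_present_page() when a possibly-pinned anon page has to be copied.
 */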
static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct folio *new_folio;

	new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
	if (!new_folio)
		return NULL;

	if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) {
		folio_put(new_folio);
		return NULL;
	}
	folio_throttle_swaprate(new_folio, GFP_KERNEL);

	return new_folio;
}

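/*
 * Copy one pte range of a vma from the parent to the child at fork time.
 * The copy loop periodically drops both page table locks; work that cannot
 * be done under them (adding a swap count continuation for -EIO,
 * preallocating a destination page for -EAGAIN) is done afterwards before
 * the range is retried from 'again'.
 */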
100262306a36Sopenharmony_cistatic int
100362306a36Sopenharmony_cicopy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
100462306a36Sopenharmony_ci	       pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
100562306a36Sopenharmony_ci	       unsigned long end)
100662306a36Sopenharmony_ci{
100762306a36Sopenharmony_ci	struct mm_struct *dst_mm = dst_vma->vm_mm;
100862306a36Sopenharmony_ci	struct mm_struct *src_mm = src_vma->vm_mm;
100962306a36Sopenharmony_ci	pte_t *orig_src_pte, *orig_dst_pte;
101062306a36Sopenharmony_ci	pte_t *src_pte, *dst_pte;
101162306a36Sopenharmony_ci	pte_t ptent;
101262306a36Sopenharmony_ci	spinlock_t *src_ptl, *dst_ptl;
101362306a36Sopenharmony_ci	int progress, ret = 0;
101462306a36Sopenharmony_ci	int rss[NR_MM_COUNTERS];
101562306a36Sopenharmony_ci	swp_entry_t entry = (swp_entry_t){0};
101662306a36Sopenharmony_ci	struct folio *prealloc = NULL;
101762306a36Sopenharmony_ci
101862306a36Sopenharmony_ciagain:
101962306a36Sopenharmony_ci	progress = 0;
102062306a36Sopenharmony_ci	init_rss_vec(rss);
102162306a36Sopenharmony_ci
102262306a36Sopenharmony_ci	/*
102362306a36Sopenharmony_ci	 * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
102462306a36Sopenharmony_ci	 * error handling here, assume that exclusive mmap_lock on dst and src
102562306a36Sopenharmony_ci	 * protects anon from unexpected THP transitions; with shmem and file
102662306a36Sopenharmony_ci	 * protected by mmap_lock-less collapse skipping areas with anon_vma
102762306a36Sopenharmony_ci	 * (whereas vma_needs_copy() skips areas without anon_vma).  A rework
102862306a36Sopenharmony_ci	 * can remove such assumptions later, but this is good enough for now.
102962306a36Sopenharmony_ci	 */
103062306a36Sopenharmony_ci	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
103162306a36Sopenharmony_ci	if (!dst_pte) {
103262306a36Sopenharmony_ci		ret = -ENOMEM;
103362306a36Sopenharmony_ci		goto out;
103462306a36Sopenharmony_ci	}
103562306a36Sopenharmony_ci	src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
103662306a36Sopenharmony_ci	if (!src_pte) {
103762306a36Sopenharmony_ci		pte_unmap_unlock(dst_pte, dst_ptl);
103862306a36Sopenharmony_ci		/* ret == 0 */
103962306a36Sopenharmony_ci		goto out;
104062306a36Sopenharmony_ci	}
104162306a36Sopenharmony_ci	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
104262306a36Sopenharmony_ci	orig_src_pte = src_pte;
104362306a36Sopenharmony_ci	orig_dst_pte = dst_pte;
104462306a36Sopenharmony_ci	arch_enter_lazy_mmu_mode();
104562306a36Sopenharmony_ci
104662306a36Sopenharmony_ci	do {
104762306a36Sopenharmony_ci		/*
104862306a36Sopenharmony_ci		 * We are holding two locks at this point - either of them
104962306a36Sopenharmony_ci		 * could generate latencies in another task on another CPU.
105062306a36Sopenharmony_ci		 */
105162306a36Sopenharmony_ci		if (progress >= 32) {
105262306a36Sopenharmony_ci			progress = 0;
105362306a36Sopenharmony_ci			if (need_resched() ||
105462306a36Sopenharmony_ci			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
105562306a36Sopenharmony_ci				break;
105662306a36Sopenharmony_ci		}
105762306a36Sopenharmony_ci		ptent = ptep_get(src_pte);
105862306a36Sopenharmony_ci		if (pte_none(ptent)) {
105962306a36Sopenharmony_ci			progress++;
106062306a36Sopenharmony_ci			continue;
106162306a36Sopenharmony_ci		}
106262306a36Sopenharmony_ci		if (unlikely(!pte_present(ptent))) {
106362306a36Sopenharmony_ci			ret = copy_nonpresent_pte(dst_mm, src_mm,
106462306a36Sopenharmony_ci						  dst_pte, src_pte,
106562306a36Sopenharmony_ci						  dst_vma, src_vma,
106662306a36Sopenharmony_ci						  addr, rss);
106762306a36Sopenharmony_ci			if (ret == -EIO) {
106862306a36Sopenharmony_ci				entry = pte_to_swp_entry(ptep_get(src_pte));
106962306a36Sopenharmony_ci				break;
107062306a36Sopenharmony_ci			} else if (ret == -EBUSY) {
107162306a36Sopenharmony_ci				break;
107262306a36Sopenharmony_ci			} else if (!ret) {
107362306a36Sopenharmony_ci				progress += 8;
107462306a36Sopenharmony_ci				continue;
107562306a36Sopenharmony_ci			}
107662306a36Sopenharmony_ci
107762306a36Sopenharmony_ci			/*
107862306a36Sopenharmony_ci			 * Device exclusive entry restored, continue by copying
107962306a36Sopenharmony_ci			 * the now present pte.
108062306a36Sopenharmony_ci			 */
108162306a36Sopenharmony_ci			WARN_ON_ONCE(ret != -ENOENT);
108262306a36Sopenharmony_ci		}
108362306a36Sopenharmony_ci		/* copy_present_pte() will clear `*prealloc' if consumed */
108462306a36Sopenharmony_ci		ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
108562306a36Sopenharmony_ci				       addr, rss, &prealloc);
108662306a36Sopenharmony_ci		/*
108762306a36Sopenharmony_ci		 * If we need a pre-allocated page for this pte, drop the
108862306a36Sopenharmony_ci		 * locks, allocate, and try again.
108962306a36Sopenharmony_ci		 */
109062306a36Sopenharmony_ci		if (unlikely(ret == -EAGAIN))
109162306a36Sopenharmony_ci			break;
109262306a36Sopenharmony_ci		if (unlikely(prealloc)) {
109362306a36Sopenharmony_ci			/*
109462306a36Sopenharmony_ci			 * The preallocated page cannot be reused for the next
109562306a36Sopenharmony_ci			 * pte, so that mempolicy is strictly followed (e.g.,
109662306a36Sopenharmony_ci			 * alloc_page_vma() allocates the page according to the
109762306a36Sopenharmony_ci			 * address).  This can only happen if a pinned pte changed.
109862306a36Sopenharmony_ci			 */
109962306a36Sopenharmony_ci			folio_put(prealloc);
110062306a36Sopenharmony_ci			prealloc = NULL;
110162306a36Sopenharmony_ci		}
110262306a36Sopenharmony_ci		progress += 8;
110362306a36Sopenharmony_ci	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
110462306a36Sopenharmony_ci
110562306a36Sopenharmony_ci	arch_leave_lazy_mmu_mode();
110662306a36Sopenharmony_ci	pte_unmap_unlock(orig_src_pte, src_ptl);
110762306a36Sopenharmony_ci	add_mm_rss_vec(dst_mm, rss);
110862306a36Sopenharmony_ci	pte_unmap_unlock(orig_dst_pte, dst_ptl);
110962306a36Sopenharmony_ci	cond_resched();
111062306a36Sopenharmony_ci
111162306a36Sopenharmony_ci	if (ret == -EIO) {
111262306a36Sopenharmony_ci		VM_WARN_ON_ONCE(!entry.val);
111362306a36Sopenharmony_ci		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
111462306a36Sopenharmony_ci			ret = -ENOMEM;
111562306a36Sopenharmony_ci			goto out;
111662306a36Sopenharmony_ci		}
111762306a36Sopenharmony_ci		entry.val = 0;
111862306a36Sopenharmony_ci	} else if (ret == -EBUSY) {
111962306a36Sopenharmony_ci		goto out;
112062306a36Sopenharmony_ci	} else if (ret == -EAGAIN) {
112162306a36Sopenharmony_ci		prealloc = page_copy_prealloc(src_mm, src_vma, addr);
112262306a36Sopenharmony_ci		if (!prealloc)
112362306a36Sopenharmony_ci			return -ENOMEM;
112462306a36Sopenharmony_ci	} else if (ret) {
112562306a36Sopenharmony_ci		VM_WARN_ON_ONCE(1);
112662306a36Sopenharmony_ci	}
112762306a36Sopenharmony_ci
112862306a36Sopenharmony_ci	/* We've captured and resolved the error. Reset, try again. */
112962306a36Sopenharmony_ci	ret = 0;
113062306a36Sopenharmony_ci
113162306a36Sopenharmony_ci	if (addr != end)
113262306a36Sopenharmony_ci		goto again;
113362306a36Sopenharmony_ciout:
113462306a36Sopenharmony_ci	if (unlikely(prealloc))
113562306a36Sopenharmony_ci		folio_put(prealloc);
113662306a36Sopenharmony_ci	return ret;
113762306a36Sopenharmony_ci}
113862306a36Sopenharmony_ci
113962306a36Sopenharmony_cistatic inline int
114062306a36Sopenharmony_cicopy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
114162306a36Sopenharmony_ci	       pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
114262306a36Sopenharmony_ci	       unsigned long end)
114362306a36Sopenharmony_ci{
114462306a36Sopenharmony_ci	struct mm_struct *dst_mm = dst_vma->vm_mm;
114562306a36Sopenharmony_ci	struct mm_struct *src_mm = src_vma->vm_mm;
114662306a36Sopenharmony_ci	pmd_t *src_pmd, *dst_pmd;
114762306a36Sopenharmony_ci	unsigned long next;
114862306a36Sopenharmony_ci
114962306a36Sopenharmony_ci	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
115062306a36Sopenharmony_ci	if (!dst_pmd)
115162306a36Sopenharmony_ci		return -ENOMEM;
115262306a36Sopenharmony_ci	src_pmd = pmd_offset(src_pud, addr);
115362306a36Sopenharmony_ci	do {
115462306a36Sopenharmony_ci		next = pmd_addr_end(addr, end);
115562306a36Sopenharmony_ci		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
115662306a36Sopenharmony_ci			|| pmd_devmap(*src_pmd)) {
115762306a36Sopenharmony_ci			int err;
115862306a36Sopenharmony_ci			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
115962306a36Sopenharmony_ci			err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
116062306a36Sopenharmony_ci					    addr, dst_vma, src_vma);
116162306a36Sopenharmony_ci			if (err == -ENOMEM)
116262306a36Sopenharmony_ci				return -ENOMEM;
116362306a36Sopenharmony_ci			if (!err)
116462306a36Sopenharmony_ci				continue;
116562306a36Sopenharmony_ci			/* fall through */
116662306a36Sopenharmony_ci		}
116762306a36Sopenharmony_ci		if (pmd_none_or_clear_bad(src_pmd))
116862306a36Sopenharmony_ci			continue;
116962306a36Sopenharmony_ci		if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
117062306a36Sopenharmony_ci				   addr, next))
117162306a36Sopenharmony_ci			return -ENOMEM;
117262306a36Sopenharmony_ci	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
117362306a36Sopenharmony_ci	return 0;
117462306a36Sopenharmony_ci}
117562306a36Sopenharmony_ci
117662306a36Sopenharmony_cistatic inline int
117762306a36Sopenharmony_cicopy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
117862306a36Sopenharmony_ci	       p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
117962306a36Sopenharmony_ci	       unsigned long end)
118062306a36Sopenharmony_ci{
118162306a36Sopenharmony_ci	struct mm_struct *dst_mm = dst_vma->vm_mm;
118262306a36Sopenharmony_ci	struct mm_struct *src_mm = src_vma->vm_mm;
118362306a36Sopenharmony_ci	pud_t *src_pud, *dst_pud;
118462306a36Sopenharmony_ci	unsigned long next;
118562306a36Sopenharmony_ci
118662306a36Sopenharmony_ci	dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
118762306a36Sopenharmony_ci	if (!dst_pud)
118862306a36Sopenharmony_ci		return -ENOMEM;
118962306a36Sopenharmony_ci	src_pud = pud_offset(src_p4d, addr);
119062306a36Sopenharmony_ci	do {
119162306a36Sopenharmony_ci		next = pud_addr_end(addr, end);
119262306a36Sopenharmony_ci		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
119362306a36Sopenharmony_ci			int err;
119462306a36Sopenharmony_ci
119562306a36Sopenharmony_ci			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
119662306a36Sopenharmony_ci			err = copy_huge_pud(dst_mm, src_mm,
119762306a36Sopenharmony_ci					    dst_pud, src_pud, addr, src_vma);
119862306a36Sopenharmony_ci			if (err == -ENOMEM)
119962306a36Sopenharmony_ci				return -ENOMEM;
120062306a36Sopenharmony_ci			if (!err)
120162306a36Sopenharmony_ci				continue;
120262306a36Sopenharmony_ci			/* fall through */
120362306a36Sopenharmony_ci		}
120462306a36Sopenharmony_ci		if (pud_none_or_clear_bad(src_pud))
120562306a36Sopenharmony_ci			continue;
120662306a36Sopenharmony_ci		if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
120762306a36Sopenharmony_ci				   addr, next))
120862306a36Sopenharmony_ci			return -ENOMEM;
120962306a36Sopenharmony_ci	} while (dst_pud++, src_pud++, addr = next, addr != end);
121062306a36Sopenharmony_ci	return 0;
121162306a36Sopenharmony_ci}
121262306a36Sopenharmony_ci
121362306a36Sopenharmony_cistatic inline int
121462306a36Sopenharmony_cicopy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
121562306a36Sopenharmony_ci	       pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
121662306a36Sopenharmony_ci	       unsigned long end)
121762306a36Sopenharmony_ci{
121862306a36Sopenharmony_ci	struct mm_struct *dst_mm = dst_vma->vm_mm;
121962306a36Sopenharmony_ci	p4d_t *src_p4d, *dst_p4d;
122062306a36Sopenharmony_ci	unsigned long next;
122162306a36Sopenharmony_ci
122262306a36Sopenharmony_ci	dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
122362306a36Sopenharmony_ci	if (!dst_p4d)
122462306a36Sopenharmony_ci		return -ENOMEM;
122562306a36Sopenharmony_ci	src_p4d = p4d_offset(src_pgd, addr);
122662306a36Sopenharmony_ci	do {
122762306a36Sopenharmony_ci		next = p4d_addr_end(addr, end);
122862306a36Sopenharmony_ci		if (p4d_none_or_clear_bad(src_p4d))
122962306a36Sopenharmony_ci			continue;
123062306a36Sopenharmony_ci		if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
123162306a36Sopenharmony_ci				   addr, next))
123262306a36Sopenharmony_ci			return -ENOMEM;
123362306a36Sopenharmony_ci	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
123462306a36Sopenharmony_ci	return 0;
123562306a36Sopenharmony_ci}
123662306a36Sopenharmony_ci
123762306a36Sopenharmony_ci/*
123862306a36Sopenharmony_ci * Return true if the vma needs to copy the pgtable during this fork().  Return
123962306a36Sopenharmony_ci * false when we can speed up fork() by allowing lazy page faults later until
124062306a36Sopenharmony_ci * when the child accesses the memory range.
124162306a36Sopenharmony_ci */
124262306a36Sopenharmony_cistatic bool
124362306a36Sopenharmony_civma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
124462306a36Sopenharmony_ci{
124562306a36Sopenharmony_ci	/*
124662306a36Sopenharmony_ci	 * Always copy pgtables when dst_vma has uffd-wp enabled even if it's
124762306a36Sopenharmony_ci	 * file-backed (e.g. shmem). When uffd-wp is enabled, the pgtable
124862306a36Sopenharmony_ci	 * carries uffd-wp protection information that cannot be recovered
124962306a36Sopenharmony_ci	 * from the page cache, so skipping the copy would lose it.
125062306a36Sopenharmony_ci	 */
125162306a36Sopenharmony_ci	if (userfaultfd_wp(dst_vma))
125262306a36Sopenharmony_ci		return true;
125362306a36Sopenharmony_ci
125462306a36Sopenharmony_ci	if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
125562306a36Sopenharmony_ci		return true;
125662306a36Sopenharmony_ci
125762306a36Sopenharmony_ci	if (src_vma->anon_vma)
125862306a36Sopenharmony_ci		return true;
125962306a36Sopenharmony_ci
126062306a36Sopenharmony_ci	/*
126162306a36Sopenharmony_ci	 * Don't copy ptes where a page fault will fill them correctly.  Fork
126262306a36Sopenharmony_ci	 * becomes much lighter when there are big shared or private readonly
126362306a36Sopenharmony_ci	 * mappings.  The tradeoff is that, for pages the child does touch,
126462306a36Sopenharmony_ci	 * copy_page_range() would have been more efficient than faulting.
126562306a36Sopenharmony_ci	 */
126662306a36Sopenharmony_ci	return false;
126762306a36Sopenharmony_ci}
126862306a36Sopenharmony_ci
126962306a36Sopenharmony_ciint
127062306a36Sopenharmony_cicopy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
127162306a36Sopenharmony_ci{
127262306a36Sopenharmony_ci	pgd_t *src_pgd, *dst_pgd;
127362306a36Sopenharmony_ci	unsigned long next;
127462306a36Sopenharmony_ci	unsigned long addr = src_vma->vm_start;
127562306a36Sopenharmony_ci	unsigned long end = src_vma->vm_end;
127662306a36Sopenharmony_ci	struct mm_struct *dst_mm = dst_vma->vm_mm;
127762306a36Sopenharmony_ci	struct mm_struct *src_mm = src_vma->vm_mm;
127862306a36Sopenharmony_ci	struct mmu_notifier_range range;
127962306a36Sopenharmony_ci	bool is_cow;
128062306a36Sopenharmony_ci	int ret;
128162306a36Sopenharmony_ci
128262306a36Sopenharmony_ci	if (!vma_needs_copy(dst_vma, src_vma))
128362306a36Sopenharmony_ci		return 0;
128462306a36Sopenharmony_ci
128562306a36Sopenharmony_ci	if (is_vm_hugetlb_page(src_vma))
128662306a36Sopenharmony_ci		return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
128762306a36Sopenharmony_ci
128862306a36Sopenharmony_ci	if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
128962306a36Sopenharmony_ci		/*
129062306a36Sopenharmony_ci		 * We do not free on error cases below as remove_vma
129162306a36Sopenharmony_ci		 * gets called on error from higher level routine
129262306a36Sopenharmony_ci		 */
129362306a36Sopenharmony_ci		ret = track_pfn_copy(src_vma);
129462306a36Sopenharmony_ci		if (ret)
129562306a36Sopenharmony_ci			return ret;
129662306a36Sopenharmony_ci	}
129762306a36Sopenharmony_ci
129862306a36Sopenharmony_ci	/*
129962306a36Sopenharmony_ci	 * We need to invalidate the secondary MMU mappings only when
130062306a36Sopenharmony_ci	 * there could be a permission downgrade on the ptes of the
130162306a36Sopenharmony_ci	 * parent mm. And a permission downgrade will only happen if
130262306a36Sopenharmony_ci	 * is_cow_mapping() returns true.
130362306a36Sopenharmony_ci	 */
130462306a36Sopenharmony_ci	is_cow = is_cow_mapping(src_vma->vm_flags);
130562306a36Sopenharmony_ci
130662306a36Sopenharmony_ci	if (is_cow) {
130762306a36Sopenharmony_ci		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
130862306a36Sopenharmony_ci					0, src_mm, addr, end);
130962306a36Sopenharmony_ci		mmu_notifier_invalidate_range_start(&range);
131062306a36Sopenharmony_ci		/*
131162306a36Sopenharmony_ci		 * Disabling preemption is not needed for the write side, as
131262306a36Sopenharmony_ci		 * the read side doesn't spin, but goes to the mmap_lock.
131362306a36Sopenharmony_ci		 *
131462306a36Sopenharmony_ci		 * Use the raw variant of the seqcount_t write API to avoid
131562306a36Sopenharmony_ci		 * lockdep complaining about preemptibility.
131662306a36Sopenharmony_ci		 */
131762306a36Sopenharmony_ci		vma_assert_write_locked(src_vma);
131862306a36Sopenharmony_ci		raw_write_seqcount_begin(&src_mm->write_protect_seq);
131962306a36Sopenharmony_ci	}
132062306a36Sopenharmony_ci
132162306a36Sopenharmony_ci	ret = 0;
132262306a36Sopenharmony_ci	dst_pgd = pgd_offset(dst_mm, addr);
132362306a36Sopenharmony_ci	src_pgd = pgd_offset(src_mm, addr);
132462306a36Sopenharmony_ci	do {
132562306a36Sopenharmony_ci		next = pgd_addr_end(addr, end);
132662306a36Sopenharmony_ci		if (pgd_none_or_clear_bad(src_pgd))
132762306a36Sopenharmony_ci			continue;
132862306a36Sopenharmony_ci		if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
132962306a36Sopenharmony_ci					    addr, next))) {
133062306a36Sopenharmony_ci			untrack_pfn_clear(dst_vma);
133162306a36Sopenharmony_ci			ret = -ENOMEM;
133262306a36Sopenharmony_ci			break;
133362306a36Sopenharmony_ci		}
133462306a36Sopenharmony_ci	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
133562306a36Sopenharmony_ci
133662306a36Sopenharmony_ci	if (is_cow) {
133762306a36Sopenharmony_ci		raw_write_seqcount_end(&src_mm->write_protect_seq);
133862306a36Sopenharmony_ci		mmu_notifier_invalidate_range_end(&range);
133962306a36Sopenharmony_ci	}
134062306a36Sopenharmony_ci	return ret;
134162306a36Sopenharmony_ci}
134262306a36Sopenharmony_ci
134362306a36Sopenharmony_ci/* Whether we should zap all COWed (private) pages too */
134462306a36Sopenharmony_cistatic inline bool should_zap_cows(struct zap_details *details)
134562306a36Sopenharmony_ci{
134662306a36Sopenharmony_ci	/* By default, zap all pages */
134762306a36Sopenharmony_ci	if (!details)
134862306a36Sopenharmony_ci		return true;
134962306a36Sopenharmony_ci
135062306a36Sopenharmony_ci	/* Or, we zap COWed pages only if the caller wants to */
135162306a36Sopenharmony_ci	return details->even_cows;
135262306a36Sopenharmony_ci}
135362306a36Sopenharmony_ci
135462306a36Sopenharmony_ci/* Decides whether we should zap this page with the page pointer specified */
135562306a36Sopenharmony_cistatic inline bool should_zap_page(struct zap_details *details, struct page *page)
135662306a36Sopenharmony_ci{
135762306a36Sopenharmony_ci	/* If we can make a decision without *page.. */
135862306a36Sopenharmony_ci	if (should_zap_cows(details))
135962306a36Sopenharmony_ci		return true;
136062306a36Sopenharmony_ci
136162306a36Sopenharmony_ci	/* E.g. the caller passes NULL for the case of a zero page */
136262306a36Sopenharmony_ci	if (!page)
136362306a36Sopenharmony_ci		return true;
136462306a36Sopenharmony_ci
136562306a36Sopenharmony_ci	/* Otherwise we should only zap non-anon pages */
136662306a36Sopenharmony_ci	return !PageAnon(page);
136762306a36Sopenharmony_ci}
136862306a36Sopenharmony_ci
136962306a36Sopenharmony_cistatic inline bool zap_drop_file_uffd_wp(struct zap_details *details)
137062306a36Sopenharmony_ci{
137162306a36Sopenharmony_ci	if (!details)
137262306a36Sopenharmony_ci		return false;
137362306a36Sopenharmony_ci
137462306a36Sopenharmony_ci	return details->zap_flags & ZAP_FLAG_DROP_MARKER;
137562306a36Sopenharmony_ci}
137662306a36Sopenharmony_ci
137762306a36Sopenharmony_ci/*
137862306a36Sopenharmony_ci * This function makes sure that we'll replace the none pte with an uffd-wp
137962306a36Sopenharmony_ci * swap special pte marker when necessary. Must be called with the pgtable lock held.
138062306a36Sopenharmony_ci */
138162306a36Sopenharmony_cistatic inline void
138262306a36Sopenharmony_cizap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
138362306a36Sopenharmony_ci			      unsigned long addr, pte_t *pte,
138462306a36Sopenharmony_ci			      struct zap_details *details, pte_t pteval)
138562306a36Sopenharmony_ci{
138662306a36Sopenharmony_ci	/* Zap on anonymous always means dropping everything */
138762306a36Sopenharmony_ci	if (vma_is_anonymous(vma))
138862306a36Sopenharmony_ci		return;
138962306a36Sopenharmony_ci
139062306a36Sopenharmony_ci	if (zap_drop_file_uffd_wp(details))
139162306a36Sopenharmony_ci		return;
139262306a36Sopenharmony_ci
139362306a36Sopenharmony_ci	pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
139462306a36Sopenharmony_ci}
139562306a36Sopenharmony_ci
139662306a36Sopenharmony_cistatic unsigned long zap_pte_range(struct mmu_gather *tlb,
139762306a36Sopenharmony_ci				struct vm_area_struct *vma, pmd_t *pmd,
139862306a36Sopenharmony_ci				unsigned long addr, unsigned long end,
139962306a36Sopenharmony_ci				struct zap_details *details)
140062306a36Sopenharmony_ci{
140162306a36Sopenharmony_ci	struct mm_struct *mm = tlb->mm;
140262306a36Sopenharmony_ci	int force_flush = 0;
140362306a36Sopenharmony_ci	int rss[NR_MM_COUNTERS];
140462306a36Sopenharmony_ci	spinlock_t *ptl;
140562306a36Sopenharmony_ci	pte_t *start_pte;
140662306a36Sopenharmony_ci	pte_t *pte;
140762306a36Sopenharmony_ci	swp_entry_t entry;
140862306a36Sopenharmony_ci
140962306a36Sopenharmony_ci	tlb_change_page_size(tlb, PAGE_SIZE);
141062306a36Sopenharmony_ci	init_rss_vec(rss);
141162306a36Sopenharmony_ci	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
141262306a36Sopenharmony_ci	if (!pte)
141362306a36Sopenharmony_ci		return addr;
141462306a36Sopenharmony_ci
141562306a36Sopenharmony_ci	flush_tlb_batched_pending(mm);
141662306a36Sopenharmony_ci	arch_enter_lazy_mmu_mode();
141762306a36Sopenharmony_ci	do {
141862306a36Sopenharmony_ci		pte_t ptent = ptep_get(pte);
141962306a36Sopenharmony_ci		struct page *page;
142062306a36Sopenharmony_ci
142162306a36Sopenharmony_ci		if (pte_none(ptent))
142262306a36Sopenharmony_ci			continue;
142362306a36Sopenharmony_ci
142462306a36Sopenharmony_ci		if (need_resched())
142562306a36Sopenharmony_ci			break;
142662306a36Sopenharmony_ci
142762306a36Sopenharmony_ci		if (pte_present(ptent)) {
142862306a36Sopenharmony_ci			unsigned int delay_rmap;
142962306a36Sopenharmony_ci
143062306a36Sopenharmony_ci			page = vm_normal_page(vma, addr, ptent);
143162306a36Sopenharmony_ci#ifdef CONFIG_MEM_PURGEABLE
143262306a36Sopenharmony_ci			if (vma->vm_flags & VM_USEREXPTE)
143362306a36Sopenharmony_ci				page = NULL;
143462306a36Sopenharmony_ci#endif
143562306a36Sopenharmony_ci			if (unlikely(!should_zap_page(details, page)))
143662306a36Sopenharmony_ci				continue;
143762306a36Sopenharmony_ci			ptent = ptep_get_and_clear_full(mm, addr, pte,
143862306a36Sopenharmony_ci							tlb->fullmm);
143962306a36Sopenharmony_ci			arch_check_zapped_pte(vma, ptent);
144062306a36Sopenharmony_ci			tlb_remove_tlb_entry(tlb, pte, addr);
144162306a36Sopenharmony_ci			zap_install_uffd_wp_if_needed(vma, addr, pte, details,
144262306a36Sopenharmony_ci						      ptent);
144362306a36Sopenharmony_ci			if (unlikely(!page)) {
144462306a36Sopenharmony_ci				ksm_might_unmap_zero_page(mm, ptent);
144562306a36Sopenharmony_ci				continue;
144662306a36Sopenharmony_ci			}
144762306a36Sopenharmony_ci#ifdef CONFIG_MEM_PURGEABLE
144862306a36Sopenharmony_ci			if (vma->vm_flags & VM_PURGEABLE)
144962306a36Sopenharmony_ci				uxpte_clear_present(vma, addr);
145062306a36Sopenharmony_ci#endif
145162306a36Sopenharmony_ci			delay_rmap = 0;
145262306a36Sopenharmony_ci			if (!PageAnon(page)) {
145362306a36Sopenharmony_ci				if (pte_dirty(ptent)) {
145462306a36Sopenharmony_ci					set_page_dirty(page);
145562306a36Sopenharmony_ci					if (tlb_delay_rmap(tlb)) {
145662306a36Sopenharmony_ci						delay_rmap = 1;
145762306a36Sopenharmony_ci						force_flush = 1;
145862306a36Sopenharmony_ci					}
145962306a36Sopenharmony_ci				}
146062306a36Sopenharmony_ci				if (pte_young(ptent) && likely(vma_has_recency(vma)))
146162306a36Sopenharmony_ci					mark_page_accessed(page);
146262306a36Sopenharmony_ci			}
146362306a36Sopenharmony_ci			rss[mm_counter(page)]--;
146462306a36Sopenharmony_ci			if (!delay_rmap) {
146562306a36Sopenharmony_ci				page_remove_rmap(page, vma, false);
146662306a36Sopenharmony_ci				if (unlikely(page_mapcount(page) < 0))
146762306a36Sopenharmony_ci					print_bad_pte(vma, addr, ptent, page);
146862306a36Sopenharmony_ci			}
146962306a36Sopenharmony_ci			if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
147062306a36Sopenharmony_ci				force_flush = 1;
147162306a36Sopenharmony_ci				addr += PAGE_SIZE;
147262306a36Sopenharmony_ci				break;
147362306a36Sopenharmony_ci			}
147462306a36Sopenharmony_ci			continue;
147562306a36Sopenharmony_ci		}
147662306a36Sopenharmony_ci
147762306a36Sopenharmony_ci		entry = pte_to_swp_entry(ptent);
147862306a36Sopenharmony_ci		if (is_device_private_entry(entry) ||
147962306a36Sopenharmony_ci		    is_device_exclusive_entry(entry)) {
148062306a36Sopenharmony_ci			page = pfn_swap_entry_to_page(entry);
148162306a36Sopenharmony_ci			if (unlikely(!should_zap_page(details, page)))
148262306a36Sopenharmony_ci				continue;
148362306a36Sopenharmony_ci			/*
148462306a36Sopenharmony_ci			 * Both device private/exclusive mappings should only
148562306a36Sopenharmony_ci			 * work with anonymous pages so far, so we don't need to
148662306a36Sopenharmony_ci			 * consider the uffd-wp bit when zapping. For more information,
148762306a36Sopenharmony_ci			 * see zap_install_uffd_wp_if_needed().
148862306a36Sopenharmony_ci			 */
148962306a36Sopenharmony_ci			WARN_ON_ONCE(!vma_is_anonymous(vma));
149062306a36Sopenharmony_ci			rss[mm_counter(page)]--;
149162306a36Sopenharmony_ci			if (is_device_private_entry(entry))
149262306a36Sopenharmony_ci				page_remove_rmap(page, vma, false);
149362306a36Sopenharmony_ci			put_page(page);
149462306a36Sopenharmony_ci		} else if (!non_swap_entry(entry)) {
149562306a36Sopenharmony_ci			/* Genuine swap entry, hence a private anon page */
149662306a36Sopenharmony_ci			if (!should_zap_cows(details))
149762306a36Sopenharmony_ci				continue;
149862306a36Sopenharmony_ci			rss[MM_SWAPENTS]--;
149962306a36Sopenharmony_ci			if (unlikely(!free_swap_and_cache(entry)))
150062306a36Sopenharmony_ci				print_bad_pte(vma, addr, ptent, NULL);
150162306a36Sopenharmony_ci		} else if (is_migration_entry(entry)) {
150262306a36Sopenharmony_ci			page = pfn_swap_entry_to_page(entry);
150362306a36Sopenharmony_ci			if (!should_zap_page(details, page))
150462306a36Sopenharmony_ci				continue;
150562306a36Sopenharmony_ci			rss[mm_counter(page)]--;
150662306a36Sopenharmony_ci		} else if (pte_marker_entry_uffd_wp(entry)) {
150762306a36Sopenharmony_ci			/*
150862306a36Sopenharmony_ci			 * For anon: always drop the marker; for file: only
150962306a36Sopenharmony_ci			 * drop the marker if explicitly requested.
151062306a36Sopenharmony_ci			 */
151162306a36Sopenharmony_ci			if (!vma_is_anonymous(vma) &&
151262306a36Sopenharmony_ci			    !zap_drop_file_uffd_wp(details))
151362306a36Sopenharmony_ci				continue;
151462306a36Sopenharmony_ci		} else if (is_hwpoison_entry(entry) ||
151562306a36Sopenharmony_ci			   is_poisoned_swp_entry(entry)) {
151662306a36Sopenharmony_ci			if (!should_zap_cows(details))
151762306a36Sopenharmony_ci				continue;
151862306a36Sopenharmony_ci		} else {
151962306a36Sopenharmony_ci			/* We should have covered all the swap entry types */
152062306a36Sopenharmony_ci			WARN_ON_ONCE(1);
152162306a36Sopenharmony_ci		}
152262306a36Sopenharmony_ci		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
152362306a36Sopenharmony_ci		zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
152462306a36Sopenharmony_ci	} while (pte++, addr += PAGE_SIZE, addr != end);
152562306a36Sopenharmony_ci
152662306a36Sopenharmony_ci	add_mm_rss_vec(mm, rss);
152762306a36Sopenharmony_ci	arch_leave_lazy_mmu_mode();
152862306a36Sopenharmony_ci
152962306a36Sopenharmony_ci	/* Do the actual TLB flush before dropping ptl */
153062306a36Sopenharmony_ci	if (force_flush) {
153162306a36Sopenharmony_ci		tlb_flush_mmu_tlbonly(tlb);
153262306a36Sopenharmony_ci		tlb_flush_rmaps(tlb, vma);
153362306a36Sopenharmony_ci	}
153462306a36Sopenharmony_ci	pte_unmap_unlock(start_pte, ptl);
153562306a36Sopenharmony_ci
153662306a36Sopenharmony_ci	/*
153762306a36Sopenharmony_ci	 * If we forced a TLB flush (either due to running out of
153862306a36Sopenharmony_ci	 * batch buffers or because we needed to flush dirty TLB
153962306a36Sopenharmony_ci	 * entries before releasing the ptl), free the batched
154062306a36Sopenharmony_ci	 * memory too. Come back again if we didn't do everything.
154162306a36Sopenharmony_ci	 */
154262306a36Sopenharmony_ci	if (force_flush)
154362306a36Sopenharmony_ci		tlb_flush_mmu(tlb);
154462306a36Sopenharmony_ci
154562306a36Sopenharmony_ci	return addr;
154662306a36Sopenharmony_ci}
154762306a36Sopenharmony_ci
154862306a36Sopenharmony_cistatic inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
154962306a36Sopenharmony_ci				struct vm_area_struct *vma, pud_t *pud,
155062306a36Sopenharmony_ci				unsigned long addr, unsigned long end,
155162306a36Sopenharmony_ci				struct zap_details *details)
155262306a36Sopenharmony_ci{
155362306a36Sopenharmony_ci	pmd_t *pmd;
155462306a36Sopenharmony_ci	unsigned long next;
155562306a36Sopenharmony_ci
155662306a36Sopenharmony_ci	pmd = pmd_offset(pud, addr);
155762306a36Sopenharmony_ci	do {
155862306a36Sopenharmony_ci		next = pmd_addr_end(addr, end);
155962306a36Sopenharmony_ci		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
156062306a36Sopenharmony_ci			if (next - addr != HPAGE_PMD_SIZE)
156162306a36Sopenharmony_ci				__split_huge_pmd(vma, pmd, addr, false, NULL);
156262306a36Sopenharmony_ci			else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
156362306a36Sopenharmony_ci				addr = next;
156462306a36Sopenharmony_ci				continue;
156562306a36Sopenharmony_ci			}
156662306a36Sopenharmony_ci			/* fall through */
156762306a36Sopenharmony_ci		} else if (details && details->single_folio &&
156862306a36Sopenharmony_ci			   folio_test_pmd_mappable(details->single_folio) &&
156962306a36Sopenharmony_ci			   next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
157062306a36Sopenharmony_ci			spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
157162306a36Sopenharmony_ci			/*
157262306a36Sopenharmony_ci			 * Take and drop THP pmd lock so that we cannot return
157362306a36Sopenharmony_ci			 * prematurely, while zap_huge_pmd() has cleared *pmd,
157462306a36Sopenharmony_ci			 * but not yet decremented compound_mapcount().
157562306a36Sopenharmony_ci			 */
157662306a36Sopenharmony_ci			spin_unlock(ptl);
157762306a36Sopenharmony_ci		}
157862306a36Sopenharmony_ci		if (pmd_none(*pmd)) {
157962306a36Sopenharmony_ci			addr = next;
158062306a36Sopenharmony_ci			continue;
158162306a36Sopenharmony_ci		}
158262306a36Sopenharmony_ci		addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
158362306a36Sopenharmony_ci		if (addr != next)
158462306a36Sopenharmony_ci			pmd--;
158562306a36Sopenharmony_ci	} while (pmd++, cond_resched(), addr != end);
158662306a36Sopenharmony_ci
158762306a36Sopenharmony_ci	return addr;
158862306a36Sopenharmony_ci}
158962306a36Sopenharmony_ci
159062306a36Sopenharmony_cistatic inline unsigned long zap_pud_range(struct mmu_gather *tlb,
159162306a36Sopenharmony_ci				struct vm_area_struct *vma, p4d_t *p4d,
159262306a36Sopenharmony_ci				unsigned long addr, unsigned long end,
159362306a36Sopenharmony_ci				struct zap_details *details)
159462306a36Sopenharmony_ci{
159562306a36Sopenharmony_ci	pud_t *pud;
159662306a36Sopenharmony_ci	unsigned long next;
159762306a36Sopenharmony_ci
159862306a36Sopenharmony_ci	pud = pud_offset(p4d, addr);
159962306a36Sopenharmony_ci	do {
160062306a36Sopenharmony_ci		next = pud_addr_end(addr, end);
160162306a36Sopenharmony_ci		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
160262306a36Sopenharmony_ci			if (next - addr != HPAGE_PUD_SIZE) {
160362306a36Sopenharmony_ci				mmap_assert_locked(tlb->mm);
160462306a36Sopenharmony_ci				split_huge_pud(vma, pud, addr);
160562306a36Sopenharmony_ci			} else if (zap_huge_pud(tlb, vma, pud, addr))
160662306a36Sopenharmony_ci				goto next;
160762306a36Sopenharmony_ci			/* fall through */
160862306a36Sopenharmony_ci		}
160962306a36Sopenharmony_ci		if (pud_none_or_clear_bad(pud))
161062306a36Sopenharmony_ci			continue;
161162306a36Sopenharmony_ci		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
161262306a36Sopenharmony_cinext:
161362306a36Sopenharmony_ci		cond_resched();
161462306a36Sopenharmony_ci	} while (pud++, addr = next, addr != end);
161562306a36Sopenharmony_ci
161662306a36Sopenharmony_ci	return addr;
161762306a36Sopenharmony_ci}
161862306a36Sopenharmony_ci
161962306a36Sopenharmony_cistatic inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
162062306a36Sopenharmony_ci				struct vm_area_struct *vma, pgd_t *pgd,
162162306a36Sopenharmony_ci				unsigned long addr, unsigned long end,
162262306a36Sopenharmony_ci				struct zap_details *details)
162362306a36Sopenharmony_ci{
162462306a36Sopenharmony_ci	p4d_t *p4d;
162562306a36Sopenharmony_ci	unsigned long next;
162662306a36Sopenharmony_ci
162762306a36Sopenharmony_ci	p4d = p4d_offset(pgd, addr);
162862306a36Sopenharmony_ci	do {
162962306a36Sopenharmony_ci		next = p4d_addr_end(addr, end);
163062306a36Sopenharmony_ci		if (p4d_none_or_clear_bad(p4d))
163162306a36Sopenharmony_ci			continue;
163262306a36Sopenharmony_ci		next = zap_pud_range(tlb, vma, p4d, addr, next, details);
163362306a36Sopenharmony_ci	} while (p4d++, addr = next, addr != end);
163462306a36Sopenharmony_ci
163562306a36Sopenharmony_ci	return addr;
163662306a36Sopenharmony_ci}
163762306a36Sopenharmony_ci
163862306a36Sopenharmony_civoid unmap_page_range(struct mmu_gather *tlb,
163962306a36Sopenharmony_ci			     struct vm_area_struct *vma,
164062306a36Sopenharmony_ci			     unsigned long addr, unsigned long end,
164162306a36Sopenharmony_ci			     struct zap_details *details)
164262306a36Sopenharmony_ci{
164362306a36Sopenharmony_ci	pgd_t *pgd;
164462306a36Sopenharmony_ci	unsigned long next;
164562306a36Sopenharmony_ci
164662306a36Sopenharmony_ci	BUG_ON(addr >= end);
164762306a36Sopenharmony_ci	tlb_start_vma(tlb, vma);
164862306a36Sopenharmony_ci	pgd = pgd_offset(vma->vm_mm, addr);
164962306a36Sopenharmony_ci	do {
165062306a36Sopenharmony_ci		next = pgd_addr_end(addr, end);
165162306a36Sopenharmony_ci		if (pgd_none_or_clear_bad(pgd))
165262306a36Sopenharmony_ci			continue;
165362306a36Sopenharmony_ci		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
165462306a36Sopenharmony_ci	} while (pgd++, addr = next, addr != end);
165562306a36Sopenharmony_ci	tlb_end_vma(tlb, vma);
165662306a36Sopenharmony_ci}
165762306a36Sopenharmony_ci
165862306a36Sopenharmony_ci
165962306a36Sopenharmony_cistatic void unmap_single_vma(struct mmu_gather *tlb,
166062306a36Sopenharmony_ci		struct vm_area_struct *vma, unsigned long start_addr,
166162306a36Sopenharmony_ci		unsigned long end_addr,
166262306a36Sopenharmony_ci		struct zap_details *details, bool mm_wr_locked)
166362306a36Sopenharmony_ci{
166462306a36Sopenharmony_ci	unsigned long start = max(vma->vm_start, start_addr);
166562306a36Sopenharmony_ci	unsigned long end;
166662306a36Sopenharmony_ci
166762306a36Sopenharmony_ci	if (start >= vma->vm_end)
166862306a36Sopenharmony_ci		return;
166962306a36Sopenharmony_ci	end = min(vma->vm_end, end_addr);
167062306a36Sopenharmony_ci	if (end <= vma->vm_start)
167162306a36Sopenharmony_ci		return;
167262306a36Sopenharmony_ci
167362306a36Sopenharmony_ci	if (vma->vm_file)
167462306a36Sopenharmony_ci		uprobe_munmap(vma, start, end);
167562306a36Sopenharmony_ci
167662306a36Sopenharmony_ci	if (unlikely(vma->vm_flags & VM_PFNMAP))
167762306a36Sopenharmony_ci		untrack_pfn(vma, 0, 0, mm_wr_locked);
167862306a36Sopenharmony_ci
167962306a36Sopenharmony_ci	if (start != end) {
168062306a36Sopenharmony_ci		if (unlikely(is_vm_hugetlb_page(vma))) {
168162306a36Sopenharmony_ci			/*
168262306a36Sopenharmony_ci			 * It is undesirable to test vma->vm_file as it
168362306a36Sopenharmony_ci			 * should be non-NULL for a valid hugetlb area.
168462306a36Sopenharmony_ci			 * However, vm_file will be NULL in the error
168562306a36Sopenharmony_ci			 * cleanup path of mmap_region. When the
168662306a36Sopenharmony_ci			 * hugetlbfs ->mmap method fails,
168762306a36Sopenharmony_ci			 * mmap_region() nullifies vma->vm_file
168862306a36Sopenharmony_ci			 * before calling this function to clean up.
168962306a36Sopenharmony_ci			 * Since no pte has actually been set up, it is
169062306a36Sopenharmony_ci			 * safe to do nothing in this case.
169162306a36Sopenharmony_ci			 */
169262306a36Sopenharmony_ci			if (vma->vm_file) {
169362306a36Sopenharmony_ci				zap_flags_t zap_flags = details ?
169462306a36Sopenharmony_ci				    details->zap_flags : 0;
169562306a36Sopenharmony_ci				__unmap_hugepage_range(tlb, vma, start, end,
169662306a36Sopenharmony_ci							     NULL, zap_flags);
169762306a36Sopenharmony_ci			}
169862306a36Sopenharmony_ci		} else
169962306a36Sopenharmony_ci			unmap_page_range(tlb, vma, start, end, details);
170062306a36Sopenharmony_ci	}
170162306a36Sopenharmony_ci}
170262306a36Sopenharmony_ci
170362306a36Sopenharmony_ci/**
170462306a36Sopenharmony_ci * unmap_vmas - unmap a range of memory covered by a list of vma's
170562306a36Sopenharmony_ci * @tlb: address of the caller's struct mmu_gather
170662306a36Sopenharmony_ci * @mas: the maple state
170762306a36Sopenharmony_ci * @vma: the starting vma
170862306a36Sopenharmony_ci * @start_addr: virtual address at which to start unmapping
170962306a36Sopenharmony_ci * @end_addr: virtual address at which to end unmapping
171062306a36Sopenharmony_ci * @tree_end: The maximum index to check
171162306a36Sopenharmony_ci * @mm_wr_locked: whether the mmap_lock is held for write
171262306a36Sopenharmony_ci *
171362306a36Sopenharmony_ci * Unmap all pages in the vma list.
171462306a36Sopenharmony_ci *
171562306a36Sopenharmony_ci * Only addresses between @start_addr and @end_addr will be unmapped.
171662306a36Sopenharmony_ci *
171762306a36Sopenharmony_ci * The VMA list must be sorted in ascending virtual address order.
171862306a36Sopenharmony_ci *
171962306a36Sopenharmony_ci * unmap_vmas() assumes that the caller will flush the whole unmapped address
172062306a36Sopenharmony_ci * range after unmap_vmas() returns.  So the only responsibility here is to
172162306a36Sopenharmony_ci * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
172262306a36Sopenharmony_ci * drops the lock and schedules.
172362306a36Sopenharmony_ci */
172462306a36Sopenharmony_civoid unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
172562306a36Sopenharmony_ci		struct vm_area_struct *vma, unsigned long start_addr,
172662306a36Sopenharmony_ci		unsigned long end_addr, unsigned long tree_end,
172762306a36Sopenharmony_ci		bool mm_wr_locked)
172862306a36Sopenharmony_ci{
172962306a36Sopenharmony_ci	struct mmu_notifier_range range;
173062306a36Sopenharmony_ci	struct zap_details details = {
173162306a36Sopenharmony_ci		.zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
173262306a36Sopenharmony_ci		/* Careful - we need to zap private pages too! */
173362306a36Sopenharmony_ci		.even_cows = true,
173462306a36Sopenharmony_ci	};
173562306a36Sopenharmony_ci
173662306a36Sopenharmony_ci	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
173762306a36Sopenharmony_ci				start_addr, end_addr);
173862306a36Sopenharmony_ci	mmu_notifier_invalidate_range_start(&range);
173962306a36Sopenharmony_ci	do {
174062306a36Sopenharmony_ci		unsigned long start = start_addr;
174162306a36Sopenharmony_ci		unsigned long end = end_addr;
174262306a36Sopenharmony_ci		hugetlb_zap_begin(vma, &start, &end);
174362306a36Sopenharmony_ci		unmap_single_vma(tlb, vma, start, end, &details,
174462306a36Sopenharmony_ci				 mm_wr_locked);
174562306a36Sopenharmony_ci		hugetlb_zap_end(vma, &details);
174662306a36Sopenharmony_ci	} while ((vma = mas_find(mas, tree_end - 1)) != NULL);
174762306a36Sopenharmony_ci	mmu_notifier_invalidate_range_end(&range);
174862306a36Sopenharmony_ci}
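
/*
 * Illustrative sketch only (not used in this file): a caller tearing down
 * every vma of an mm would typically pair unmap_vmas() with an mmu_gather,
 * mirroring the setup used by zap_page_range_single() below.  The maple
 * state "mas" is assumed to be positioned at the first vma of interest;
 * page-table freeing and error handling are omitted.
 */
static void __maybe_unused unmap_all_vmas_sketch(struct mm_struct *mm,
		struct ma_state *mas, struct vm_area_struct *first_vma)
{
	struct mmu_gather tlb;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);
	/* Unmap everything from first_vma to the end of the address space. */
	unmap_vmas(&tlb, mas, first_vma, 0, ULONG_MAX, ULONG_MAX, false);
	tlb_finish_mmu(&tlb);
}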
174962306a36Sopenharmony_ci
175062306a36Sopenharmony_ci/**
175162306a36Sopenharmony_ci * zap_page_range_single - remove user pages in a given range
175262306a36Sopenharmony_ci * @vma: vm_area_struct holding the applicable pages
175362306a36Sopenharmony_ci * @address: starting address of pages to zap
175462306a36Sopenharmony_ci * @size: number of bytes to zap
175562306a36Sopenharmony_ci * @details: details of shared cache invalidation
175662306a36Sopenharmony_ci *
175762306a36Sopenharmony_ci * The range must fit into one VMA.
175862306a36Sopenharmony_ci */
175962306a36Sopenharmony_civoid zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
176062306a36Sopenharmony_ci		unsigned long size, struct zap_details *details)
176162306a36Sopenharmony_ci{
176262306a36Sopenharmony_ci	const unsigned long end = address + size;
176362306a36Sopenharmony_ci	struct mmu_notifier_range range;
176462306a36Sopenharmony_ci	struct mmu_gather tlb;
176562306a36Sopenharmony_ci
176662306a36Sopenharmony_ci	lru_add_drain();
176762306a36Sopenharmony_ci	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
176862306a36Sopenharmony_ci				address, end);
176962306a36Sopenharmony_ci	hugetlb_zap_begin(vma, &range.start, &range.end);
177062306a36Sopenharmony_ci	tlb_gather_mmu(&tlb, vma->vm_mm);
177162306a36Sopenharmony_ci	update_hiwater_rss(vma->vm_mm);
177262306a36Sopenharmony_ci	mmu_notifier_invalidate_range_start(&range);
177362306a36Sopenharmony_ci	/*
177462306a36Sopenharmony_ci	 * unmap 'address-end' not 'range.start-range.end' as range
177562306a36Sopenharmony_ci	 * could have been expanded for hugetlb pmd sharing.
177662306a36Sopenharmony_ci	 */
177762306a36Sopenharmony_ci	unmap_single_vma(&tlb, vma, address, end, details, false);
177862306a36Sopenharmony_ci	mmu_notifier_invalidate_range_end(&range);
177962306a36Sopenharmony_ci	tlb_finish_mmu(&tlb);
178062306a36Sopenharmony_ci	hugetlb_zap_end(vma, details);
178162306a36Sopenharmony_ci}
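
/*
 * Illustrative sketch only: passing NULL details (as zap_vma_ptes() below
 * does) zaps everything in the range, while passing a zeroed zap_details
 * leaves private COWed anonymous pages in place (see should_zap_cows()
 * above).  The range is assumed to lie entirely within the vma.
 */
static void __maybe_unused zap_file_pages_only_sketch(struct vm_area_struct *vma,
		unsigned long addr, unsigned long len)
{
	struct zap_details details = {};

	zap_page_range_single(vma, addr, len, &details);
}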
178262306a36Sopenharmony_ci
178362306a36Sopenharmony_ci/**
178462306a36Sopenharmony_ci * zap_vma_ptes - remove ptes mapping the vma
178562306a36Sopenharmony_ci * @vma: vm_area_struct holding ptes to be zapped
178662306a36Sopenharmony_ci * @address: starting address of pages to zap
178762306a36Sopenharmony_ci * @size: number of bytes to zap
178862306a36Sopenharmony_ci *
178962306a36Sopenharmony_ci * This function only unmaps ptes assigned to VM_PFNMAP vmas.
179062306a36Sopenharmony_ci *
179162306a36Sopenharmony_ci * The entire address range must be fully contained within the vma.
179262306a36Sopenharmony_ci *
179362306a36Sopenharmony_ci */
179462306a36Sopenharmony_civoid zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
179562306a36Sopenharmony_ci		unsigned long size)
179662306a36Sopenharmony_ci{
179762306a36Sopenharmony_ci	if (!range_in_vma(vma, address, address + size) ||
179862306a36Sopenharmony_ci	    !(vma->vm_flags & VM_PFNMAP))
179962306a36Sopenharmony_ci		return;
180062306a36Sopenharmony_ci
180162306a36Sopenharmony_ci	zap_page_range_single(vma, address, size, NULL);
180262306a36Sopenharmony_ci}
180362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(zap_vma_ptes);
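
/*
 * Illustrative sketch only: a driver that earlier populated a VM_PFNMAP vma
 * (e.g. via remap_pfn_range()) can revoke userspace access on device removal
 * by zapping the whole vma, so that later accesses fault instead of touching
 * vanished hardware.  Locking of the vma is assumed to follow the caller's
 * usual rules.
 */
static void __maybe_unused revoke_pfnmap_sketch(struct vm_area_struct *vma)
{
	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}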
180462306a36Sopenharmony_ci
180562306a36Sopenharmony_cistatic pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
180662306a36Sopenharmony_ci{
180762306a36Sopenharmony_ci	pgd_t *pgd;
180862306a36Sopenharmony_ci	p4d_t *p4d;
180962306a36Sopenharmony_ci	pud_t *pud;
181062306a36Sopenharmony_ci	pmd_t *pmd;
181162306a36Sopenharmony_ci
181262306a36Sopenharmony_ci	pgd = pgd_offset(mm, addr);
181362306a36Sopenharmony_ci	p4d = p4d_alloc(mm, pgd, addr);
181462306a36Sopenharmony_ci	if (!p4d)
181562306a36Sopenharmony_ci		return NULL;
181662306a36Sopenharmony_ci	pud = pud_alloc(mm, p4d, addr);
181762306a36Sopenharmony_ci	if (!pud)
181862306a36Sopenharmony_ci		return NULL;
181962306a36Sopenharmony_ci	pmd = pmd_alloc(mm, pud, addr);
182062306a36Sopenharmony_ci	if (!pmd)
182162306a36Sopenharmony_ci		return NULL;
182262306a36Sopenharmony_ci
182362306a36Sopenharmony_ci	VM_BUG_ON(pmd_trans_huge(*pmd));
182462306a36Sopenharmony_ci	return pmd;
182562306a36Sopenharmony_ci}
182662306a36Sopenharmony_ci
182762306a36Sopenharmony_cipte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
182862306a36Sopenharmony_ci			spinlock_t **ptl)
182962306a36Sopenharmony_ci{
183062306a36Sopenharmony_ci	pmd_t *pmd = walk_to_pmd(mm, addr);
183162306a36Sopenharmony_ci
183262306a36Sopenharmony_ci	if (!pmd)
183362306a36Sopenharmony_ci		return NULL;
183462306a36Sopenharmony_ci	return pte_alloc_map_lock(mm, pmd, addr, ptl);
183562306a36Sopenharmony_ci}
183662306a36Sopenharmony_ci
183762306a36Sopenharmony_cistatic int validate_page_before_insert(struct page *page)
183862306a36Sopenharmony_ci{
183962306a36Sopenharmony_ci	if (PageAnon(page) || PageSlab(page) || page_has_type(page))
184062306a36Sopenharmony_ci		return -EINVAL;
184162306a36Sopenharmony_ci	flush_dcache_page(page);
184262306a36Sopenharmony_ci	return 0;
184362306a36Sopenharmony_ci}
184462306a36Sopenharmony_ci
184562306a36Sopenharmony_cistatic int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
184662306a36Sopenharmony_ci			unsigned long addr, struct page *page, pgprot_t prot)
184762306a36Sopenharmony_ci{
184862306a36Sopenharmony_ci	if (!pte_none(ptep_get(pte)))
184962306a36Sopenharmony_ci		return -EBUSY;
185062306a36Sopenharmony_ci	/* Ok, finally just insert the thing.. */
185162306a36Sopenharmony_ci	get_page(page);
185262306a36Sopenharmony_ci	inc_mm_counter(vma->vm_mm, mm_counter_file(page));
185362306a36Sopenharmony_ci	page_add_file_rmap(page, vma, false);
185462306a36Sopenharmony_ci	set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
185562306a36Sopenharmony_ci	return 0;
185662306a36Sopenharmony_ci}
185762306a36Sopenharmony_ci
185862306a36Sopenharmony_ci/*
185962306a36Sopenharmony_ci * This is the old fallback for page remapping.
186062306a36Sopenharmony_ci *
186162306a36Sopenharmony_ci * For historical reasons, it only allows reserved pages. Only
186262306a36Sopenharmony_ci * old drivers should use this, and they needed to mark their
186362306a36Sopenharmony_ci * pages reserved for the old functions anyway.
186462306a36Sopenharmony_ci */
186562306a36Sopenharmony_cistatic int insert_page(struct vm_area_struct *vma, unsigned long addr,
186662306a36Sopenharmony_ci			struct page *page, pgprot_t prot)
186762306a36Sopenharmony_ci{
186862306a36Sopenharmony_ci	int retval;
186962306a36Sopenharmony_ci	pte_t *pte;
187062306a36Sopenharmony_ci	spinlock_t *ptl;
187162306a36Sopenharmony_ci
187262306a36Sopenharmony_ci	retval = validate_page_before_insert(page);
187362306a36Sopenharmony_ci	if (retval)
187462306a36Sopenharmony_ci		goto out;
187562306a36Sopenharmony_ci	retval = -ENOMEM;
187662306a36Sopenharmony_ci	pte = get_locked_pte(vma->vm_mm, addr, &ptl);
187762306a36Sopenharmony_ci	if (!pte)
187862306a36Sopenharmony_ci		goto out;
187962306a36Sopenharmony_ci	retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
188062306a36Sopenharmony_ci	pte_unmap_unlock(pte, ptl);
188162306a36Sopenharmony_ciout:
188262306a36Sopenharmony_ci	return retval;
188362306a36Sopenharmony_ci}
188462306a36Sopenharmony_ci
188562306a36Sopenharmony_cistatic int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
188662306a36Sopenharmony_ci			unsigned long addr, struct page *page, pgprot_t prot)
188762306a36Sopenharmony_ci{
188862306a36Sopenharmony_ci	int err;
188962306a36Sopenharmony_ci
189062306a36Sopenharmony_ci	if (!page_count(page))
189162306a36Sopenharmony_ci		return -EINVAL;
189262306a36Sopenharmony_ci	err = validate_page_before_insert(page);
189362306a36Sopenharmony_ci	if (err)
189462306a36Sopenharmony_ci		return err;
189562306a36Sopenharmony_ci	return insert_page_into_pte_locked(vma, pte, addr, page, prot);
189662306a36Sopenharmony_ci}
189762306a36Sopenharmony_ci
189862306a36Sopenharmony_ci/* insert_pages() amortizes the cost of spinlock operations
189962306a36Sopenharmony_ci * when inserting pages in a loop.
190062306a36Sopenharmony_ci */
190162306a36Sopenharmony_cistatic int insert_pages(struct vm_area_struct *vma, unsigned long addr,
190262306a36Sopenharmony_ci			struct page **pages, unsigned long *num, pgprot_t prot)
190362306a36Sopenharmony_ci{
190462306a36Sopenharmony_ci	pmd_t *pmd = NULL;
190562306a36Sopenharmony_ci	pte_t *start_pte, *pte;
190662306a36Sopenharmony_ci	spinlock_t *pte_lock;
190762306a36Sopenharmony_ci	struct mm_struct *const mm = vma->vm_mm;
190862306a36Sopenharmony_ci	unsigned long curr_page_idx = 0;
190962306a36Sopenharmony_ci	unsigned long remaining_pages_total = *num;
191062306a36Sopenharmony_ci	unsigned long pages_to_write_in_pmd;
191162306a36Sopenharmony_ci	int ret;
191262306a36Sopenharmony_cimore:
191362306a36Sopenharmony_ci	ret = -EFAULT;
191462306a36Sopenharmony_ci	pmd = walk_to_pmd(mm, addr);
191562306a36Sopenharmony_ci	if (!pmd)
191662306a36Sopenharmony_ci		goto out;
191762306a36Sopenharmony_ci
191862306a36Sopenharmony_ci	pages_to_write_in_pmd = min_t(unsigned long,
191962306a36Sopenharmony_ci		remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
192062306a36Sopenharmony_ci
192162306a36Sopenharmony_ci	/* Allocate the PTE if necessary; takes PMD lock once only. */
192262306a36Sopenharmony_ci	ret = -ENOMEM;
192362306a36Sopenharmony_ci	if (pte_alloc(mm, pmd))
192462306a36Sopenharmony_ci		goto out;
192562306a36Sopenharmony_ci
192662306a36Sopenharmony_ci	while (pages_to_write_in_pmd) {
192762306a36Sopenharmony_ci		int pte_idx = 0;
192862306a36Sopenharmony_ci		const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
192962306a36Sopenharmony_ci
193062306a36Sopenharmony_ci		start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
193162306a36Sopenharmony_ci		if (!start_pte) {
193262306a36Sopenharmony_ci			ret = -EFAULT;
193362306a36Sopenharmony_ci			goto out;
193462306a36Sopenharmony_ci		}
193562306a36Sopenharmony_ci		for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
193662306a36Sopenharmony_ci			int err = insert_page_in_batch_locked(vma, pte,
193762306a36Sopenharmony_ci				addr, pages[curr_page_idx], prot);
193862306a36Sopenharmony_ci			if (unlikely(err)) {
193962306a36Sopenharmony_ci				pte_unmap_unlock(start_pte, pte_lock);
194062306a36Sopenharmony_ci				ret = err;
194162306a36Sopenharmony_ci				remaining_pages_total -= pte_idx;
194262306a36Sopenharmony_ci				goto out;
194362306a36Sopenharmony_ci			}
194462306a36Sopenharmony_ci			addr += PAGE_SIZE;
194562306a36Sopenharmony_ci			++curr_page_idx;
194662306a36Sopenharmony_ci		}
194762306a36Sopenharmony_ci		pte_unmap_unlock(start_pte, pte_lock);
194862306a36Sopenharmony_ci		pages_to_write_in_pmd -= batch_size;
194962306a36Sopenharmony_ci		remaining_pages_total -= batch_size;
195062306a36Sopenharmony_ci	}
195162306a36Sopenharmony_ci	if (remaining_pages_total)
195262306a36Sopenharmony_ci		goto more;
195362306a36Sopenharmony_ci	ret = 0;
195462306a36Sopenharmony_ciout:
195562306a36Sopenharmony_ci	*num = remaining_pages_total;
195662306a36Sopenharmony_ci	return ret;
195762306a36Sopenharmony_ci}
195862306a36Sopenharmony_ci
195962306a36Sopenharmony_ci/**
196062306a36Sopenharmony_ci * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
196162306a36Sopenharmony_ci * @vma: user vma to map to
196262306a36Sopenharmony_ci * @addr: target start user address of these pages
196362306a36Sopenharmony_ci * @pages: source kernel pages
196462306a36Sopenharmony_ci * @num: in: number of pages to map. out: number of pages that were *not*
196562306a36Sopenharmony_ci * mapped. (0 means all pages were successfully mapped).
196662306a36Sopenharmony_ci *
196762306a36Sopenharmony_ci * Preferred over vm_insert_page() when inserting multiple pages.
196862306a36Sopenharmony_ci *
196962306a36Sopenharmony_ci * In case of error, we may have mapped a subset of the provided
197062306a36Sopenharmony_ci * pages. It is the caller's responsibility to account for this case.
197162306a36Sopenharmony_ci *
197262306a36Sopenharmony_ci * The same restrictions apply as in vm_insert_page().
197362306a36Sopenharmony_ci */
197462306a36Sopenharmony_ciint vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
197562306a36Sopenharmony_ci			struct page **pages, unsigned long *num)
197662306a36Sopenharmony_ci{
197762306a36Sopenharmony_ci	const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
197862306a36Sopenharmony_ci
197962306a36Sopenharmony_ci	if (addr < vma->vm_start || end_addr >= vma->vm_end)
198062306a36Sopenharmony_ci		return -EFAULT;
198162306a36Sopenharmony_ci	if (!(vma->vm_flags & VM_MIXEDMAP)) {
198262306a36Sopenharmony_ci		BUG_ON(mmap_read_trylock(vma->vm_mm));
198362306a36Sopenharmony_ci		BUG_ON(vma->vm_flags & VM_PFNMAP);
198462306a36Sopenharmony_ci		vm_flags_set(vma, VM_MIXEDMAP);
198562306a36Sopenharmony_ci	}
198662306a36Sopenharmony_ci	/* Defer page refcount checking till we're about to map that page. */
198762306a36Sopenharmony_ci	return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
198862306a36Sopenharmony_ci}
198962306a36Sopenharmony_ciEXPORT_SYMBOL(vm_insert_pages);
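
/*
 * Illustrative sketch only: a driver ->mmap() handler can publish a
 * preallocated page array with a single batched call.  "pages" and
 * "nr_pages" describe a hypothetical driver-owned buffer; on return,
 * "num" holds how many pages were *not* mapped (0 on full success).
 */
static int __maybe_unused map_page_array_sketch(struct vm_area_struct *vma,
		struct page **pages, unsigned long nr_pages)
{
	unsigned long num = min(nr_pages, vma_pages(vma));

	return vm_insert_pages(vma, vma->vm_start, pages, &num);
}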
199062306a36Sopenharmony_ci
199162306a36Sopenharmony_ci/**
199262306a36Sopenharmony_ci * vm_insert_page - insert single page into user vma
199362306a36Sopenharmony_ci * @vma: user vma to map to
199462306a36Sopenharmony_ci * @addr: target user address of this page
199562306a36Sopenharmony_ci * @page: source kernel page
199662306a36Sopenharmony_ci *
199762306a36Sopenharmony_ci * This allows drivers to insert individual pages they've allocated
199862306a36Sopenharmony_ci * into a user vma.
199962306a36Sopenharmony_ci *
200062306a36Sopenharmony_ci * The page has to be a nice clean _individual_ kernel allocation.
200162306a36Sopenharmony_ci * If you allocate a compound page, you need to have marked it as
200262306a36Sopenharmony_ci * such (__GFP_COMP), or manually just split the page up yourself
200362306a36Sopenharmony_ci * (see split_page()).
200462306a36Sopenharmony_ci *
200562306a36Sopenharmony_ci * NOTE! Traditionally this was done with "remap_pfn_range()" which
200662306a36Sopenharmony_ci * took an arbitrary page protection parameter. This doesn't allow
200762306a36Sopenharmony_ci * that. Your vma protection will have to be set up correctly, which
200862306a36Sopenharmony_ci * means that if you want a shared writable mapping, you'd better
200962306a36Sopenharmony_ci * ask for a shared writable mapping!
201062306a36Sopenharmony_ci *
201162306a36Sopenharmony_ci * The page does not need to be reserved.
201262306a36Sopenharmony_ci *
201362306a36Sopenharmony_ci * Usually this function is called from f_op->mmap() handler
201462306a36Sopenharmony_ci * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
201562306a36Sopenharmony_ci * Caller must set VM_MIXEDMAP on vma if it wants to call this
201662306a36Sopenharmony_ci * function from other places, for example from page-fault handler.
201762306a36Sopenharmony_ci *
201862306a36Sopenharmony_ci * Return: %0 on success, negative error code otherwise.
201962306a36Sopenharmony_ci */
202062306a36Sopenharmony_ciint vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
202162306a36Sopenharmony_ci			struct page *page)
202262306a36Sopenharmony_ci{
202362306a36Sopenharmony_ci	if (addr < vma->vm_start || addr >= vma->vm_end)
202462306a36Sopenharmony_ci		return -EFAULT;
202562306a36Sopenharmony_ci	if (!page_count(page))
202662306a36Sopenharmony_ci		return -EINVAL;
202762306a36Sopenharmony_ci	if (!(vma->vm_flags & VM_MIXEDMAP)) {
202862306a36Sopenharmony_ci		BUG_ON(mmap_read_trylock(vma->vm_mm));
202962306a36Sopenharmony_ci		BUG_ON(vma->vm_flags & VM_PFNMAP);
203062306a36Sopenharmony_ci		vm_flags_set(vma, VM_MIXEDMAP);
203162306a36Sopenharmony_ci	}
203262306a36Sopenharmony_ci	return insert_page(vma, addr, page, vma->vm_page_prot);
203362306a36Sopenharmony_ci}
203462306a36Sopenharmony_ciEXPORT_SYMBOL(vm_insert_page);
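
/*
 * Illustrative sketch only: the simplest use is an ->mmap() handler exposing
 * one shared kernel page at the start of the vma.  "shared_page" stands for
 * a hypothetical page the driver allocated elsewhere (e.g. alloc_page()).
 */
static int __maybe_unused map_shared_page_sketch(struct vm_area_struct *vma,
		struct page *shared_page)
{
	return vm_insert_page(vma, vma->vm_start, shared_page);
}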
203562306a36Sopenharmony_ci
203662306a36Sopenharmony_ci/*
203762306a36Sopenharmony_ci * __vm_map_pages - maps range of kernel pages into user vma
203862306a36Sopenharmony_ci * @vma: user vma to map to
203962306a36Sopenharmony_ci * @pages: pointer to array of source kernel pages
204062306a36Sopenharmony_ci * @num: number of pages in page array
204162306a36Sopenharmony_ci * @offset: user's requested vm_pgoff
204262306a36Sopenharmony_ci *
204362306a36Sopenharmony_ci * This allows drivers to map range of kernel pages into a user vma.
204462306a36Sopenharmony_ci *
204562306a36Sopenharmony_ci * Return: 0 on success and error code otherwise.
204662306a36Sopenharmony_ci */
204762306a36Sopenharmony_cistatic int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
204862306a36Sopenharmony_ci				unsigned long num, unsigned long offset)
204962306a36Sopenharmony_ci{
205062306a36Sopenharmony_ci	unsigned long count = vma_pages(vma);
205162306a36Sopenharmony_ci	unsigned long uaddr = vma->vm_start;
205262306a36Sopenharmony_ci	int ret, i;
205362306a36Sopenharmony_ci
205462306a36Sopenharmony_ci	/* Fail if the user requested offset is beyond the end of the object */
205562306a36Sopenharmony_ci	if (offset >= num)
205662306a36Sopenharmony_ci		return -ENXIO;
205762306a36Sopenharmony_ci
205862306a36Sopenharmony_ci	/* Fail if the user requested size exceeds available object size */
205962306a36Sopenharmony_ci	if (count > num - offset)
206062306a36Sopenharmony_ci		return -ENXIO;
206162306a36Sopenharmony_ci
206262306a36Sopenharmony_ci	for (i = 0; i < count; i++) {
206362306a36Sopenharmony_ci		ret = vm_insert_page(vma, uaddr, pages[offset + i]);
206462306a36Sopenharmony_ci		if (ret < 0)
206562306a36Sopenharmony_ci			return ret;
206662306a36Sopenharmony_ci		uaddr += PAGE_SIZE;
206762306a36Sopenharmony_ci	}
206862306a36Sopenharmony_ci
206962306a36Sopenharmony_ci	return 0;
207062306a36Sopenharmony_ci}
207162306a36Sopenharmony_ci
207262306a36Sopenharmony_ci/**
207362306a36Sopenharmony_ci * vm_map_pages - map a range of kernel pages starting at a non-zero offset
207462306a36Sopenharmony_ci * @vma: user vma to map to
207562306a36Sopenharmony_ci * @pages: pointer to array of source kernel pages
207662306a36Sopenharmony_ci * @num: number of pages in page array
207762306a36Sopenharmony_ci *
207862306a36Sopenharmony_ci * Maps an object consisting of @num pages, catering for the user's
207962306a36Sopenharmony_ci * requested vm_pgoff
208062306a36Sopenharmony_ci *
208162306a36Sopenharmony_ci * If we fail to insert any page into the vma, the function will return
208262306a36Sopenharmony_ci * immediately leaving any previously inserted pages present.  Callers
208362306a36Sopenharmony_ci * from the mmap handler may immediately return the error as their caller
208462306a36Sopenharmony_ci * will destroy the vma, removing any successfully inserted pages. Other
208562306a36Sopenharmony_ci * callers should make their own arrangements for calling unmap_region().
208662306a36Sopenharmony_ci *
208762306a36Sopenharmony_ci * Context: Process context. Called by mmap handlers.
208862306a36Sopenharmony_ci * Return: 0 on success and error code otherwise.
208962306a36Sopenharmony_ci */
209062306a36Sopenharmony_ciint vm_map_pages(struct vm_area_struct *vma, struct page **pages,
209162306a36Sopenharmony_ci				unsigned long num)
209262306a36Sopenharmony_ci{
209362306a36Sopenharmony_ci	return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
209462306a36Sopenharmony_ci}
209562306a36Sopenharmony_ciEXPORT_SYMBOL(vm_map_pages);
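/*
 * A minimal sketch of how a driver's mmap handler might use vm_map_pages().
 * The buffer structure and names (struct my_buf, my_mmap) are hypothetical
 * and only illustrate the calling convention; vm_map_pages() applies the
 * caller's vma->vm_pgoff itself, so the handler does not need to.
 *
 *	struct my_buf {
 *		struct page **pages;	// kernel pages backing the buffer
 *		unsigned long nr_pages;	// number of entries in pages[]
 *	};
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct my_buf *buf = file->private_data;
 *
 *		// Errors can be returned directly: the mmap caller destroys
 *		// the vma, which also removes any pages already inserted.
 *		return vm_map_pages(vma, buf->pages, buf->nr_pages);
 *	}
 */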
209662306a36Sopenharmony_ci
209762306a36Sopenharmony_ci/**
209862306a36Sopenharmony_ci * vm_map_pages_zero - map a range of kernel pages, starting at offset zero
209962306a36Sopenharmony_ci * @vma: user vma to map to
210062306a36Sopenharmony_ci * @pages: pointer to array of source kernel pages
210162306a36Sopenharmony_ci * @num: number of pages in page array
210262306a36Sopenharmony_ci *
210362306a36Sopenharmony_ci * Similar to vm_map_pages(), except that it explicitly sets the offset
210462306a36Sopenharmony_ci * to 0. This function is intended for drivers that do not take vm_pgoff
210562306a36Sopenharmony_ci * into account.
210662306a36Sopenharmony_ci *
210762306a36Sopenharmony_ci * Context: Process context. Called by mmap handlers.
210862306a36Sopenharmony_ci * Return: 0 on success and error code otherwise.
210962306a36Sopenharmony_ci */
211062306a36Sopenharmony_ciint vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
211162306a36Sopenharmony_ci				unsigned long num)
211262306a36Sopenharmony_ci{
211362306a36Sopenharmony_ci	return __vm_map_pages(vma, pages, num, 0);
211462306a36Sopenharmony_ci}
211562306a36Sopenharmony_ciEXPORT_SYMBOL(vm_map_pages_zero);
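/*
 * Hedged usage sketch (same hypothetical names as the vm_map_pages() example
 * above): a driver that deliberately ignores the mmap offset would instead do
 *
 *	return vm_map_pages_zero(vma, buf->pages, buf->nr_pages);
 *
 * which always maps the object from page 0 regardless of vma->vm_pgoff.
 */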
211662306a36Sopenharmony_ci
211762306a36Sopenharmony_cistatic vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
211862306a36Sopenharmony_ci			pfn_t pfn, pgprot_t prot, bool mkwrite)
211962306a36Sopenharmony_ci{
212062306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
212162306a36Sopenharmony_ci	pte_t *pte, entry;
212262306a36Sopenharmony_ci	spinlock_t *ptl;
212362306a36Sopenharmony_ci
212462306a36Sopenharmony_ci	pte = get_locked_pte(mm, addr, &ptl);
212562306a36Sopenharmony_ci	if (!pte)
212662306a36Sopenharmony_ci		return VM_FAULT_OOM;
212762306a36Sopenharmony_ci	entry = ptep_get(pte);
212862306a36Sopenharmony_ci	if (!pte_none(entry)) {
212962306a36Sopenharmony_ci		if (mkwrite) {
213062306a36Sopenharmony_ci			/*
213162306a36Sopenharmony_ci			 * For read faults on private mappings the PFN passed
213262306a36Sopenharmony_ci			 * in may not match the PFN we have mapped if the
213362306a36Sopenharmony_ci			 * mapped PFN is a writeable COW page.  In the mkwrite
213462306a36Sopenharmony_ci			 * case we are creating a writable PTE for a shared
213562306a36Sopenharmony_ci			 * mapping and we expect the PFNs to match. If they
213662306a36Sopenharmony_ci			 * don't match, we are likely racing with block
213762306a36Sopenharmony_ci			 * allocation and mapping invalidation so just skip the
213862306a36Sopenharmony_ci			 * update.
213962306a36Sopenharmony_ci			 */
214062306a36Sopenharmony_ci			if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
214162306a36Sopenharmony_ci				WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
214262306a36Sopenharmony_ci				goto out_unlock;
214362306a36Sopenharmony_ci			}
214462306a36Sopenharmony_ci			entry = pte_mkyoung(entry);
214562306a36Sopenharmony_ci			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
214662306a36Sopenharmony_ci			if (ptep_set_access_flags(vma, addr, pte, entry, 1))
214762306a36Sopenharmony_ci				update_mmu_cache(vma, addr, pte);
214862306a36Sopenharmony_ci		}
214962306a36Sopenharmony_ci		goto out_unlock;
215062306a36Sopenharmony_ci	}
215162306a36Sopenharmony_ci
215262306a36Sopenharmony_ci	/* Ok, finally just insert the thing.. */
215362306a36Sopenharmony_ci	if (pfn_t_devmap(pfn))
215462306a36Sopenharmony_ci		entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
215562306a36Sopenharmony_ci	else
215662306a36Sopenharmony_ci		entry = pte_mkspecial(pfn_t_pte(pfn, prot));
215762306a36Sopenharmony_ci
215862306a36Sopenharmony_ci	if (mkwrite) {
215962306a36Sopenharmony_ci		entry = pte_mkyoung(entry);
216062306a36Sopenharmony_ci		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
216162306a36Sopenharmony_ci	}
216262306a36Sopenharmony_ci
216362306a36Sopenharmony_ci	set_pte_at(mm, addr, pte, entry);
216462306a36Sopenharmony_ci	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
216562306a36Sopenharmony_ci
216662306a36Sopenharmony_ciout_unlock:
216762306a36Sopenharmony_ci	pte_unmap_unlock(pte, ptl);
216862306a36Sopenharmony_ci	return VM_FAULT_NOPAGE;
216962306a36Sopenharmony_ci}
217062306a36Sopenharmony_ci
217162306a36Sopenharmony_ci/**
217262306a36Sopenharmony_ci * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
217362306a36Sopenharmony_ci * @vma: user vma to map to
217462306a36Sopenharmony_ci * @addr: target user address of this page
217562306a36Sopenharmony_ci * @pfn: source kernel pfn
217662306a36Sopenharmony_ci * @pgprot: pgprot flags for the inserted page
217762306a36Sopenharmony_ci *
217862306a36Sopenharmony_ci * This is exactly like vmf_insert_pfn(), except that it allows drivers
217962306a36Sopenharmony_ci * to override pgprot on a per-page basis.
218062306a36Sopenharmony_ci *
218162306a36Sopenharmony_ci * This only makes sense for IO mappings, and it makes no sense for
218262306a36Sopenharmony_ci * COW mappings.  In general, using multiple vmas is preferable;
218362306a36Sopenharmony_ci * vmf_insert_pfn_prot should only be used if using multiple VMAs is
218462306a36Sopenharmony_ci * impractical.
218562306a36Sopenharmony_ci *
218662306a36Sopenharmony_ci * pgprot typically only differs from @vma->vm_page_prot when drivers set
218762306a36Sopenharmony_ci * caching- and encryption bits different than those of @vma->vm_page_prot,
218862306a36Sopenharmony_ci * because the caching- or encryption mode may not be known at mmap() time.
218962306a36Sopenharmony_ci *
219062306a36Sopenharmony_ci * This is ok as long as @vma->vm_page_prot is not used by the core vm
219162306a36Sopenharmony_ci * to set caching and encryption bits for those vmas (except for COW pages).
219262306a36Sopenharmony_ci * This is ensured by core vm only modifying these page table entries using
219362306a36Sopenharmony_ci * functions that don't touch caching- or encryption bits, using pte_modify()
219462306a36Sopenharmony_ci * if needed. (See for example mprotect()).
219562306a36Sopenharmony_ci *
219662306a36Sopenharmony_ci * Also when new page-table entries are created, this is only done using the
219762306a36Sopenharmony_ci * fault() callback, and never using the value of vma->vm_page_prot,
219862306a36Sopenharmony_ci * except for page-table entries that point to anonymous pages as the result
219962306a36Sopenharmony_ci * of COW.
220062306a36Sopenharmony_ci *
220162306a36Sopenharmony_ci * Context: Process context.  May allocate using %GFP_KERNEL.
220262306a36Sopenharmony_ci * Return: vm_fault_t value.
220362306a36Sopenharmony_ci */
220462306a36Sopenharmony_civm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
220562306a36Sopenharmony_ci			unsigned long pfn, pgprot_t pgprot)
220662306a36Sopenharmony_ci{
220762306a36Sopenharmony_ci	/*
220862306a36Sopenharmony_ci	 * Technically, architectures with pte_special can avoid all these
220962306a36Sopenharmony_ci	 * restrictions (same for remap_pfn_range).  However we would like
221062306a36Sopenharmony_ci	 * consistency in testing and feature parity among all, so we should
221162306a36Sopenharmony_ci	 * try to keep these invariants in place for everybody.
221262306a36Sopenharmony_ci	 */
221362306a36Sopenharmony_ci	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
221462306a36Sopenharmony_ci	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
221562306a36Sopenharmony_ci						(VM_PFNMAP|VM_MIXEDMAP));
221662306a36Sopenharmony_ci	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
221762306a36Sopenharmony_ci	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
221862306a36Sopenharmony_ci
221962306a36Sopenharmony_ci	if (addr < vma->vm_start || addr >= vma->vm_end)
222062306a36Sopenharmony_ci		return VM_FAULT_SIGBUS;
222162306a36Sopenharmony_ci
222262306a36Sopenharmony_ci	if (!pfn_modify_allowed(pfn, pgprot))
222362306a36Sopenharmony_ci		return VM_FAULT_SIGBUS;
222462306a36Sopenharmony_ci
222562306a36Sopenharmony_ci	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
222662306a36Sopenharmony_ci
222762306a36Sopenharmony_ci	return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
222862306a36Sopenharmony_ci			false);
222962306a36Sopenharmony_ci}
223062306a36Sopenharmony_ciEXPORT_SYMBOL(vmf_insert_pfn_prot);
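/*
 * Sketch of a fault handler that overrides caching attributes on a per-page
 * basis, as described above.  The names (my_fault, struct my_dev, base_pfn)
 * and the choice of pgprot_writecombine() are assumptions for illustration
 * only, not code from this file.
 *
 *	static vm_fault_t my_fault(struct vm_fault *vmf)
 *	{
 *		struct my_dev *dev = vmf->vma->vm_private_data;
 *		unsigned long pfn = dev->base_pfn + vmf->pgoff;
 *
 *		// Map this particular page write-combined even though
 *		// vma->vm_page_prot may specify something else.
 *		return vmf_insert_pfn_prot(vmf->vma, vmf->address, pfn,
 *				pgprot_writecombine(vmf->vma->vm_page_prot));
 *	}
 */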
223162306a36Sopenharmony_ci
223262306a36Sopenharmony_ci/**
223362306a36Sopenharmony_ci * vmf_insert_pfn - insert single pfn into user vma
223462306a36Sopenharmony_ci * @vma: user vma to map to
223562306a36Sopenharmony_ci * @addr: target user address of this page
223662306a36Sopenharmony_ci * @pfn: source kernel pfn
223762306a36Sopenharmony_ci *
223862306a36Sopenharmony_ci * Similar to vm_insert_page, this allows drivers to insert individual pages
223962306a36Sopenharmony_ci * they've allocated into a user vma. Same comments apply.
224062306a36Sopenharmony_ci *
224162306a36Sopenharmony_ci * This function should only be called from a vm_ops->fault handler, and
224262306a36Sopenharmony_ci * in that case the handler should return the result of this function.
224362306a36Sopenharmony_ci *
224462306a36Sopenharmony_ci * vma cannot be a COW mapping.
224562306a36Sopenharmony_ci *
224662306a36Sopenharmony_ci * As this is called only for pages that do not currently exist, we
224762306a36Sopenharmony_ci * do not need to flush old virtual caches or the TLB.
224862306a36Sopenharmony_ci *
224962306a36Sopenharmony_ci * Context: Process context.  May allocate using %GFP_KERNEL.
225062306a36Sopenharmony_ci * Return: vm_fault_t value.
225162306a36Sopenharmony_ci */
225262306a36Sopenharmony_civm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
225362306a36Sopenharmony_ci			unsigned long pfn)
225462306a36Sopenharmony_ci{
225562306a36Sopenharmony_ci	return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
225662306a36Sopenharmony_ci}
225762306a36Sopenharmony_ciEXPORT_SYMBOL(vmf_insert_pfn);
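/*
 * Minimal sketch of the intended calling pattern: a vm_ops->fault handler
 * that simply returns the result of vmf_insert_pfn().  The pfn computation
 * (dev->base_pfn + vmf->pgoff) and the names are hypothetical.
 *
 *	static vm_fault_t my_pfn_fault(struct vm_fault *vmf)
 *	{
 *		struct my_dev *dev = vmf->vma->vm_private_data;
 *
 *		return vmf_insert_pfn(vmf->vma, vmf->address,
 *				      dev->base_pfn + vmf->pgoff);
 *	}
 */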
225862306a36Sopenharmony_ci
225962306a36Sopenharmony_cistatic bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
226062306a36Sopenharmony_ci{
226162306a36Sopenharmony_ci	/* these checks mirror the abort conditions in vm_normal_page */
226262306a36Sopenharmony_ci	if (vma->vm_flags & VM_MIXEDMAP)
226362306a36Sopenharmony_ci		return true;
226462306a36Sopenharmony_ci	if (pfn_t_devmap(pfn))
226562306a36Sopenharmony_ci		return true;
226662306a36Sopenharmony_ci	if (pfn_t_special(pfn))
226762306a36Sopenharmony_ci		return true;
226862306a36Sopenharmony_ci	if (is_zero_pfn(pfn_t_to_pfn(pfn)))
226962306a36Sopenharmony_ci		return true;
227062306a36Sopenharmony_ci	return false;
227162306a36Sopenharmony_ci}
227262306a36Sopenharmony_ci
227362306a36Sopenharmony_cistatic vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
227462306a36Sopenharmony_ci		unsigned long addr, pfn_t pfn, bool mkwrite)
227562306a36Sopenharmony_ci{
227662306a36Sopenharmony_ci	pgprot_t pgprot = vma->vm_page_prot;
227762306a36Sopenharmony_ci	int err;
227862306a36Sopenharmony_ci
227962306a36Sopenharmony_ci	BUG_ON(!vm_mixed_ok(vma, pfn));
228062306a36Sopenharmony_ci
228162306a36Sopenharmony_ci	if (addr < vma->vm_start || addr >= vma->vm_end)
228262306a36Sopenharmony_ci		return VM_FAULT_SIGBUS;
228362306a36Sopenharmony_ci
228462306a36Sopenharmony_ci	track_pfn_insert(vma, &pgprot, pfn);
228562306a36Sopenharmony_ci
228662306a36Sopenharmony_ci	if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
228762306a36Sopenharmony_ci		return VM_FAULT_SIGBUS;
228862306a36Sopenharmony_ci
228962306a36Sopenharmony_ci	/*
229062306a36Sopenharmony_ci	 * If we don't have pte special, then we have to use the pfn_valid()
229162306a36Sopenharmony_ci	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
229262306a36Sopenharmony_ci	 * refcount the page if pfn_valid is true (hence insert_page rather
229362306a36Sopenharmony_ci	 * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
229462306a36Sopenharmony_ci * without pte special, it would then be refcounted there as a normal page.
229562306a36Sopenharmony_ci	 */
229662306a36Sopenharmony_ci	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
229762306a36Sopenharmony_ci	    !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
229862306a36Sopenharmony_ci		struct page *page;
229962306a36Sopenharmony_ci
230062306a36Sopenharmony_ci		/*
230162306a36Sopenharmony_ci		 * At this point we are committed to insert_page()
230262306a36Sopenharmony_ci		 * regardless of whether the caller specified flags that
230362306a36Sopenharmony_ci		 * result in pfn_t_has_page() == false.
230462306a36Sopenharmony_ci		 */
230562306a36Sopenharmony_ci		page = pfn_to_page(pfn_t_to_pfn(pfn));
230662306a36Sopenharmony_ci		err = insert_page(vma, addr, page, pgprot);
230762306a36Sopenharmony_ci	} else {
230862306a36Sopenharmony_ci		return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
230962306a36Sopenharmony_ci	}
231062306a36Sopenharmony_ci
231162306a36Sopenharmony_ci	if (err == -ENOMEM)
231262306a36Sopenharmony_ci		return VM_FAULT_OOM;
231362306a36Sopenharmony_ci	if (err < 0 && err != -EBUSY)
231462306a36Sopenharmony_ci		return VM_FAULT_SIGBUS;
231562306a36Sopenharmony_ci
231662306a36Sopenharmony_ci	return VM_FAULT_NOPAGE;
231762306a36Sopenharmony_ci}
231862306a36Sopenharmony_ci
231962306a36Sopenharmony_civm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
232062306a36Sopenharmony_ci		pfn_t pfn)
232162306a36Sopenharmony_ci{
232262306a36Sopenharmony_ci	return __vm_insert_mixed(vma, addr, pfn, false);
232362306a36Sopenharmony_ci}
232462306a36Sopenharmony_ciEXPORT_SYMBOL(vmf_insert_mixed);
232562306a36Sopenharmony_ci
232662306a36Sopenharmony_ci/*
232762306a36Sopenharmony_ci *  If insertion of the PTE failed because someone else already added a
232862306a36Sopenharmony_ci *  different entry in the meantime, we treat that as success, as we assume
232962306a36Sopenharmony_ci *  the same entry was actually inserted.
233062306a36Sopenharmony_ci */
233162306a36Sopenharmony_civm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
233262306a36Sopenharmony_ci		unsigned long addr, pfn_t pfn)
233362306a36Sopenharmony_ci{
233462306a36Sopenharmony_ci	return __vm_insert_mixed(vma, addr, pfn, true);
233562306a36Sopenharmony_ci}
233662306a36Sopenharmony_ciEXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
233762306a36Sopenharmony_ci
233862306a36Sopenharmony_ci/*
233962306a36Sopenharmony_ci * Maps a range of physical memory into the requested pages. The old
234062306a36Sopenharmony_ci * mappings are removed. Any references to nonexistent pages result
234162306a36Sopenharmony_ci * in null mappings (currently treated as "copy-on-access").
234262306a36Sopenharmony_ci */
234362306a36Sopenharmony_cistatic int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
234462306a36Sopenharmony_ci			unsigned long addr, unsigned long end,
234562306a36Sopenharmony_ci			unsigned long pfn, pgprot_t prot)
234662306a36Sopenharmony_ci{
234762306a36Sopenharmony_ci	pte_t *pte, *mapped_pte;
234862306a36Sopenharmony_ci	spinlock_t *ptl;
234962306a36Sopenharmony_ci	int err = 0;
235062306a36Sopenharmony_ci
235162306a36Sopenharmony_ci	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
235262306a36Sopenharmony_ci	if (!pte)
235362306a36Sopenharmony_ci		return -ENOMEM;
235462306a36Sopenharmony_ci	arch_enter_lazy_mmu_mode();
235562306a36Sopenharmony_ci	do {
235662306a36Sopenharmony_ci		BUG_ON(!pte_none(ptep_get(pte)));
235762306a36Sopenharmony_ci		if (!pfn_modify_allowed(pfn, prot)) {
235862306a36Sopenharmony_ci			err = -EACCES;
235962306a36Sopenharmony_ci			break;
236062306a36Sopenharmony_ci		}
236162306a36Sopenharmony_ci		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
236262306a36Sopenharmony_ci		pfn++;
236362306a36Sopenharmony_ci	} while (pte++, addr += PAGE_SIZE, addr != end);
236462306a36Sopenharmony_ci	arch_leave_lazy_mmu_mode();
236562306a36Sopenharmony_ci	pte_unmap_unlock(mapped_pte, ptl);
236662306a36Sopenharmony_ci	return err;
236762306a36Sopenharmony_ci}
236862306a36Sopenharmony_ci
236962306a36Sopenharmony_cistatic inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
237062306a36Sopenharmony_ci			unsigned long addr, unsigned long end,
237162306a36Sopenharmony_ci			unsigned long pfn, pgprot_t prot)
237262306a36Sopenharmony_ci{
237362306a36Sopenharmony_ci	pmd_t *pmd;
237462306a36Sopenharmony_ci	unsigned long next;
237562306a36Sopenharmony_ci	int err;
237662306a36Sopenharmony_ci
237762306a36Sopenharmony_ci	pfn -= addr >> PAGE_SHIFT;
237862306a36Sopenharmony_ci	pmd = pmd_alloc(mm, pud, addr);
237962306a36Sopenharmony_ci	if (!pmd)
238062306a36Sopenharmony_ci		return -ENOMEM;
238162306a36Sopenharmony_ci	VM_BUG_ON(pmd_trans_huge(*pmd));
238262306a36Sopenharmony_ci	do {
238362306a36Sopenharmony_ci		next = pmd_addr_end(addr, end);
238462306a36Sopenharmony_ci		err = remap_pte_range(mm, pmd, addr, next,
238562306a36Sopenharmony_ci				pfn + (addr >> PAGE_SHIFT), prot);
238662306a36Sopenharmony_ci		if (err)
238762306a36Sopenharmony_ci			return err;
238862306a36Sopenharmony_ci	} while (pmd++, addr = next, addr != end);
238962306a36Sopenharmony_ci	return 0;
239062306a36Sopenharmony_ci}
239162306a36Sopenharmony_ci
239262306a36Sopenharmony_cistatic inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
239362306a36Sopenharmony_ci			unsigned long addr, unsigned long end,
239462306a36Sopenharmony_ci			unsigned long pfn, pgprot_t prot)
239562306a36Sopenharmony_ci{
239662306a36Sopenharmony_ci	pud_t *pud;
239762306a36Sopenharmony_ci	unsigned long next;
239862306a36Sopenharmony_ci	int err;
239962306a36Sopenharmony_ci
240062306a36Sopenharmony_ci	pfn -= addr >> PAGE_SHIFT;
240162306a36Sopenharmony_ci	pud = pud_alloc(mm, p4d, addr);
240262306a36Sopenharmony_ci	if (!pud)
240362306a36Sopenharmony_ci		return -ENOMEM;
240462306a36Sopenharmony_ci	do {
240562306a36Sopenharmony_ci		next = pud_addr_end(addr, end);
240662306a36Sopenharmony_ci		err = remap_pmd_range(mm, pud, addr, next,
240762306a36Sopenharmony_ci				pfn + (addr >> PAGE_SHIFT), prot);
240862306a36Sopenharmony_ci		if (err)
240962306a36Sopenharmony_ci			return err;
241062306a36Sopenharmony_ci	} while (pud++, addr = next, addr != end);
241162306a36Sopenharmony_ci	return 0;
241262306a36Sopenharmony_ci}
241362306a36Sopenharmony_ci
241462306a36Sopenharmony_cistatic inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
241562306a36Sopenharmony_ci			unsigned long addr, unsigned long end,
241662306a36Sopenharmony_ci			unsigned long pfn, pgprot_t prot)
241762306a36Sopenharmony_ci{
241862306a36Sopenharmony_ci	p4d_t *p4d;
241962306a36Sopenharmony_ci	unsigned long next;
242062306a36Sopenharmony_ci	int err;
242162306a36Sopenharmony_ci
242262306a36Sopenharmony_ci	pfn -= addr >> PAGE_SHIFT;
242362306a36Sopenharmony_ci	p4d = p4d_alloc(mm, pgd, addr);
242462306a36Sopenharmony_ci	if (!p4d)
242562306a36Sopenharmony_ci		return -ENOMEM;
242662306a36Sopenharmony_ci	do {
242762306a36Sopenharmony_ci		next = p4d_addr_end(addr, end);
242862306a36Sopenharmony_ci		err = remap_pud_range(mm, p4d, addr, next,
242962306a36Sopenharmony_ci				pfn + (addr >> PAGE_SHIFT), prot);
243062306a36Sopenharmony_ci		if (err)
243162306a36Sopenharmony_ci			return err;
243262306a36Sopenharmony_ci	} while (p4d++, addr = next, addr != end);
243362306a36Sopenharmony_ci	return 0;
243462306a36Sopenharmony_ci}
243562306a36Sopenharmony_ci
243662306a36Sopenharmony_ci/*
243762306a36Sopenharmony_ci * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
243862306a36Sopenharmony_ci * must have pre-validated the caching bits of the pgprot_t.
243962306a36Sopenharmony_ci */
244062306a36Sopenharmony_ciint remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
244162306a36Sopenharmony_ci		unsigned long pfn, unsigned long size, pgprot_t prot)
244262306a36Sopenharmony_ci{
244362306a36Sopenharmony_ci	pgd_t *pgd;
244462306a36Sopenharmony_ci	unsigned long next;
244562306a36Sopenharmony_ci	unsigned long end = addr + PAGE_ALIGN(size);
244662306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
244762306a36Sopenharmony_ci	int err;
244862306a36Sopenharmony_ci
244962306a36Sopenharmony_ci	if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
245062306a36Sopenharmony_ci		return -EINVAL;
245162306a36Sopenharmony_ci
245262306a36Sopenharmony_ci	/*
245362306a36Sopenharmony_ci	 * Physically remapped pages are special. Tell the
245462306a36Sopenharmony_ci	 * rest of the world about it:
245562306a36Sopenharmony_ci	 *   VM_IO tells people not to look at these pages
245662306a36Sopenharmony_ci	 *	(accesses can have side effects).
245762306a36Sopenharmony_ci	 *   VM_PFNMAP tells the core MM that the base pages are just
245862306a36Sopenharmony_ci	 *	raw PFN mappings, and do not have a "struct page" associated
245962306a36Sopenharmony_ci	 *	with them.
246062306a36Sopenharmony_ci	 *   VM_DONTEXPAND
246162306a36Sopenharmony_ci	 *      Disable vma merging and expanding with mremap().
246262306a36Sopenharmony_ci	 *   VM_DONTDUMP
246362306a36Sopenharmony_ci	 *      Omit vma from core dump, even when VM_IO turned off.
246462306a36Sopenharmony_ci	 *
246562306a36Sopenharmony_ci	 * There's a horrible special case to handle copy-on-write
246662306a36Sopenharmony_ci	 * behaviour that some programs depend on. We mark the "original"
246762306a36Sopenharmony_ci	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
246862306a36Sopenharmony_ci	 * See vm_normal_page() for details.
246962306a36Sopenharmony_ci	 */
247062306a36Sopenharmony_ci	if (is_cow_mapping(vma->vm_flags)) {
247162306a36Sopenharmony_ci		if (addr != vma->vm_start || end != vma->vm_end)
247262306a36Sopenharmony_ci			return -EINVAL;
247362306a36Sopenharmony_ci		vma->vm_pgoff = pfn;
247462306a36Sopenharmony_ci	}
247562306a36Sopenharmony_ci
247662306a36Sopenharmony_ci	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
247762306a36Sopenharmony_ci
247862306a36Sopenharmony_ci	BUG_ON(addr >= end);
247962306a36Sopenharmony_ci	pfn -= addr >> PAGE_SHIFT;
248062306a36Sopenharmony_ci	pgd = pgd_offset(mm, addr);
248162306a36Sopenharmony_ci	flush_cache_range(vma, addr, end);
248262306a36Sopenharmony_ci	do {
248362306a36Sopenharmony_ci		next = pgd_addr_end(addr, end);
248462306a36Sopenharmony_ci		err = remap_p4d_range(mm, pgd, addr, next,
248562306a36Sopenharmony_ci				pfn + (addr >> PAGE_SHIFT), prot);
248662306a36Sopenharmony_ci		if (err)
248762306a36Sopenharmony_ci			return err;
248862306a36Sopenharmony_ci	} while (pgd++, addr = next, addr != end);
248962306a36Sopenharmony_ci
249062306a36Sopenharmony_ci	return 0;
249162306a36Sopenharmony_ci}
249262306a36Sopenharmony_ci
249362306a36Sopenharmony_ci/**
249462306a36Sopenharmony_ci * remap_pfn_range - remap kernel memory to userspace
249562306a36Sopenharmony_ci * @vma: user vma to map to
249662306a36Sopenharmony_ci * @addr: target page aligned user address to start at
249762306a36Sopenharmony_ci * @pfn: page frame number of kernel physical memory address
249862306a36Sopenharmony_ci * @size: size of mapping area
249962306a36Sopenharmony_ci * @prot: page protection flags for this mapping
250062306a36Sopenharmony_ci *
250162306a36Sopenharmony_ci * Note: this is only safe if the mm semaphore is held when called.
250262306a36Sopenharmony_ci *
250362306a36Sopenharmony_ci * Return: %0 on success, negative error code otherwise.
250462306a36Sopenharmony_ci */
250562306a36Sopenharmony_ciint remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
250662306a36Sopenharmony_ci		    unsigned long pfn, unsigned long size, pgprot_t prot)
250762306a36Sopenharmony_ci{
250862306a36Sopenharmony_ci	int err;
250962306a36Sopenharmony_ci
251062306a36Sopenharmony_ci	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
251162306a36Sopenharmony_ci	if (err)
251262306a36Sopenharmony_ci		return -EINVAL;
251362306a36Sopenharmony_ci
251462306a36Sopenharmony_ci	err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
251562306a36Sopenharmony_ci	if (err)
251662306a36Sopenharmony_ci		untrack_pfn(vma, pfn, PAGE_ALIGN(size), true);
251762306a36Sopenharmony_ci	return err;
251862306a36Sopenharmony_ci}
251962306a36Sopenharmony_ciEXPORT_SYMBOL(remap_pfn_range);
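/*
 * Typical use is a one-shot mapping of a physical region from a driver's
 * mmap handler.  A hedged sketch only (the resource layout and names are
 * hypothetical); real MMIO mappings usually go through io_remap_pfn_range()
 * and may want pgprot_noncached():
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct my_dev *dev = file->private_data;
 *		unsigned long size = vma->vm_end - vma->vm_start;
 *
 *		if (size > resource_size(dev->res))
 *			return -EINVAL;
 *		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 *		return remap_pfn_range(vma, vma->vm_start,
 *				       dev->res->start >> PAGE_SHIFT,
 *				       size, vma->vm_page_prot);
 *	}
 */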
252062306a36Sopenharmony_ci
252162306a36Sopenharmony_ci/**
252262306a36Sopenharmony_ci * vm_iomap_memory - remap memory to userspace
252362306a36Sopenharmony_ci * @vma: user vma to map to
252462306a36Sopenharmony_ci * @start: start of the physical memory to be mapped
252562306a36Sopenharmony_ci * @len: size of area
252662306a36Sopenharmony_ci *
252762306a36Sopenharmony_ci * This is a simplified io_remap_pfn_range() for common driver use. The
252862306a36Sopenharmony_ci * driver just needs to give us the physical memory range to be mapped,
252962306a36Sopenharmony_ci * we'll figure out the rest from the vma information.
253062306a36Sopenharmony_ci *
253162306a36Sopenharmony_ci * NOTE! Some drivers might want to tweak vma->vm_page_prot first to set up
253262306a36Sopenharmony_ci * write-combining or similar attributes.
253362306a36Sopenharmony_ci *
253462306a36Sopenharmony_ci * Return: %0 on success, negative error code otherwise.
253562306a36Sopenharmony_ci */
253662306a36Sopenharmony_ciint vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
253762306a36Sopenharmony_ci{
253862306a36Sopenharmony_ci	unsigned long vm_len, pfn, pages;
253962306a36Sopenharmony_ci
254062306a36Sopenharmony_ci	/* Check that the physical memory area passed in looks valid */
254162306a36Sopenharmony_ci	if (start + len < start)
254262306a36Sopenharmony_ci		return -EINVAL;
254362306a36Sopenharmony_ci	/*
254462306a36Sopenharmony_ci	 * You *really* shouldn't map things that aren't page-aligned,
254562306a36Sopenharmony_ci	 * but we've historically allowed it because IO memory might
254662306a36Sopenharmony_ci	 * just have smaller alignment.
254762306a36Sopenharmony_ci	 */
254862306a36Sopenharmony_ci	len += start & ~PAGE_MASK;
254962306a36Sopenharmony_ci	pfn = start >> PAGE_SHIFT;
255062306a36Sopenharmony_ci	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
255162306a36Sopenharmony_ci	if (pfn + pages < pfn)
255262306a36Sopenharmony_ci		return -EINVAL;
255362306a36Sopenharmony_ci
255462306a36Sopenharmony_ci	/* We start the mapping 'vm_pgoff' pages into the area */
255562306a36Sopenharmony_ci	if (vma->vm_pgoff > pages)
255662306a36Sopenharmony_ci		return -EINVAL;
255762306a36Sopenharmony_ci	pfn += vma->vm_pgoff;
255862306a36Sopenharmony_ci	pages -= vma->vm_pgoff;
255962306a36Sopenharmony_ci
256062306a36Sopenharmony_ci	/* Can we fit all of the mapping? */
256162306a36Sopenharmony_ci	vm_len = vma->vm_end - vma->vm_start;
256262306a36Sopenharmony_ci	if (vm_len >> PAGE_SHIFT > pages)
256362306a36Sopenharmony_ci		return -EINVAL;
256462306a36Sopenharmony_ci
256562306a36Sopenharmony_ci	/* Ok, let it rip */
256662306a36Sopenharmony_ci	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
256762306a36Sopenharmony_ci}
256862306a36Sopenharmony_ciEXPORT_SYMBOL(vm_iomap_memory);
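/*
 * Illustrative sketch only (same hypothetical names as the remap_pfn_range()
 * example above): with vm_iomap_memory() the handler shrinks to passing the
 * physical range, since the helper derives the pfn, length and offset checks
 * from the vma itself.
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct my_dev *dev = file->private_data;
 *
 *		return vm_iomap_memory(vma, dev->res->start,
 *				       resource_size(dev->res));
 *	}
 */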
256962306a36Sopenharmony_ci
257062306a36Sopenharmony_cistatic int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
257162306a36Sopenharmony_ci				     unsigned long addr, unsigned long end,
257262306a36Sopenharmony_ci				     pte_fn_t fn, void *data, bool create,
257362306a36Sopenharmony_ci				     pgtbl_mod_mask *mask)
257462306a36Sopenharmony_ci{
257562306a36Sopenharmony_ci	pte_t *pte, *mapped_pte;
257662306a36Sopenharmony_ci	int err = 0;
257762306a36Sopenharmony_ci	spinlock_t *ptl;
257862306a36Sopenharmony_ci
257962306a36Sopenharmony_ci	if (create) {
258062306a36Sopenharmony_ci		mapped_pte = pte = (mm == &init_mm) ?
258162306a36Sopenharmony_ci			pte_alloc_kernel_track(pmd, addr, mask) :
258262306a36Sopenharmony_ci			pte_alloc_map_lock(mm, pmd, addr, &ptl);
258362306a36Sopenharmony_ci		if (!pte)
258462306a36Sopenharmony_ci			return -ENOMEM;
258562306a36Sopenharmony_ci	} else {
258662306a36Sopenharmony_ci		mapped_pte = pte = (mm == &init_mm) ?
258762306a36Sopenharmony_ci			pte_offset_kernel(pmd, addr) :
258862306a36Sopenharmony_ci			pte_offset_map_lock(mm, pmd, addr, &ptl);
258962306a36Sopenharmony_ci		if (!pte)
259062306a36Sopenharmony_ci			return -EINVAL;
259162306a36Sopenharmony_ci	}
259262306a36Sopenharmony_ci
259362306a36Sopenharmony_ci	arch_enter_lazy_mmu_mode();
259462306a36Sopenharmony_ci
259562306a36Sopenharmony_ci	if (fn) {
259662306a36Sopenharmony_ci		do {
259762306a36Sopenharmony_ci			if (create || !pte_none(ptep_get(pte))) {
259862306a36Sopenharmony_ci				err = fn(pte++, addr, data);
259962306a36Sopenharmony_ci				if (err)
260062306a36Sopenharmony_ci					break;
260162306a36Sopenharmony_ci			}
260262306a36Sopenharmony_ci		} while (addr += PAGE_SIZE, addr != end);
260362306a36Sopenharmony_ci	}
260462306a36Sopenharmony_ci	*mask |= PGTBL_PTE_MODIFIED;
260562306a36Sopenharmony_ci
260662306a36Sopenharmony_ci	arch_leave_lazy_mmu_mode();
260762306a36Sopenharmony_ci
260862306a36Sopenharmony_ci	if (mm != &init_mm)
260962306a36Sopenharmony_ci		pte_unmap_unlock(mapped_pte, ptl);
261062306a36Sopenharmony_ci	return err;
261162306a36Sopenharmony_ci}
261262306a36Sopenharmony_ci
261362306a36Sopenharmony_cistatic int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
261462306a36Sopenharmony_ci				     unsigned long addr, unsigned long end,
261562306a36Sopenharmony_ci				     pte_fn_t fn, void *data, bool create,
261662306a36Sopenharmony_ci				     pgtbl_mod_mask *mask)
261762306a36Sopenharmony_ci{
261862306a36Sopenharmony_ci	pmd_t *pmd;
261962306a36Sopenharmony_ci	unsigned long next;
262062306a36Sopenharmony_ci	int err = 0;
262162306a36Sopenharmony_ci
262262306a36Sopenharmony_ci	BUG_ON(pud_huge(*pud));
262362306a36Sopenharmony_ci
262462306a36Sopenharmony_ci	if (create) {
262562306a36Sopenharmony_ci		pmd = pmd_alloc_track(mm, pud, addr, mask);
262662306a36Sopenharmony_ci		if (!pmd)
262762306a36Sopenharmony_ci			return -ENOMEM;
262862306a36Sopenharmony_ci	} else {
262962306a36Sopenharmony_ci		pmd = pmd_offset(pud, addr);
263062306a36Sopenharmony_ci	}
263162306a36Sopenharmony_ci	do {
263262306a36Sopenharmony_ci		next = pmd_addr_end(addr, end);
263362306a36Sopenharmony_ci		if (pmd_none(*pmd) && !create)
263462306a36Sopenharmony_ci			continue;
263562306a36Sopenharmony_ci		if (WARN_ON_ONCE(pmd_leaf(*pmd)))
263662306a36Sopenharmony_ci			return -EINVAL;
263762306a36Sopenharmony_ci		if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
263862306a36Sopenharmony_ci			if (!create)
263962306a36Sopenharmony_ci				continue;
264062306a36Sopenharmony_ci			pmd_clear_bad(pmd);
264162306a36Sopenharmony_ci		}
264262306a36Sopenharmony_ci		err = apply_to_pte_range(mm, pmd, addr, next,
264362306a36Sopenharmony_ci					 fn, data, create, mask);
264462306a36Sopenharmony_ci		if (err)
264562306a36Sopenharmony_ci			break;
264662306a36Sopenharmony_ci	} while (pmd++, addr = next, addr != end);
264762306a36Sopenharmony_ci
264862306a36Sopenharmony_ci	return err;
264962306a36Sopenharmony_ci}
265062306a36Sopenharmony_ci
265162306a36Sopenharmony_cistatic int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
265262306a36Sopenharmony_ci				     unsigned long addr, unsigned long end,
265362306a36Sopenharmony_ci				     pte_fn_t fn, void *data, bool create,
265462306a36Sopenharmony_ci				     pgtbl_mod_mask *mask)
265562306a36Sopenharmony_ci{
265662306a36Sopenharmony_ci	pud_t *pud;
265762306a36Sopenharmony_ci	unsigned long next;
265862306a36Sopenharmony_ci	int err = 0;
265962306a36Sopenharmony_ci
266062306a36Sopenharmony_ci	if (create) {
266162306a36Sopenharmony_ci		pud = pud_alloc_track(mm, p4d, addr, mask);
266262306a36Sopenharmony_ci		if (!pud)
266362306a36Sopenharmony_ci			return -ENOMEM;
266462306a36Sopenharmony_ci	} else {
266562306a36Sopenharmony_ci		pud = pud_offset(p4d, addr);
266662306a36Sopenharmony_ci	}
266762306a36Sopenharmony_ci	do {
266862306a36Sopenharmony_ci		next = pud_addr_end(addr, end);
266962306a36Sopenharmony_ci		if (pud_none(*pud) && !create)
267062306a36Sopenharmony_ci			continue;
267162306a36Sopenharmony_ci		if (WARN_ON_ONCE(pud_leaf(*pud)))
267262306a36Sopenharmony_ci			return -EINVAL;
267362306a36Sopenharmony_ci		if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
267462306a36Sopenharmony_ci			if (!create)
267562306a36Sopenharmony_ci				continue;
267662306a36Sopenharmony_ci			pud_clear_bad(pud);
267762306a36Sopenharmony_ci		}
267862306a36Sopenharmony_ci		err = apply_to_pmd_range(mm, pud, addr, next,
267962306a36Sopenharmony_ci					 fn, data, create, mask);
268062306a36Sopenharmony_ci		if (err)
268162306a36Sopenharmony_ci			break;
268262306a36Sopenharmony_ci	} while (pud++, addr = next, addr != end);
268362306a36Sopenharmony_ci
268462306a36Sopenharmony_ci	return err;
268562306a36Sopenharmony_ci}
268662306a36Sopenharmony_ci
268762306a36Sopenharmony_cistatic int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
268862306a36Sopenharmony_ci				     unsigned long addr, unsigned long end,
268962306a36Sopenharmony_ci				     pte_fn_t fn, void *data, bool create,
269062306a36Sopenharmony_ci				     pgtbl_mod_mask *mask)
269162306a36Sopenharmony_ci{
269262306a36Sopenharmony_ci	p4d_t *p4d;
269362306a36Sopenharmony_ci	unsigned long next;
269462306a36Sopenharmony_ci	int err = 0;
269562306a36Sopenharmony_ci
269662306a36Sopenharmony_ci	if (create) {
269762306a36Sopenharmony_ci		p4d = p4d_alloc_track(mm, pgd, addr, mask);
269862306a36Sopenharmony_ci		if (!p4d)
269962306a36Sopenharmony_ci			return -ENOMEM;
270062306a36Sopenharmony_ci	} else {
270162306a36Sopenharmony_ci		p4d = p4d_offset(pgd, addr);
270262306a36Sopenharmony_ci	}
270362306a36Sopenharmony_ci	do {
270462306a36Sopenharmony_ci		next = p4d_addr_end(addr, end);
270562306a36Sopenharmony_ci		if (p4d_none(*p4d) && !create)
270662306a36Sopenharmony_ci			continue;
270762306a36Sopenharmony_ci		if (WARN_ON_ONCE(p4d_leaf(*p4d)))
270862306a36Sopenharmony_ci			return -EINVAL;
270962306a36Sopenharmony_ci		if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
271062306a36Sopenharmony_ci			if (!create)
271162306a36Sopenharmony_ci				continue;
271262306a36Sopenharmony_ci			p4d_clear_bad(p4d);
271362306a36Sopenharmony_ci		}
271462306a36Sopenharmony_ci		err = apply_to_pud_range(mm, p4d, addr, next,
271562306a36Sopenharmony_ci					 fn, data, create, mask);
271662306a36Sopenharmony_ci		if (err)
271762306a36Sopenharmony_ci			break;
271862306a36Sopenharmony_ci	} while (p4d++, addr = next, addr != end);
271962306a36Sopenharmony_ci
272062306a36Sopenharmony_ci	return err;
272162306a36Sopenharmony_ci}
272262306a36Sopenharmony_ci
272362306a36Sopenharmony_cistatic int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
272462306a36Sopenharmony_ci				 unsigned long size, pte_fn_t fn,
272562306a36Sopenharmony_ci				 void *data, bool create)
272662306a36Sopenharmony_ci{
272762306a36Sopenharmony_ci	pgd_t *pgd;
272862306a36Sopenharmony_ci	unsigned long start = addr, next;
272962306a36Sopenharmony_ci	unsigned long end = addr + size;
273062306a36Sopenharmony_ci	pgtbl_mod_mask mask = 0;
273162306a36Sopenharmony_ci	int err = 0;
273262306a36Sopenharmony_ci
273362306a36Sopenharmony_ci	if (WARN_ON(addr >= end))
273462306a36Sopenharmony_ci		return -EINVAL;
273562306a36Sopenharmony_ci
273662306a36Sopenharmony_ci	pgd = pgd_offset(mm, addr);
273762306a36Sopenharmony_ci	do {
273862306a36Sopenharmony_ci		next = pgd_addr_end(addr, end);
273962306a36Sopenharmony_ci		if (pgd_none(*pgd) && !create)
274062306a36Sopenharmony_ci			continue;
274162306a36Sopenharmony_ci		if (WARN_ON_ONCE(pgd_leaf(*pgd)))
274262306a36Sopenharmony_ci			return -EINVAL;
274362306a36Sopenharmony_ci		if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
274462306a36Sopenharmony_ci			if (!create)
274562306a36Sopenharmony_ci				continue;
274662306a36Sopenharmony_ci			pgd_clear_bad(pgd);
274762306a36Sopenharmony_ci		}
274862306a36Sopenharmony_ci		err = apply_to_p4d_range(mm, pgd, addr, next,
274962306a36Sopenharmony_ci					 fn, data, create, &mask);
275062306a36Sopenharmony_ci		if (err)
275162306a36Sopenharmony_ci			break;
275262306a36Sopenharmony_ci	} while (pgd++, addr = next, addr != end);
275362306a36Sopenharmony_ci
275462306a36Sopenharmony_ci	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
275562306a36Sopenharmony_ci		arch_sync_kernel_mappings(start, start + size);
275662306a36Sopenharmony_ci
275762306a36Sopenharmony_ci	return err;
275862306a36Sopenharmony_ci}
275962306a36Sopenharmony_ci
276062306a36Sopenharmony_ci/*
276162306a36Sopenharmony_ci * Scan a region of virtual memory, filling in page tables as necessary
276262306a36Sopenharmony_ci * and calling a provided function on each leaf page table.
276362306a36Sopenharmony_ci */
276462306a36Sopenharmony_ciint apply_to_page_range(struct mm_struct *mm, unsigned long addr,
276562306a36Sopenharmony_ci			unsigned long size, pte_fn_t fn, void *data)
276662306a36Sopenharmony_ci{
276762306a36Sopenharmony_ci	return __apply_to_page_range(mm, addr, size, fn, data, true);
276862306a36Sopenharmony_ci}
276962306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(apply_to_page_range);
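/*
 * Sketch of a pte_fn_t callback as used with apply_to_page_range().  This
 * hypothetical example write-protects every PTE in a kernel virtual range;
 * it only illustrates the callback signature and is not code from this file.
 *
 *	static int make_ro_pte(pte_t *ptep, unsigned long addr, void *data)
 *	{
 *		pte_t pte = ptep_get(ptep);
 *
 *		set_pte_at(&init_mm, addr, ptep, pte_wrprotect(pte));
 *		return 0;
 *	}
 *
 *	// Walks [addr, addr + size), filling in missing page tables:
 *	err = apply_to_page_range(&init_mm, addr, size, make_ro_pte, NULL);
 */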
277062306a36Sopenharmony_ci
277162306a36Sopenharmony_ci/*
277262306a36Sopenharmony_ci * Scan a region of virtual memory, calling a provided function on
277362306a36Sopenharmony_ci * each leaf page table where it exists.
277462306a36Sopenharmony_ci *
277562306a36Sopenharmony_ci * Unlike apply_to_page_range, this does _not_ fill in page tables
277662306a36Sopenharmony_ci * where they are absent.
277762306a36Sopenharmony_ci */
277862306a36Sopenharmony_ciint apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
277962306a36Sopenharmony_ci				 unsigned long size, pte_fn_t fn, void *data)
278062306a36Sopenharmony_ci{
278162306a36Sopenharmony_ci	return __apply_to_page_range(mm, addr, size, fn, data, false);
278262306a36Sopenharmony_ci}
278362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(apply_to_existing_page_range);
278462306a36Sopenharmony_ci
278562306a36Sopenharmony_ci/*
278662306a36Sopenharmony_ci * handle_pte_fault chooses page fault handler according to an entry which was
278762306a36Sopenharmony_ci * read non-atomically.  Before making any commitment, on those architectures
278862306a36Sopenharmony_ci * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
278962306a36Sopenharmony_ci * parts, do_swap_page must check under lock before unmapping the pte and
279062306a36Sopenharmony_ci * proceeding (but do_wp_page is only called after already making such a check;
279162306a36Sopenharmony_ci * and do_anonymous_page can safely check later on).
279262306a36Sopenharmony_ci */
279362306a36Sopenharmony_cistatic inline int pte_unmap_same(struct vm_fault *vmf)
279462306a36Sopenharmony_ci{
279562306a36Sopenharmony_ci	int same = 1;
279662306a36Sopenharmony_ci#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
279762306a36Sopenharmony_ci	if (sizeof(pte_t) > sizeof(unsigned long)) {
279862306a36Sopenharmony_ci		spin_lock(vmf->ptl);
279962306a36Sopenharmony_ci		same = pte_same(ptep_get(vmf->pte), vmf->orig_pte);
280062306a36Sopenharmony_ci		spin_unlock(vmf->ptl);
280162306a36Sopenharmony_ci	}
280262306a36Sopenharmony_ci#endif
280362306a36Sopenharmony_ci	pte_unmap(vmf->pte);
280462306a36Sopenharmony_ci	vmf->pte = NULL;
280562306a36Sopenharmony_ci	return same;
280662306a36Sopenharmony_ci}
280762306a36Sopenharmony_ci
280862306a36Sopenharmony_ci/*
280962306a36Sopenharmony_ci * Return:
281062306a36Sopenharmony_ci *	0:		copy succeeded
281162306a36Sopenharmony_ci *	-EHWPOISON:	copy failed due to hwpoison in source page
281262306a36Sopenharmony_ci *	-EAGAIN:	copy failed (some other reason)
281362306a36Sopenharmony_ci */
281462306a36Sopenharmony_cistatic inline int __wp_page_copy_user(struct page *dst, struct page *src,
281562306a36Sopenharmony_ci				      struct vm_fault *vmf)
281662306a36Sopenharmony_ci{
281762306a36Sopenharmony_ci	int ret;
281862306a36Sopenharmony_ci	void *kaddr;
281962306a36Sopenharmony_ci	void __user *uaddr;
282062306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
282162306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
282262306a36Sopenharmony_ci	unsigned long addr = vmf->address;
282362306a36Sopenharmony_ci
282462306a36Sopenharmony_ci	if (likely(src)) {
282562306a36Sopenharmony_ci		if (copy_mc_user_highpage(dst, src, addr, vma)) {
282662306a36Sopenharmony_ci			memory_failure_queue(page_to_pfn(src), 0);
282762306a36Sopenharmony_ci			return -EHWPOISON;
282862306a36Sopenharmony_ci		}
282962306a36Sopenharmony_ci		return 0;
283062306a36Sopenharmony_ci	}
283162306a36Sopenharmony_ci
283262306a36Sopenharmony_ci	/*
283362306a36Sopenharmony_ci	 * If the source page was a PFN mapping, we don't have
283462306a36Sopenharmony_ci	 * a "struct page" for it. We do a best-effort copy by
283562306a36Sopenharmony_ci	 * just copying from the original user address. If that
283662306a36Sopenharmony_ci	 * fails, we just zero-fill it. Live with it.
283762306a36Sopenharmony_ci	 */
283862306a36Sopenharmony_ci	kaddr = kmap_atomic(dst);
283962306a36Sopenharmony_ci	uaddr = (void __user *)(addr & PAGE_MASK);
284062306a36Sopenharmony_ci
284162306a36Sopenharmony_ci	/*
284262306a36Sopenharmony_ci	 * On architectures with software "accessed" bits, we would
284362306a36Sopenharmony_ci	 * take a double page fault, so mark it accessed here.
284462306a36Sopenharmony_ci	 */
284562306a36Sopenharmony_ci	vmf->pte = NULL;
284662306a36Sopenharmony_ci	if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
284762306a36Sopenharmony_ci		pte_t entry;
284862306a36Sopenharmony_ci
284962306a36Sopenharmony_ci		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
285062306a36Sopenharmony_ci		if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
285162306a36Sopenharmony_ci			/*
285262306a36Sopenharmony_ci			 * Another thread has already handled the fault;
285362306a36Sopenharmony_ci			 * just update the local TLB.
285462306a36Sopenharmony_ci			 */
285562306a36Sopenharmony_ci			if (vmf->pte)
285662306a36Sopenharmony_ci				update_mmu_tlb(vma, addr, vmf->pte);
285762306a36Sopenharmony_ci			ret = -EAGAIN;
285862306a36Sopenharmony_ci			goto pte_unlock;
285962306a36Sopenharmony_ci		}
286062306a36Sopenharmony_ci
286162306a36Sopenharmony_ci		entry = pte_mkyoung(vmf->orig_pte);
286262306a36Sopenharmony_ci		if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
286362306a36Sopenharmony_ci			update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
286462306a36Sopenharmony_ci	}
286562306a36Sopenharmony_ci
286662306a36Sopenharmony_ci	/*
286762306a36Sopenharmony_ci	 * This really shouldn't fail, because the page is there
286862306a36Sopenharmony_ci	 * in the page tables. But it might just be unreadable,
286962306a36Sopenharmony_ci	 * in which case we just give up and fill the result with
287062306a36Sopenharmony_ci	 * zeroes.
287162306a36Sopenharmony_ci	 */
287262306a36Sopenharmony_ci	if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
287362306a36Sopenharmony_ci		if (vmf->pte)
287462306a36Sopenharmony_ci			goto warn;
287562306a36Sopenharmony_ci
287662306a36Sopenharmony_ci		/* Re-validate under PTL if the page is still mapped */
287762306a36Sopenharmony_ci		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
287862306a36Sopenharmony_ci		if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
287962306a36Sopenharmony_ci			/* The PTE changed under us, update local tlb */
288062306a36Sopenharmony_ci			if (vmf->pte)
288162306a36Sopenharmony_ci				update_mmu_tlb(vma, addr, vmf->pte);
288262306a36Sopenharmony_ci			ret = -EAGAIN;
288362306a36Sopenharmony_ci			goto pte_unlock;
288462306a36Sopenharmony_ci		}
288562306a36Sopenharmony_ci
288662306a36Sopenharmony_ci		/*
288762306a36Sopenharmony_ci		 * The same page may have been mapped back in since the last copy attempt.
288862306a36Sopenharmony_ci		 * Try to copy again under PTL.
288962306a36Sopenharmony_ci		 */
289062306a36Sopenharmony_ci		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
289162306a36Sopenharmony_ci			/*
289262306a36Sopenharmony_ci			 * Warn in case there is some obscure
289362306a36Sopenharmony_ci			 * use case.
289462306a36Sopenharmony_ci			 */
289562306a36Sopenharmony_ciwarn:
289662306a36Sopenharmony_ci			WARN_ON_ONCE(1);
289762306a36Sopenharmony_ci			clear_page(kaddr);
289862306a36Sopenharmony_ci		}
289962306a36Sopenharmony_ci	}
290062306a36Sopenharmony_ci
290162306a36Sopenharmony_ci	ret = 0;
290262306a36Sopenharmony_ci
290362306a36Sopenharmony_cipte_unlock:
290462306a36Sopenharmony_ci	if (vmf->pte)
290562306a36Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
290662306a36Sopenharmony_ci	kunmap_atomic(kaddr);
290762306a36Sopenharmony_ci	flush_dcache_page(dst);
290862306a36Sopenharmony_ci
290962306a36Sopenharmony_ci	return ret;
291062306a36Sopenharmony_ci}
291162306a36Sopenharmony_ci
291262306a36Sopenharmony_cistatic gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
291362306a36Sopenharmony_ci{
291462306a36Sopenharmony_ci	struct file *vm_file = vma->vm_file;
291562306a36Sopenharmony_ci
291662306a36Sopenharmony_ci	if (vm_file)
291762306a36Sopenharmony_ci		return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
291862306a36Sopenharmony_ci
291962306a36Sopenharmony_ci	/*
292062306a36Sopenharmony_ci	 * Special mappings (e.g. VDSO) do not have any file so fake
292162306a36Sopenharmony_ci	 * a default GFP_KERNEL for them.
292262306a36Sopenharmony_ci	 */
292362306a36Sopenharmony_ci	return GFP_KERNEL;
292462306a36Sopenharmony_ci}
292562306a36Sopenharmony_ci
292662306a36Sopenharmony_ci/*
292762306a36Sopenharmony_ci * Notify the address space that the page is about to become writable so that
292862306a36Sopenharmony_ci * it can prohibit this or wait for the page to get into an appropriate state.
292962306a36Sopenharmony_ci *
293062306a36Sopenharmony_ci * We do this without the lock held, so that it can sleep if it needs to.
293162306a36Sopenharmony_ci */
293262306a36Sopenharmony_cistatic vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio)
293362306a36Sopenharmony_ci{
293462306a36Sopenharmony_ci	vm_fault_t ret;
293562306a36Sopenharmony_ci	unsigned int old_flags = vmf->flags;
293662306a36Sopenharmony_ci
293762306a36Sopenharmony_ci	vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
293862306a36Sopenharmony_ci
293962306a36Sopenharmony_ci	if (vmf->vma->vm_file &&
294062306a36Sopenharmony_ci	    IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
294162306a36Sopenharmony_ci		return VM_FAULT_SIGBUS;
294262306a36Sopenharmony_ci
294362306a36Sopenharmony_ci	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
294462306a36Sopenharmony_ci	/* Restore original flags so that caller is not surprised */
294562306a36Sopenharmony_ci	vmf->flags = old_flags;
294662306a36Sopenharmony_ci	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
294762306a36Sopenharmony_ci		return ret;
294862306a36Sopenharmony_ci	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
294962306a36Sopenharmony_ci		folio_lock(folio);
295062306a36Sopenharmony_ci		if (!folio->mapping) {
295162306a36Sopenharmony_ci			folio_unlock(folio);
295262306a36Sopenharmony_ci			return 0; /* retry */
295362306a36Sopenharmony_ci		}
295462306a36Sopenharmony_ci		ret |= VM_FAULT_LOCKED;
295562306a36Sopenharmony_ci	} else
295662306a36Sopenharmony_ci		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
295762306a36Sopenharmony_ci	return ret;
295862306a36Sopenharmony_ci}
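/*
 * For reference, a minimal sketch of what a vm_ops->page_mkwrite handler on
 * the other side of this call might look like.  This is a hypothetical
 * example of the locking contract only; real filesystems do considerably
 * more here (e.g. block reservation, journalling).
 *
 *	static vm_fault_t my_page_mkwrite(struct vm_fault *vmf)
 *	{
 *		struct folio *folio = page_folio(vmf->page);
 *
 *		file_update_time(vmf->vma->vm_file);
 *		folio_lock(folio);
 *		if (!folio->mapping) {		// raced with truncate
 *			folio_unlock(folio);
 *			return VM_FAULT_NOPAGE;
 *		}
 *		folio_mark_dirty(folio);
 *		return VM_FAULT_LOCKED;		// returning with folio locked
 *	}
 */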
295962306a36Sopenharmony_ci
296062306a36Sopenharmony_ci/*
296162306a36Sopenharmony_ci * Handle dirtying of a page in shared file mapping on a write fault.
296262306a36Sopenharmony_ci *
296362306a36Sopenharmony_ci * The function expects the page to be locked and unlocks it.
296462306a36Sopenharmony_ci */
296562306a36Sopenharmony_cistatic vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
296662306a36Sopenharmony_ci{
296762306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
296862306a36Sopenharmony_ci	struct address_space *mapping;
296962306a36Sopenharmony_ci	struct folio *folio = page_folio(vmf->page);
297062306a36Sopenharmony_ci	bool dirtied;
297162306a36Sopenharmony_ci	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
297262306a36Sopenharmony_ci
297362306a36Sopenharmony_ci	dirtied = folio_mark_dirty(folio);
297462306a36Sopenharmony_ci	VM_BUG_ON_FOLIO(folio_test_anon(folio), folio);
297562306a36Sopenharmony_ci	/*
297662306a36Sopenharmony_ci	 * Take a local copy of the address_space - folio.mapping may be zeroed
297762306a36Sopenharmony_ci	 * by truncate after folio_unlock().   The address_space itself remains
297862306a36Sopenharmony_ci	 * pinned by vma->vm_file's reference.  We rely on folio_unlock()'s
297962306a36Sopenharmony_ci	 * release semantics to prevent the compiler from undoing this copying.
298062306a36Sopenharmony_ci	 */
298162306a36Sopenharmony_ci	mapping = folio_raw_mapping(folio);
298262306a36Sopenharmony_ci	folio_unlock(folio);
298362306a36Sopenharmony_ci
298462306a36Sopenharmony_ci	if (!page_mkwrite)
298562306a36Sopenharmony_ci		file_update_time(vma->vm_file);
298662306a36Sopenharmony_ci
298762306a36Sopenharmony_ci	/*
298862306a36Sopenharmony_ci	 * Throttle page dirtying rate down to writeback speed.
298962306a36Sopenharmony_ci	 *
299062306a36Sopenharmony_ci	 * mapping may be NULL here because some device drivers do not
299162306a36Sopenharmony_ci	 * set page.mapping but still dirty their pages
299262306a36Sopenharmony_ci	 *
299362306a36Sopenharmony_ci	 * Drop the mmap_lock before waiting on IO, if we can. The file
299462306a36Sopenharmony_ci	 * is pinning the mapping, as per above.
299562306a36Sopenharmony_ci	 */
299662306a36Sopenharmony_ci	if ((dirtied || page_mkwrite) && mapping) {
299762306a36Sopenharmony_ci		struct file *fpin;
299862306a36Sopenharmony_ci
299962306a36Sopenharmony_ci		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
300062306a36Sopenharmony_ci		balance_dirty_pages_ratelimited(mapping);
300162306a36Sopenharmony_ci		if (fpin) {
300262306a36Sopenharmony_ci			fput(fpin);
300362306a36Sopenharmony_ci			return VM_FAULT_COMPLETED;
300462306a36Sopenharmony_ci		}
300562306a36Sopenharmony_ci	}
300662306a36Sopenharmony_ci
300762306a36Sopenharmony_ci	return 0;
300862306a36Sopenharmony_ci}
300962306a36Sopenharmony_ci
301062306a36Sopenharmony_ci/*
301162306a36Sopenharmony_ci * Handle write page faults for pages that can be reused in the current vma
301262306a36Sopenharmony_ci *
301362306a36Sopenharmony_ci * This can happen either because the mapping has the VM_SHARED flag set,
301462306a36Sopenharmony_ci * or because we hold the last remaining reference to the page. In either
301562306a36Sopenharmony_ci * case, all we need to do here is to mark the page as writable and update
301662306a36Sopenharmony_ci * any related book-keeping.
301762306a36Sopenharmony_ci */
301862306a36Sopenharmony_cistatic inline void wp_page_reuse(struct vm_fault *vmf)
301962306a36Sopenharmony_ci	__releases(vmf->ptl)
302062306a36Sopenharmony_ci{
302162306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
302262306a36Sopenharmony_ci	struct page *page = vmf->page;
302362306a36Sopenharmony_ci	pte_t entry;
302462306a36Sopenharmony_ci
302562306a36Sopenharmony_ci	VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
302662306a36Sopenharmony_ci	VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page));
302762306a36Sopenharmony_ci
302862306a36Sopenharmony_ci	/*
302962306a36Sopenharmony_ci	 * Clear the page's cpupid information as the existing
303062306a36Sopenharmony_ci	 * information potentially belongs to a now completely
303162306a36Sopenharmony_ci	 * unrelated process.
303262306a36Sopenharmony_ci	 */
303362306a36Sopenharmony_ci	if (page)
303462306a36Sopenharmony_ci		page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
303562306a36Sopenharmony_ci
303662306a36Sopenharmony_ci	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
303762306a36Sopenharmony_ci	entry = pte_mkyoung(vmf->orig_pte);
303862306a36Sopenharmony_ci	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
303962306a36Sopenharmony_ci	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
304062306a36Sopenharmony_ci		update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
304162306a36Sopenharmony_ci	pte_unmap_unlock(vmf->pte, vmf->ptl);
304262306a36Sopenharmony_ci	count_vm_event(PGREUSE);
304362306a36Sopenharmony_ci}
304462306a36Sopenharmony_ci
304562306a36Sopenharmony_ci/*
304662306a36Sopenharmony_ci * Handle the case of a page which we actually need to copy to a new page,
304762306a36Sopenharmony_ci * either due to COW or unsharing.
304862306a36Sopenharmony_ci *
304962306a36Sopenharmony_ci * Called with mmap_lock locked and the old page referenced, but
305062306a36Sopenharmony_ci * without the ptl held.
305162306a36Sopenharmony_ci *
305262306a36Sopenharmony_ci * High level logic flow:
305362306a36Sopenharmony_ci *
305462306a36Sopenharmony_ci * - Allocate a page, copy the content of the old page to the new one.
305562306a36Sopenharmony_ci * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
305662306a36Sopenharmony_ci * - Take the PTL. If the pte changed, bail out and release the allocated page
305762306a36Sopenharmony_ci * - If the pte is still the way we remember it, update the page table and all
305862306a36Sopenharmony_ci *   relevant references. This includes dropping the reference the page-table
305962306a36Sopenharmony_ci *   held to the old page, as well as updating the rmap.
306062306a36Sopenharmony_ci * - In any case, unlock the PTL and drop the reference we took to the old page.
306162306a36Sopenharmony_ci */
306262306a36Sopenharmony_cistatic vm_fault_t wp_page_copy(struct vm_fault *vmf)
306362306a36Sopenharmony_ci{
306462306a36Sopenharmony_ci	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
306562306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
306662306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
306762306a36Sopenharmony_ci	struct folio *old_folio = NULL;
306862306a36Sopenharmony_ci	struct folio *new_folio = NULL;
306962306a36Sopenharmony_ci	pte_t entry;
307062306a36Sopenharmony_ci	int page_copied = 0;
307162306a36Sopenharmony_ci	struct mmu_notifier_range range;
307262306a36Sopenharmony_ci	int ret;
307362306a36Sopenharmony_ci
307462306a36Sopenharmony_ci	delayacct_wpcopy_start();
307562306a36Sopenharmony_ci
307662306a36Sopenharmony_ci	if (vmf->page)
307762306a36Sopenharmony_ci		old_folio = page_folio(vmf->page);
307862306a36Sopenharmony_ci	if (unlikely(anon_vma_prepare(vma)))
307962306a36Sopenharmony_ci		goto oom;
308062306a36Sopenharmony_ci
308162306a36Sopenharmony_ci	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
308262306a36Sopenharmony_ci		new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
308362306a36Sopenharmony_ci		if (!new_folio)
308462306a36Sopenharmony_ci			goto oom;
308562306a36Sopenharmony_ci	} else {
308662306a36Sopenharmony_ci		new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
308762306a36Sopenharmony_ci				vmf->address, false);
308862306a36Sopenharmony_ci		if (!new_folio)
308962306a36Sopenharmony_ci			goto oom;
309062306a36Sopenharmony_ci
309162306a36Sopenharmony_ci		ret = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
309262306a36Sopenharmony_ci		if (ret) {
309362306a36Sopenharmony_ci			/*
309462306a36Sopenharmony_ci			 * COW failed; if the fault was resolved by someone
309562306a36Sopenharmony_ci			 * else, that's fine. If not, userspace will re-fault
309662306a36Sopenharmony_ci			 * at the same address and we will handle the fault
309762306a36Sopenharmony_ci			 * on the second attempt.
309862306a36Sopenharmony_ci			 * The -EHWPOISON case will not be retried.
309962306a36Sopenharmony_ci			 */
310062306a36Sopenharmony_ci			folio_put(new_folio);
310162306a36Sopenharmony_ci			if (old_folio)
310262306a36Sopenharmony_ci				folio_put(old_folio);
310362306a36Sopenharmony_ci
310462306a36Sopenharmony_ci			delayacct_wpcopy_end();
310562306a36Sopenharmony_ci			return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
310662306a36Sopenharmony_ci		}
310762306a36Sopenharmony_ci		kmsan_copy_page_meta(&new_folio->page, vmf->page);
310862306a36Sopenharmony_ci	}
310962306a36Sopenharmony_ci
311062306a36Sopenharmony_ci	if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL))
311162306a36Sopenharmony_ci		goto oom_free_new;
311262306a36Sopenharmony_ci	folio_throttle_swaprate(new_folio, GFP_KERNEL);
311362306a36Sopenharmony_ci
311462306a36Sopenharmony_ci	__folio_mark_uptodate(new_folio);
311562306a36Sopenharmony_ci
311662306a36Sopenharmony_ci	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
311762306a36Sopenharmony_ci				vmf->address & PAGE_MASK,
311862306a36Sopenharmony_ci				(vmf->address & PAGE_MASK) + PAGE_SIZE);
311962306a36Sopenharmony_ci	mmu_notifier_invalidate_range_start(&range);
312062306a36Sopenharmony_ci
312162306a36Sopenharmony_ci	/*
312262306a36Sopenharmony_ci	 * Re-check the pte - we dropped the lock
312362306a36Sopenharmony_ci	 */
312462306a36Sopenharmony_ci	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
312562306a36Sopenharmony_ci	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
312662306a36Sopenharmony_ci		if (old_folio) {
312762306a36Sopenharmony_ci			if (!folio_test_anon(old_folio)) {
312862306a36Sopenharmony_ci				dec_mm_counter(mm, mm_counter_file(&old_folio->page));
312962306a36Sopenharmony_ci				inc_mm_counter(mm, MM_ANONPAGES);
313062306a36Sopenharmony_ci			}
313162306a36Sopenharmony_ci		} else {
313262306a36Sopenharmony_ci			ksm_might_unmap_zero_page(mm, vmf->orig_pte);
313362306a36Sopenharmony_ci			inc_mm_counter(mm, MM_ANONPAGES);
313462306a36Sopenharmony_ci		}
313562306a36Sopenharmony_ci		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
313662306a36Sopenharmony_ci		entry = mk_pte(&new_folio->page, vma->vm_page_prot);
313762306a36Sopenharmony_ci		entry = pte_sw_mkyoung(entry);
313862306a36Sopenharmony_ci		if (unlikely(unshare)) {
313962306a36Sopenharmony_ci			if (pte_soft_dirty(vmf->orig_pte))
314062306a36Sopenharmony_ci				entry = pte_mksoft_dirty(entry);
314162306a36Sopenharmony_ci			if (pte_uffd_wp(vmf->orig_pte))
314262306a36Sopenharmony_ci				entry = pte_mkuffd_wp(entry);
314362306a36Sopenharmony_ci		} else {
314462306a36Sopenharmony_ci			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
314562306a36Sopenharmony_ci		}
314662306a36Sopenharmony_ci
314762306a36Sopenharmony_ci		/*
314862306a36Sopenharmony_ci		 * Clear the pte entry and flush it first, before updating the
314962306a36Sopenharmony_ci		 * pte with the new entry, to keep TLBs on different CPUs in
315062306a36Sopenharmony_ci		 * sync. This code used to set the new PTE then flush TLBs, but
315162306a36Sopenharmony_ci		 * that left a window where the new PTE could be loaded into
315262306a36Sopenharmony_ci		 * some TLBs while the old PTE remains in others.
315362306a36Sopenharmony_ci		 */
315462306a36Sopenharmony_ci		ptep_clear_flush(vma, vmf->address, vmf->pte);
315562306a36Sopenharmony_ci		folio_add_new_anon_rmap(new_folio, vma, vmf->address);
315662306a36Sopenharmony_ci#ifdef CONFIG_MEM_PURGEABLE
315762306a36Sopenharmony_ci		if (vma->vm_flags & VM_PURGEABLE) {
315862306a36Sopenharmony_ci			pr_info("set wp new folio %lx purgeable\n", folio_pfn(new_folio));
315962306a36Sopenharmony_ci			folio_set_purgeable(new_folio);
316062306a36Sopenharmony_ci			uxpte_set_present(vma, vmf->address);
316162306a36Sopenharmony_ci		}
316262306a36Sopenharmony_ci#endif
316362306a36Sopenharmony_ci		folio_add_lru_vma(new_folio, vma);
316462306a36Sopenharmony_ci		/*
316562306a36Sopenharmony_ci		 * We call the notify macro here because, when using secondary
316662306a36Sopenharmony_ci		 * mmu page tables (such as kvm shadow page tables), we want the
316762306a36Sopenharmony_ci		 * new page to be mapped directly into the secondary page table.
316862306a36Sopenharmony_ci		 */
316962306a36Sopenharmony_ci		BUG_ON(unshare && pte_write(entry));
317062306a36Sopenharmony_ci		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
317162306a36Sopenharmony_ci		update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
317262306a36Sopenharmony_ci		if (old_folio) {
317362306a36Sopenharmony_ci			/*
317462306a36Sopenharmony_ci			 * Only after switching the pte to the new page may
317562306a36Sopenharmony_ci			 * we remove the mapcount here. Otherwise another
317662306a36Sopenharmony_ci			 * process may come and find the rmap count decremented
317762306a36Sopenharmony_ci			 * before the pte is switched to the new page, and
317862306a36Sopenharmony_ci			 * "reuse" the old page writing into it while our pte
317962306a36Sopenharmony_ci			 * here still points into it and can be read by other
318062306a36Sopenharmony_ci			 * threads.
318162306a36Sopenharmony_ci			 *
318262306a36Sopenharmony_ci			 * The critical issue is to order this
318362306a36Sopenharmony_ci			 * page_remove_rmap with the ptep_clear_flush above.
318462306a36Sopenharmony_ci			 * Those stores are ordered by (if nothing else)
318562306a36Sopenharmony_ci			 * the barrier present in the atomic_add_negative
318662306a36Sopenharmony_ci			 * in page_remove_rmap.
318762306a36Sopenharmony_ci			 *
318862306a36Sopenharmony_ci			 * Then the TLB flush in ptep_clear_flush ensures that
318962306a36Sopenharmony_ci			 * no process can access the old page before the
319062306a36Sopenharmony_ci			 * decremented mapcount is visible. And the old page
319162306a36Sopenharmony_ci			 * cannot be reused until after the decremented
319262306a36Sopenharmony_ci			 * mapcount is visible. So transitively, TLBs to
319362306a36Sopenharmony_ci			 * old page will be flushed before it can be reused.
319462306a36Sopenharmony_ci			 */
319562306a36Sopenharmony_ci			page_remove_rmap(vmf->page, vma, false);
319662306a36Sopenharmony_ci		}
319762306a36Sopenharmony_ci
319862306a36Sopenharmony_ci		/* Free the old page.. */
319962306a36Sopenharmony_ci		new_folio = old_folio;
320062306a36Sopenharmony_ci		page_copied = 1;
320162306a36Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
320262306a36Sopenharmony_ci	} else if (vmf->pte) {
320362306a36Sopenharmony_ci		update_mmu_tlb(vma, vmf->address, vmf->pte);
320462306a36Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
320562306a36Sopenharmony_ci	}
320662306a36Sopenharmony_ci
320762306a36Sopenharmony_ci	mmu_notifier_invalidate_range_end(&range);
320862306a36Sopenharmony_ci
320962306a36Sopenharmony_ci	if (new_folio)
321062306a36Sopenharmony_ci		folio_put(new_folio);
321162306a36Sopenharmony_ci	if (old_folio) {
321262306a36Sopenharmony_ci		if (page_copied)
321362306a36Sopenharmony_ci			free_swap_cache(&old_folio->page);
321462306a36Sopenharmony_ci		folio_put(old_folio);
321562306a36Sopenharmony_ci	}
321662306a36Sopenharmony_ci
321762306a36Sopenharmony_ci	delayacct_wpcopy_end();
321862306a36Sopenharmony_ci	return 0;
321962306a36Sopenharmony_cioom_free_new:
322062306a36Sopenharmony_ci	folio_put(new_folio);
322162306a36Sopenharmony_cioom:
322262306a36Sopenharmony_ci	if (old_folio)
322362306a36Sopenharmony_ci		folio_put(old_folio);
322462306a36Sopenharmony_ci
322562306a36Sopenharmony_ci	delayacct_wpcopy_end();
322662306a36Sopenharmony_ci	return VM_FAULT_OOM;
322762306a36Sopenharmony_ci}
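
/*
 * Illustrative sketch only (not an additional code path): the PTE switch in
 * wp_page_copy() above relies on this ordering (addr, entry and old_page
 * abbreviate the vmf fields used in the function):
 *
 *	ptep_clear_flush(vma, addr, vmf->pte);		// zap old PTE + TLBs
 *	folio_add_new_anon_rmap(new_folio, vma, addr);	// rmap before mapping
 *	set_pte_at_notify(mm, addr, vmf->pte, entry);	// publish new mapping
 *	page_remove_rmap(old_page, vma, false);		// only after the switch
 *
 * The old translation must be flushed from all TLBs before the new page is
 * published, and the old page's mapcount may only drop after the switch, so
 * nobody can "reuse" the old page while stale PTEs still point at it.
 */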
322862306a36Sopenharmony_ci
322962306a36Sopenharmony_ci/**
323062306a36Sopenharmony_ci * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
323162306a36Sopenharmony_ci *			  writeable once the page is prepared
323262306a36Sopenharmony_ci *
323362306a36Sopenharmony_ci * @vmf: structure describing the fault
323462306a36Sopenharmony_ci *
323562306a36Sopenharmony_ci * This function handles all that is needed to finish a write page fault in a
323662306a36Sopenharmony_ci * shared mapping due to PTE being read-only once the mapped page is prepared.
323762306a36Sopenharmony_ci * It handles locking of PTE and modifying it.
323862306a36Sopenharmony_ci *
323962306a36Sopenharmony_ci * The function expects the page to be locked or other protection against
324062306a36Sopenharmony_ci * concurrent faults / writeback (such as DAX radix tree locks).
324162306a36Sopenharmony_ci *
324262306a36Sopenharmony_ci * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
324362306a36Sopenharmony_ci * we acquired PTE lock.
324462306a36Sopenharmony_ci */
324562306a36Sopenharmony_civm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
324662306a36Sopenharmony_ci{
324762306a36Sopenharmony_ci	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
324862306a36Sopenharmony_ci	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
324962306a36Sopenharmony_ci				       &vmf->ptl);
325062306a36Sopenharmony_ci	if (!vmf->pte)
325162306a36Sopenharmony_ci		return VM_FAULT_NOPAGE;
325262306a36Sopenharmony_ci	/*
325362306a36Sopenharmony_ci	 * We might have raced with another page fault while we released the
325462306a36Sopenharmony_ci	 * pte_offset_map_lock.
325562306a36Sopenharmony_ci	 */
325662306a36Sopenharmony_ci	if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
325762306a36Sopenharmony_ci		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
325862306a36Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
325962306a36Sopenharmony_ci		return VM_FAULT_NOPAGE;
326062306a36Sopenharmony_ci	}
326162306a36Sopenharmony_ci	wp_page_reuse(vmf);
326262306a36Sopenharmony_ci	return 0;
326362306a36Sopenharmony_ci}
326462306a36Sopenharmony_ci
326562306a36Sopenharmony_ci/*
326662306a36Sopenharmony_ci * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
326762306a36Sopenharmony_ci * mapping
326862306a36Sopenharmony_ci */
326962306a36Sopenharmony_cistatic vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
327062306a36Sopenharmony_ci{
327162306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
327262306a36Sopenharmony_ci
327362306a36Sopenharmony_ci	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
327462306a36Sopenharmony_ci		vm_fault_t ret;
327562306a36Sopenharmony_ci
327662306a36Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
327762306a36Sopenharmony_ci		if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
327862306a36Sopenharmony_ci			vma_end_read(vmf->vma);
327962306a36Sopenharmony_ci			return VM_FAULT_RETRY;
328062306a36Sopenharmony_ci		}
328162306a36Sopenharmony_ci
328262306a36Sopenharmony_ci		vmf->flags |= FAULT_FLAG_MKWRITE;
328362306a36Sopenharmony_ci		ret = vma->vm_ops->pfn_mkwrite(vmf);
328462306a36Sopenharmony_ci		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
328562306a36Sopenharmony_ci			return ret;
328662306a36Sopenharmony_ci		return finish_mkwrite_fault(vmf);
328762306a36Sopenharmony_ci	}
328862306a36Sopenharmony_ci	wp_page_reuse(vmf);
328962306a36Sopenharmony_ci	return 0;
329062306a36Sopenharmony_ci}
329162306a36Sopenharmony_ci
329262306a36Sopenharmony_cistatic vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
329362306a36Sopenharmony_ci	__releases(vmf->ptl)
329462306a36Sopenharmony_ci{
329562306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
329662306a36Sopenharmony_ci	vm_fault_t ret = 0;
329762306a36Sopenharmony_ci
329862306a36Sopenharmony_ci	folio_get(folio);
329962306a36Sopenharmony_ci
330062306a36Sopenharmony_ci	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
330162306a36Sopenharmony_ci		vm_fault_t tmp;
330262306a36Sopenharmony_ci
330362306a36Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
330462306a36Sopenharmony_ci		if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
330562306a36Sopenharmony_ci			folio_put(folio);
330662306a36Sopenharmony_ci			vma_end_read(vmf->vma);
330762306a36Sopenharmony_ci			return VM_FAULT_RETRY;
330862306a36Sopenharmony_ci		}
330962306a36Sopenharmony_ci
331062306a36Sopenharmony_ci		tmp = do_page_mkwrite(vmf, folio);
331162306a36Sopenharmony_ci		if (unlikely(!tmp || (tmp &
331262306a36Sopenharmony_ci				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
331362306a36Sopenharmony_ci			folio_put(folio);
331462306a36Sopenharmony_ci			return tmp;
331562306a36Sopenharmony_ci		}
331662306a36Sopenharmony_ci		tmp = finish_mkwrite_fault(vmf);
331762306a36Sopenharmony_ci		if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
331862306a36Sopenharmony_ci			folio_unlock(folio);
331962306a36Sopenharmony_ci			folio_put(folio);
332062306a36Sopenharmony_ci			return tmp;
332162306a36Sopenharmony_ci		}
332262306a36Sopenharmony_ci	} else {
332362306a36Sopenharmony_ci		wp_page_reuse(vmf);
332462306a36Sopenharmony_ci		folio_lock(folio);
332562306a36Sopenharmony_ci	}
332662306a36Sopenharmony_ci	ret |= fault_dirty_shared_page(vmf);
332762306a36Sopenharmony_ci	folio_put(folio);
332862306a36Sopenharmony_ci
332962306a36Sopenharmony_ci	return ret;
333062306a36Sopenharmony_ci}
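
/*
 * Sketch (illustrative only) of the ->page_mkwrite() dance performed above
 * when the filesystem wants to be notified before a shared page goes
 * writable:
 *
 *	pte_unmap_unlock(vmf->pte, vmf->ptl);	// never call FS code under the PTL
 *	do_page_mkwrite(vmf, folio);		// FS prepares and locks the folio
 *	finish_mkwrite_fault(vmf);		// re-take PTL, make the PTE writable
 *	fault_dirty_shared_page(vmf);		// dirty accounting / writeback throttling
 */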
333162306a36Sopenharmony_ci
333262306a36Sopenharmony_ci/*
333362306a36Sopenharmony_ci * This routine handles present pages, when
333462306a36Sopenharmony_ci * * users try to write to a shared page (FAULT_FLAG_WRITE)
333562306a36Sopenharmony_ci * * GUP wants to take a R/O pin on a possibly shared anonymous page
333662306a36Sopenharmony_ci *   (FAULT_FLAG_UNSHARE)
333762306a36Sopenharmony_ci *
333862306a36Sopenharmony_ci * It is done by copying the page to a new address and decrementing the
333962306a36Sopenharmony_ci * shared-page counter for the old page.
334062306a36Sopenharmony_ci *
334162306a36Sopenharmony_ci * Note that this routine assumes that the protection checks have been
334262306a36Sopenharmony_ci * done by the caller (the low-level page fault routine in most cases).
334362306a36Sopenharmony_ci * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
334462306a36Sopenharmony_ci * done any necessary COW.
334562306a36Sopenharmony_ci *
334662306a36Sopenharmony_ci * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
334762306a36Sopenharmony_ci * though the page will change only once the write actually happens. This
334862306a36Sopenharmony_ci * avoids a few races, and potentially makes it more efficient.
334962306a36Sopenharmony_ci *
335062306a36Sopenharmony_ci * We enter with non-exclusive mmap_lock (to exclude vma changes,
335162306a36Sopenharmony_ci * but allow concurrent faults), with pte both mapped and locked.
335262306a36Sopenharmony_ci * We return with mmap_lock still held, but pte unmapped and unlocked.
335362306a36Sopenharmony_ci */
335462306a36Sopenharmony_cistatic vm_fault_t do_wp_page(struct vm_fault *vmf)
335562306a36Sopenharmony_ci	__releases(vmf->ptl)
335662306a36Sopenharmony_ci{
335762306a36Sopenharmony_ci	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
335862306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
335962306a36Sopenharmony_ci	struct folio *folio = NULL;
336062306a36Sopenharmony_ci
336162306a36Sopenharmony_ci	if (likely(!unshare)) {
336262306a36Sopenharmony_ci		if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
336362306a36Sopenharmony_ci			pte_unmap_unlock(vmf->pte, vmf->ptl);
336462306a36Sopenharmony_ci			return handle_userfault(vmf, VM_UFFD_WP);
336562306a36Sopenharmony_ci		}
336662306a36Sopenharmony_ci
336762306a36Sopenharmony_ci		/*
336862306a36Sopenharmony_ci		 * Userfaultfd write-protect can defer flushes. Ensure the TLB
336962306a36Sopenharmony_ci		 * is flushed in this case before copying.
337062306a36Sopenharmony_ci		 */
337162306a36Sopenharmony_ci		if (unlikely(userfaultfd_wp(vmf->vma) &&
337262306a36Sopenharmony_ci			     mm_tlb_flush_pending(vmf->vma->vm_mm)))
337362306a36Sopenharmony_ci			flush_tlb_page(vmf->vma, vmf->address);
337462306a36Sopenharmony_ci	}
337562306a36Sopenharmony_ci
337662306a36Sopenharmony_ci	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
337762306a36Sopenharmony_ci
337862306a36Sopenharmony_ci	if (vmf->page)
337962306a36Sopenharmony_ci		folio = page_folio(vmf->page);
338062306a36Sopenharmony_ci
338162306a36Sopenharmony_ci	/*
338262306a36Sopenharmony_ci	 * Shared mapping: we are guaranteed to have VM_WRITE and
338362306a36Sopenharmony_ci	 * FAULT_FLAG_WRITE set at this point.
338462306a36Sopenharmony_ci	 */
338562306a36Sopenharmony_ci	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
338662306a36Sopenharmony_ci		/*
338762306a36Sopenharmony_ci		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
338862306a36Sopenharmony_ci		 * VM_PFNMAP VMA.
338962306a36Sopenharmony_ci		 *
339062306a36Sopenharmony_ci		 * We should not cow pages in a shared writeable mapping.
339162306a36Sopenharmony_ci		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
339262306a36Sopenharmony_ci		 */
339362306a36Sopenharmony_ci		if (!vmf->page)
339462306a36Sopenharmony_ci			return wp_pfn_shared(vmf);
339562306a36Sopenharmony_ci		return wp_page_shared(vmf, folio);
339662306a36Sopenharmony_ci	}
339762306a36Sopenharmony_ci
339862306a36Sopenharmony_ci	/*
339962306a36Sopenharmony_ci	 * Private mapping: create an exclusive anonymous page copy if reuse
340062306a36Sopenharmony_ci	 * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
340162306a36Sopenharmony_ci	 */
340262306a36Sopenharmony_ci	if (folio && folio_test_anon(folio)) {
340362306a36Sopenharmony_ci		/*
340462306a36Sopenharmony_ci		 * If the page is exclusive to this process we must reuse the
340562306a36Sopenharmony_ci		 * page without further checks.
340662306a36Sopenharmony_ci		 */
340762306a36Sopenharmony_ci		if (PageAnonExclusive(vmf->page))
340862306a36Sopenharmony_ci			goto reuse;
340962306a36Sopenharmony_ci
341062306a36Sopenharmony_ci		/*
341162306a36Sopenharmony_ci		 * We have to verify under folio lock: these early checks are
341262306a36Sopenharmony_ci		 * just an optimization to avoid locking the folio and freeing
341362306a36Sopenharmony_ci		 * the swapcache if there is little hope that we can reuse.
341462306a36Sopenharmony_ci		 *
341562306a36Sopenharmony_ci		 * KSM doesn't necessarily raise the folio refcount.
341662306a36Sopenharmony_ci		 */
341762306a36Sopenharmony_ci		if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
341862306a36Sopenharmony_ci			goto copy;
341962306a36Sopenharmony_ci		if (!folio_test_lru(folio))
342062306a36Sopenharmony_ci			/*
342162306a36Sopenharmony_ci			 * We cannot easily detect+handle references from
342262306a36Sopenharmony_ci			 * remote LRU caches or references to LRU folios.
342362306a36Sopenharmony_ci			 */
342462306a36Sopenharmony_ci			lru_add_drain();
342562306a36Sopenharmony_ci		if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
342662306a36Sopenharmony_ci			goto copy;
342762306a36Sopenharmony_ci		if (!folio_trylock(folio))
342862306a36Sopenharmony_ci			goto copy;
342962306a36Sopenharmony_ci		if (folio_test_swapcache(folio))
343062306a36Sopenharmony_ci			folio_free_swap(folio);
343162306a36Sopenharmony_ci		if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
343262306a36Sopenharmony_ci			folio_unlock(folio);
343362306a36Sopenharmony_ci			goto copy;
343462306a36Sopenharmony_ci		}
343562306a36Sopenharmony_ci		/*
343662306a36Sopenharmony_ci		 * Ok, we've got the only folio reference from our mapping
343762306a36Sopenharmony_ci		 * and the folio is locked, it's dark out, and we're wearing
343862306a36Sopenharmony_ci		 * sunglasses. Hit it.
343962306a36Sopenharmony_ci		 */
344062306a36Sopenharmony_ci		page_move_anon_rmap(vmf->page, vma);
344162306a36Sopenharmony_ci		folio_unlock(folio);
344262306a36Sopenharmony_cireuse:
344362306a36Sopenharmony_ci		if (unlikely(unshare)) {
344462306a36Sopenharmony_ci			pte_unmap_unlock(vmf->pte, vmf->ptl);
344562306a36Sopenharmony_ci			return 0;
344662306a36Sopenharmony_ci		}
344762306a36Sopenharmony_ci		wp_page_reuse(vmf);
344862306a36Sopenharmony_ci		return 0;
344962306a36Sopenharmony_ci	}
345062306a36Sopenharmony_cicopy:
345162306a36Sopenharmony_ci	if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) {
345262306a36Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
345362306a36Sopenharmony_ci		vma_end_read(vmf->vma);
345462306a36Sopenharmony_ci		return VM_FAULT_RETRY;
345562306a36Sopenharmony_ci	}
345662306a36Sopenharmony_ci
345762306a36Sopenharmony_ci	/*
345862306a36Sopenharmony_ci	 * Ok, we need to copy. Oh, well..
345962306a36Sopenharmony_ci	 */
346062306a36Sopenharmony_ci	if (folio)
346162306a36Sopenharmony_ci		folio_get(folio);
346262306a36Sopenharmony_ci
346362306a36Sopenharmony_ci	pte_unmap_unlock(vmf->pte, vmf->ptl);
346462306a36Sopenharmony_ci#ifdef CONFIG_KSM
346562306a36Sopenharmony_ci	if (folio && folio_test_ksm(folio))
346662306a36Sopenharmony_ci		count_vm_event(COW_KSM);
346762306a36Sopenharmony_ci#endif
346862306a36Sopenharmony_ci	return wp_page_copy(vmf);
346962306a36Sopenharmony_ci}
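
/*
 * Summary of the decision made by do_wp_page() above (illustrative only):
 *
 *	shared VM_WRITE mapping		-> wp_pfn_shared()/wp_page_shared(): reuse
 *	anon page, PageAnonExclusive	-> reuse immediately
 *	anon page, provably sole user	-> take it over via page_move_anon_rmap()
 *	everything else			-> wp_page_copy(): break COW
 */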
347062306a36Sopenharmony_ci
347162306a36Sopenharmony_cistatic void unmap_mapping_range_vma(struct vm_area_struct *vma,
347262306a36Sopenharmony_ci		unsigned long start_addr, unsigned long end_addr,
347362306a36Sopenharmony_ci		struct zap_details *details)
347462306a36Sopenharmony_ci{
347562306a36Sopenharmony_ci	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
347662306a36Sopenharmony_ci}
347762306a36Sopenharmony_ci
347862306a36Sopenharmony_cistatic inline void unmap_mapping_range_tree(struct rb_root_cached *root,
347962306a36Sopenharmony_ci					    pgoff_t first_index,
348062306a36Sopenharmony_ci					    pgoff_t last_index,
348162306a36Sopenharmony_ci					    struct zap_details *details)
348262306a36Sopenharmony_ci{
348362306a36Sopenharmony_ci	struct vm_area_struct *vma;
348462306a36Sopenharmony_ci	pgoff_t vba, vea, zba, zea;
348562306a36Sopenharmony_ci
348662306a36Sopenharmony_ci	vma_interval_tree_foreach(vma, root, first_index, last_index) {
348762306a36Sopenharmony_ci		vba = vma->vm_pgoff;
348862306a36Sopenharmony_ci		vea = vba + vma_pages(vma) - 1;
348962306a36Sopenharmony_ci		zba = max(first_index, vba);
349062306a36Sopenharmony_ci		zea = min(last_index, vea);
349162306a36Sopenharmony_ci
349262306a36Sopenharmony_ci		unmap_mapping_range_vma(vma,
349362306a36Sopenharmony_ci			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
349462306a36Sopenharmony_ci			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
349562306a36Sopenharmony_ci				details);
349662306a36Sopenharmony_ci	}
349762306a36Sopenharmony_ci}
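
/*
 * Worked example for the offset arithmetic above (numbers are illustrative):
 * a VMA with vm_start == 0x7f0000000000 and vm_pgoff == 16 that intersects a
 * zap request for file indices 20..23 is trimmed to
 *
 *	start_addr = ((20 - 16) << PAGE_SHIFT) + 0x7f0000000000
 *	end_addr   = ((23 - 16 + 1) << PAGE_SHIFT) + 0x7f0000000000
 *
 * i.e. exactly the pages of that VMA backed by those file indices.
 */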
349862306a36Sopenharmony_ci
349962306a36Sopenharmony_ci/**
350062306a36Sopenharmony_ci * unmap_mapping_folio() - Unmap single folio from processes.
350162306a36Sopenharmony_ci * @folio: The locked folio to be unmapped.
350262306a36Sopenharmony_ci *
350362306a36Sopenharmony_ci * Unmap this folio from any userspace process which still has it mmaped.
350462306a36Sopenharmony_ci * Typically, for efficiency, the range of nearby pages has already been
350562306a36Sopenharmony_ci * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
350662306a36Sopenharmony_ci * truncation or invalidation holds the lock on a folio, it may find that
350762306a36Sopenharmony_ci * the page has been remapped again: and then uses unmap_mapping_folio()
350862306a36Sopenharmony_ci * to unmap it finally.
350962306a36Sopenharmony_ci */
351062306a36Sopenharmony_civoid unmap_mapping_folio(struct folio *folio)
351162306a36Sopenharmony_ci{
351262306a36Sopenharmony_ci	struct address_space *mapping = folio->mapping;
351362306a36Sopenharmony_ci	struct zap_details details = { };
351462306a36Sopenharmony_ci	pgoff_t	first_index;
351562306a36Sopenharmony_ci	pgoff_t	last_index;
351662306a36Sopenharmony_ci
351762306a36Sopenharmony_ci	VM_BUG_ON(!folio_test_locked(folio));
351862306a36Sopenharmony_ci
351962306a36Sopenharmony_ci	first_index = folio->index;
352062306a36Sopenharmony_ci	last_index = folio_next_index(folio) - 1;
352162306a36Sopenharmony_ci
352262306a36Sopenharmony_ci	details.even_cows = false;
352362306a36Sopenharmony_ci	details.single_folio = folio;
352462306a36Sopenharmony_ci	details.zap_flags = ZAP_FLAG_DROP_MARKER;
352562306a36Sopenharmony_ci
352662306a36Sopenharmony_ci	i_mmap_lock_read(mapping);
352762306a36Sopenharmony_ci	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
352862306a36Sopenharmony_ci		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
352962306a36Sopenharmony_ci					 last_index, &details);
353062306a36Sopenharmony_ci	i_mmap_unlock_read(mapping);
353162306a36Sopenharmony_ci}
353262306a36Sopenharmony_ci
353362306a36Sopenharmony_ci/**
353462306a36Sopenharmony_ci * unmap_mapping_pages() - Unmap pages from processes.
353562306a36Sopenharmony_ci * @mapping: The address space containing pages to be unmapped.
353662306a36Sopenharmony_ci * @start: Index of first page to be unmapped.
353762306a36Sopenharmony_ci * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
353862306a36Sopenharmony_ci * @even_cows: Whether to unmap even private COWed pages.
353962306a36Sopenharmony_ci *
354062306a36Sopenharmony_ci * Unmap the pages in this address space from any userspace process which
354162306a36Sopenharmony_ci * has them mmaped.  Generally, you want to remove COWed pages as well when
354262306a36Sopenharmony_ci * a file is being truncated, but not when invalidating pages from the page
354362306a36Sopenharmony_ci * cache.
354462306a36Sopenharmony_ci */
354562306a36Sopenharmony_civoid unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
354662306a36Sopenharmony_ci		pgoff_t nr, bool even_cows)
354762306a36Sopenharmony_ci{
354862306a36Sopenharmony_ci	struct zap_details details = { };
354962306a36Sopenharmony_ci	pgoff_t	first_index = start;
355062306a36Sopenharmony_ci	pgoff_t	last_index = start + nr - 1;
355162306a36Sopenharmony_ci
355262306a36Sopenharmony_ci	details.even_cows = even_cows;
355362306a36Sopenharmony_ci	if (last_index < first_index)
355462306a36Sopenharmony_ci		last_index = ULONG_MAX;
355562306a36Sopenharmony_ci
355662306a36Sopenharmony_ci	i_mmap_lock_read(mapping);
355762306a36Sopenharmony_ci	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
355862306a36Sopenharmony_ci		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
355962306a36Sopenharmony_ci					 last_index, &details);
356062306a36Sopenharmony_ci	i_mmap_unlock_read(mapping);
356162306a36Sopenharmony_ci}
356262306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(unmap_mapping_pages);
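
/*
 * Illustrative use only: invalidating a single page index from every mapping
 * of a file, without discarding private COW copies, would look like
 *
 *	unmap_mapping_pages(mapping, index, 1, false);
 *
 * while truncation-style callers pass even_cows == true so that stale
 * private copies of the truncated range are dropped as well.
 */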
356362306a36Sopenharmony_ci
356462306a36Sopenharmony_ci/**
356562306a36Sopenharmony_ci * unmap_mapping_range - unmap the portion of all mmaps in the specified
356662306a36Sopenharmony_ci * address_space corresponding to the specified byte range in the underlying
356762306a36Sopenharmony_ci * file.
356862306a36Sopenharmony_ci *
356962306a36Sopenharmony_ci * @mapping: the address space containing mmaps to be unmapped.
357062306a36Sopenharmony_ci * @holebegin: byte in first page to unmap, relative to the start of
357162306a36Sopenharmony_ci * the underlying file.  This will be rounded down to a PAGE_SIZE
357262306a36Sopenharmony_ci * boundary.  Note that this is different from truncate_pagecache(), which
357362306a36Sopenharmony_ci * must keep the partial page.  In contrast, we must get rid of
357462306a36Sopenharmony_ci * partial pages.
357562306a36Sopenharmony_ci * @holelen: size of prospective hole in bytes.  This will be rounded
357662306a36Sopenharmony_ci * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
357762306a36Sopenharmony_ci * end of the file.
357862306a36Sopenharmony_ci * @even_cows: 1 when truncating a file: unmap even private COWed pages;
357962306a36Sopenharmony_ci * 0 when invalidating pagecache: don't throw away private data.
358062306a36Sopenharmony_ci */
358162306a36Sopenharmony_civoid unmap_mapping_range(struct address_space *mapping,
358262306a36Sopenharmony_ci		loff_t const holebegin, loff_t const holelen, int even_cows)
358362306a36Sopenharmony_ci{
358462306a36Sopenharmony_ci	pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
358562306a36Sopenharmony_ci	pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT;
358662306a36Sopenharmony_ci
358762306a36Sopenharmony_ci	/* Check for overflow. */
358862306a36Sopenharmony_ci	if (sizeof(holelen) > sizeof(hlen)) {
358962306a36Sopenharmony_ci		long long holeend =
359062306a36Sopenharmony_ci			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
359162306a36Sopenharmony_ci		if (holeend & ~(long long)ULONG_MAX)
359262306a36Sopenharmony_ci			hlen = ULONG_MAX - hba + 1;
359362306a36Sopenharmony_ci	}
359462306a36Sopenharmony_ci
359562306a36Sopenharmony_ci	unmap_mapping_pages(mapping, hba, hlen, even_cows);
359662306a36Sopenharmony_ci}
359762306a36Sopenharmony_ciEXPORT_SYMBOL(unmap_mapping_range);
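
/*
 * Illustrative caller pattern, roughly what truncate_pagecache() does when a
 * file shrinks (sketch only, see mm/truncate.c for the real thing):
 *
 *	unmap_mapping_range(mapping, holebegin, 0, 1);
 *	truncate_inode_pages(mapping, newsize);
 *	unmap_mapping_range(mapping, holebegin, 0, 1);
 *
 * The second unmap catches pages that raced back in between the first unmap
 * and the truncate.
 */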
359862306a36Sopenharmony_ci
359962306a36Sopenharmony_ci/*
360062306a36Sopenharmony_ci * Restore a potential device exclusive pte to a working pte entry
360162306a36Sopenharmony_ci */
360262306a36Sopenharmony_cistatic vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
360362306a36Sopenharmony_ci{
360462306a36Sopenharmony_ci	struct folio *folio = page_folio(vmf->page);
360562306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
360662306a36Sopenharmony_ci	struct mmu_notifier_range range;
360762306a36Sopenharmony_ci	vm_fault_t ret;
360862306a36Sopenharmony_ci
360962306a36Sopenharmony_ci	/*
361062306a36Sopenharmony_ci	 * We need a reference to lock the folio because we don't hold
361162306a36Sopenharmony_ci	 * the PTL so a racing thread can remove the device-exclusive
361262306a36Sopenharmony_ci	 * entry and unmap it. If the folio is free the entry must
361362306a36Sopenharmony_ci	 * have been removed already. If it happens to have already
361462306a36Sopenharmony_ci	 * been re-allocated after being freed all we do is lock and
361562306a36Sopenharmony_ci	 * unlock it.
361662306a36Sopenharmony_ci	 */
361762306a36Sopenharmony_ci	if (!folio_try_get(folio))
361862306a36Sopenharmony_ci		return 0;
361962306a36Sopenharmony_ci
362062306a36Sopenharmony_ci	ret = folio_lock_or_retry(folio, vmf);
362162306a36Sopenharmony_ci	if (ret) {
362262306a36Sopenharmony_ci		folio_put(folio);
362362306a36Sopenharmony_ci		return ret;
362462306a36Sopenharmony_ci	}
362562306a36Sopenharmony_ci	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
362662306a36Sopenharmony_ci				vma->vm_mm, vmf->address & PAGE_MASK,
362762306a36Sopenharmony_ci				(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
362862306a36Sopenharmony_ci	mmu_notifier_invalidate_range_start(&range);
362962306a36Sopenharmony_ci
363062306a36Sopenharmony_ci	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
363162306a36Sopenharmony_ci				&vmf->ptl);
363262306a36Sopenharmony_ci	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
363362306a36Sopenharmony_ci		restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
363462306a36Sopenharmony_ci
363562306a36Sopenharmony_ci	if (vmf->pte)
363662306a36Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
363762306a36Sopenharmony_ci	folio_unlock(folio);
363862306a36Sopenharmony_ci	folio_put(folio);
363962306a36Sopenharmony_ci
364062306a36Sopenharmony_ci	mmu_notifier_invalidate_range_end(&range);
364162306a36Sopenharmony_ci	return 0;
364262306a36Sopenharmony_ci}
364362306a36Sopenharmony_ci
364462306a36Sopenharmony_cistatic inline bool should_try_to_free_swap(struct folio *folio,
364562306a36Sopenharmony_ci					   struct vm_area_struct *vma,
364662306a36Sopenharmony_ci					   unsigned int fault_flags)
364762306a36Sopenharmony_ci{
364862306a36Sopenharmony_ci	if (!folio_test_swapcache(folio))
364962306a36Sopenharmony_ci		return false;
365062306a36Sopenharmony_ci	if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) ||
365162306a36Sopenharmony_ci	    folio_test_mlocked(folio))
365262306a36Sopenharmony_ci		return true;
365362306a36Sopenharmony_ci	/*
365462306a36Sopenharmony_ci	 * If we want to map a page that's in the swapcache writable, we
365562306a36Sopenharmony_ci	 * have to detect via the refcount if we're really the exclusive
365662306a36Sopenharmony_ci	 * user. Try freeing the swapcache to get rid of the swapcache
365762306a36Sopenharmony_ci	 * reference only in case it's likely that we'll be the exclusive user.
365862306a36Sopenharmony_ci	 */
365962306a36Sopenharmony_ci	return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
366062306a36Sopenharmony_ci		folio_ref_count(folio) == 2;
366162306a36Sopenharmony_ci}
366262306a36Sopenharmony_ci
366362306a36Sopenharmony_cistatic vm_fault_t pte_marker_clear(struct vm_fault *vmf)
366462306a36Sopenharmony_ci{
366562306a36Sopenharmony_ci	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
366662306a36Sopenharmony_ci				       vmf->address, &vmf->ptl);
366762306a36Sopenharmony_ci	if (!vmf->pte)
366862306a36Sopenharmony_ci		return 0;
366962306a36Sopenharmony_ci	/*
367062306a36Sopenharmony_ci	 * Be careful to only recover a special uffd-wp pte into a none pte;
367162306a36Sopenharmony_ci	 * anything else means the pte could have changed, so retry.
367262306a36Sopenharmony_ci	 *
367362306a36Sopenharmony_ci	 * This should also cover the case where e.g. the pte changed quickly
367462306a36Sopenharmony_ci	 * from PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED, so a bare
367562306a36Sopenharmony_ci	 * is_pte_marker() check is not enough to safely drop the pte.
367662306a36Sopenharmony_ci	 */
367762306a36Sopenharmony_ci	if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
367862306a36Sopenharmony_ci		pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
367962306a36Sopenharmony_ci	pte_unmap_unlock(vmf->pte, vmf->ptl);
368062306a36Sopenharmony_ci	return 0;
368162306a36Sopenharmony_ci}
368262306a36Sopenharmony_ci
368362306a36Sopenharmony_cistatic vm_fault_t do_pte_missing(struct vm_fault *vmf)
368462306a36Sopenharmony_ci{
368562306a36Sopenharmony_ci	if (vma_is_anonymous(vmf->vma))
368662306a36Sopenharmony_ci		return do_anonymous_page(vmf);
368762306a36Sopenharmony_ci	else
368862306a36Sopenharmony_ci		return do_fault(vmf);
368962306a36Sopenharmony_ci}
369062306a36Sopenharmony_ci
369162306a36Sopenharmony_ci/*
369262306a36Sopenharmony_ci * This is actually a page-missing access, but with uffd-wp special pte
369362306a36Sopenharmony_ci * installed.  It means this pte was wr-protected before being unmapped.
369462306a36Sopenharmony_ci */
369562306a36Sopenharmony_cistatic vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
369662306a36Sopenharmony_ci{
369762306a36Sopenharmony_ci	/*
369862306a36Sopenharmony_ci	 * Just in case there are leftover special ptes even after the region
369962306a36Sopenharmony_ci	 * got unregistered, we can simply clear them.
370062306a36Sopenharmony_ci	 */
370162306a36Sopenharmony_ci	if (unlikely(!userfaultfd_wp(vmf->vma)))
370262306a36Sopenharmony_ci		return pte_marker_clear(vmf);
370362306a36Sopenharmony_ci
370462306a36Sopenharmony_ci	return do_pte_missing(vmf);
370562306a36Sopenharmony_ci}
370662306a36Sopenharmony_ci
370762306a36Sopenharmony_cistatic vm_fault_t handle_pte_marker(struct vm_fault *vmf)
370862306a36Sopenharmony_ci{
370962306a36Sopenharmony_ci	swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
371062306a36Sopenharmony_ci	unsigned long marker = pte_marker_get(entry);
371162306a36Sopenharmony_ci
371262306a36Sopenharmony_ci	/*
371362306a36Sopenharmony_ci	 * PTE markers should never be empty.  If anything weird happened,
371462306a36Sopenharmony_ci	 * the best thing to do is to kill the process along with its mm.
371562306a36Sopenharmony_ci	 */
371662306a36Sopenharmony_ci	if (WARN_ON_ONCE(!marker))
371762306a36Sopenharmony_ci		return VM_FAULT_SIGBUS;
371862306a36Sopenharmony_ci
371962306a36Sopenharmony_ci	/* Higher priority than uffd-wp when data corrupted */
372062306a36Sopenharmony_ci	if (marker & PTE_MARKER_POISONED)
372162306a36Sopenharmony_ci		return VM_FAULT_HWPOISON;
372262306a36Sopenharmony_ci
372362306a36Sopenharmony_ci	if (pte_marker_entry_uffd_wp(entry))
372462306a36Sopenharmony_ci		return pte_marker_handle_uffd_wp(vmf);
372562306a36Sopenharmony_ci
372662306a36Sopenharmony_ci	/* This is an unknown pte marker */
372762306a36Sopenharmony_ci	return VM_FAULT_SIGBUS;
372862306a36Sopenharmony_ci}
372962306a36Sopenharmony_ci
373062306a36Sopenharmony_ci/*
373162306a36Sopenharmony_ci * We enter with non-exclusive mmap_lock (to exclude vma changes,
373262306a36Sopenharmony_ci * but allow concurrent faults), and pte mapped but not yet locked.
373362306a36Sopenharmony_ci * We return with pte unmapped and unlocked.
373462306a36Sopenharmony_ci *
373562306a36Sopenharmony_ci * We return with the mmap_lock locked or unlocked in the same cases
373662306a36Sopenharmony_ci * as does filemap_fault().
373762306a36Sopenharmony_ci */
373862306a36Sopenharmony_civm_fault_t do_swap_page(struct vm_fault *vmf)
373962306a36Sopenharmony_ci{
374062306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
374162306a36Sopenharmony_ci	struct folio *swapcache, *folio = NULL;
374262306a36Sopenharmony_ci	struct page *page;
374362306a36Sopenharmony_ci	struct swap_info_struct *si = NULL;
374462306a36Sopenharmony_ci	rmap_t rmap_flags = RMAP_NONE;
374562306a36Sopenharmony_ci	bool need_clear_cache = false;
374662306a36Sopenharmony_ci	bool exclusive = false;
374762306a36Sopenharmony_ci	swp_entry_t entry;
374862306a36Sopenharmony_ci	pte_t pte;
374962306a36Sopenharmony_ci	vm_fault_t ret = 0;
375062306a36Sopenharmony_ci	void *shadow = NULL;
375162306a36Sopenharmony_ci
375262306a36Sopenharmony_ci	if (!pte_unmap_same(vmf))
375362306a36Sopenharmony_ci		goto out;
375462306a36Sopenharmony_ci
375562306a36Sopenharmony_ci	entry = pte_to_swp_entry(vmf->orig_pte);
375662306a36Sopenharmony_ci	if (unlikely(non_swap_entry(entry))) {
375762306a36Sopenharmony_ci		if (is_migration_entry(entry)) {
375862306a36Sopenharmony_ci			migration_entry_wait(vma->vm_mm, vmf->pmd,
375962306a36Sopenharmony_ci					     vmf->address);
376062306a36Sopenharmony_ci		} else if (is_device_exclusive_entry(entry)) {
376162306a36Sopenharmony_ci			vmf->page = pfn_swap_entry_to_page(entry);
376262306a36Sopenharmony_ci			ret = remove_device_exclusive_entry(vmf);
376362306a36Sopenharmony_ci		} else if (is_device_private_entry(entry)) {
376462306a36Sopenharmony_ci			if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
376562306a36Sopenharmony_ci				/*
376662306a36Sopenharmony_ci				 * migrate_to_ram is not yet ready to operate
376762306a36Sopenharmony_ci				 * under VMA lock.
376862306a36Sopenharmony_ci				 */
376962306a36Sopenharmony_ci				vma_end_read(vma);
377062306a36Sopenharmony_ci				ret = VM_FAULT_RETRY;
377162306a36Sopenharmony_ci				goto out;
377262306a36Sopenharmony_ci			}
377362306a36Sopenharmony_ci
377462306a36Sopenharmony_ci			vmf->page = pfn_swap_entry_to_page(entry);
377562306a36Sopenharmony_ci			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
377662306a36Sopenharmony_ci					vmf->address, &vmf->ptl);
377762306a36Sopenharmony_ci			if (unlikely(!vmf->pte ||
377862306a36Sopenharmony_ci				     !pte_same(ptep_get(vmf->pte),
377962306a36Sopenharmony_ci							vmf->orig_pte)))
378062306a36Sopenharmony_ci				goto unlock;
378162306a36Sopenharmony_ci
378262306a36Sopenharmony_ci			/*
378362306a36Sopenharmony_ci			 * Get a page reference while we know the page can't be
378462306a36Sopenharmony_ci			 * freed.
378562306a36Sopenharmony_ci			 */
378662306a36Sopenharmony_ci			get_page(vmf->page);
378762306a36Sopenharmony_ci			pte_unmap_unlock(vmf->pte, vmf->ptl);
378862306a36Sopenharmony_ci			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
378962306a36Sopenharmony_ci			put_page(vmf->page);
379062306a36Sopenharmony_ci		} else if (is_hwpoison_entry(entry)) {
379162306a36Sopenharmony_ci			ret = VM_FAULT_HWPOISON;
379262306a36Sopenharmony_ci		} else if (is_pte_marker_entry(entry)) {
379362306a36Sopenharmony_ci			ret = handle_pte_marker(vmf);
379462306a36Sopenharmony_ci		} else {
379562306a36Sopenharmony_ci			print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
379662306a36Sopenharmony_ci			ret = VM_FAULT_SIGBUS;
379762306a36Sopenharmony_ci		}
379862306a36Sopenharmony_ci		goto out;
379962306a36Sopenharmony_ci	}
380062306a36Sopenharmony_ci
380162306a36Sopenharmony_ci	/* Prevent swapoff from happening to us. */
380262306a36Sopenharmony_ci	si = get_swap_device(entry);
380362306a36Sopenharmony_ci	if (unlikely(!si))
380462306a36Sopenharmony_ci		goto out;
380562306a36Sopenharmony_ci
380662306a36Sopenharmony_ci	folio = swap_cache_get_folio(entry, vma, vmf->address);
380762306a36Sopenharmony_ci	if (folio)
380862306a36Sopenharmony_ci		page = folio_file_page(folio, swp_offset(entry));
380962306a36Sopenharmony_ci	swapcache = folio;
381062306a36Sopenharmony_ci
381162306a36Sopenharmony_ci	if (!folio) {
381262306a36Sopenharmony_ci		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
381362306a36Sopenharmony_ci		    __swap_count(entry) == 1) {
381462306a36Sopenharmony_ci			/*
381562306a36Sopenharmony_ci			 * Prevent parallel swapin from proceeding with
381662306a36Sopenharmony_ci			 * the cache flag. Otherwise, another thread may
381762306a36Sopenharmony_ci			 * finish swapin first, free the entry, and swapout
381862306a36Sopenharmony_ci			 * reusing the same entry. It's undetectable as
381962306a36Sopenharmony_ci			 * pte_same() returns true due to entry reuse.
382062306a36Sopenharmony_ci			 */
382162306a36Sopenharmony_ci			if (swapcache_prepare(entry)) {
382262306a36Sopenharmony_ci				/* Relax a bit to prevent rapid repeated page faults */
382362306a36Sopenharmony_ci				schedule_timeout_uninterruptible(1);
382462306a36Sopenharmony_ci				goto out;
382562306a36Sopenharmony_ci			}
382662306a36Sopenharmony_ci			need_clear_cache = true;
382762306a36Sopenharmony_ci
382862306a36Sopenharmony_ci			/* skip swapcache */
382962306a36Sopenharmony_ci			folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
383062306a36Sopenharmony_ci						vma, vmf->address, false);
383162306a36Sopenharmony_ci			page = &folio->page;
383262306a36Sopenharmony_ci			if (folio) {
383362306a36Sopenharmony_ci				__folio_set_locked(folio);
383462306a36Sopenharmony_ci				__folio_set_swapbacked(folio);
383562306a36Sopenharmony_ci
383662306a36Sopenharmony_ci				if (mem_cgroup_swapin_charge_folio(folio,
383762306a36Sopenharmony_ci							vma->vm_mm, GFP_KERNEL,
383862306a36Sopenharmony_ci							entry)) {
383962306a36Sopenharmony_ci					ret = VM_FAULT_OOM;
384062306a36Sopenharmony_ci					goto out_page;
384162306a36Sopenharmony_ci				}
384262306a36Sopenharmony_ci				mem_cgroup_swapin_uncharge_swap(entry);
384362306a36Sopenharmony_ci
384462306a36Sopenharmony_ci				shadow = get_shadow_from_swap_cache(entry);
384562306a36Sopenharmony_ci				if (shadow)
384662306a36Sopenharmony_ci					workingset_refault(folio, shadow);
384762306a36Sopenharmony_ci
384862306a36Sopenharmony_ci				folio_add_lru(folio);
384962306a36Sopenharmony_ci
385062306a36Sopenharmony_ci				/* To provide entry to swap_readpage() */
385162306a36Sopenharmony_ci				folio->swap = entry;
385262306a36Sopenharmony_ci				swap_readpage(page, true, NULL);
385362306a36Sopenharmony_ci				folio->private = NULL;
385462306a36Sopenharmony_ci			}
385562306a36Sopenharmony_ci		} else {
385662306a36Sopenharmony_ci			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
385762306a36Sopenharmony_ci						vmf);
385862306a36Sopenharmony_ci			if (page)
385962306a36Sopenharmony_ci				folio = page_folio(page);
386062306a36Sopenharmony_ci			swapcache = folio;
386162306a36Sopenharmony_ci		}
386262306a36Sopenharmony_ci
386362306a36Sopenharmony_ci		if (!folio) {
386462306a36Sopenharmony_ci			/*
386562306a36Sopenharmony_ci			 * Back out if somebody else faulted in this pte
386662306a36Sopenharmony_ci			 * while we released the pte lock.
386762306a36Sopenharmony_ci			 */
386862306a36Sopenharmony_ci			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
386962306a36Sopenharmony_ci					vmf->address, &vmf->ptl);
387062306a36Sopenharmony_ci			if (likely(vmf->pte &&
387162306a36Sopenharmony_ci				   pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
387262306a36Sopenharmony_ci				ret = VM_FAULT_OOM;
387362306a36Sopenharmony_ci			goto unlock;
387462306a36Sopenharmony_ci		}
387562306a36Sopenharmony_ci
387662306a36Sopenharmony_ci		/* Had to read the page from swap area: Major fault */
387762306a36Sopenharmony_ci		ret = VM_FAULT_MAJOR;
387862306a36Sopenharmony_ci		count_vm_event(PGMAJFAULT);
387962306a36Sopenharmony_ci		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
388062306a36Sopenharmony_ci	} else if (PageHWPoison(page)) {
388162306a36Sopenharmony_ci		/*
388262306a36Sopenharmony_ci		 * hwpoisoned dirty swapcache pages are kept for killing
388362306a36Sopenharmony_ci		 * owner processes (which may be unknown at hwpoison time)
388462306a36Sopenharmony_ci		 */
388562306a36Sopenharmony_ci		ret = VM_FAULT_HWPOISON;
388662306a36Sopenharmony_ci		goto out_release;
388762306a36Sopenharmony_ci	}
388862306a36Sopenharmony_ci
388962306a36Sopenharmony_ci	ret |= folio_lock_or_retry(folio, vmf);
389062306a36Sopenharmony_ci	if (ret & VM_FAULT_RETRY)
389162306a36Sopenharmony_ci		goto out_release;
389262306a36Sopenharmony_ci
389362306a36Sopenharmony_ci	if (swapcache) {
389462306a36Sopenharmony_ci		/*
389562306a36Sopenharmony_ci		 * Make sure folio_free_swap() or swapoff did not release the
389662306a36Sopenharmony_ci		 * swapcache from under us.  The page pin, and pte_same test
389762306a36Sopenharmony_ci		 * below, are not enough to exclude that.  Even if it is still
389862306a36Sopenharmony_ci		 * swapcache, we need to check that the page's swap has not
389962306a36Sopenharmony_ci		 * changed.
390062306a36Sopenharmony_ci		 */
390162306a36Sopenharmony_ci		if (unlikely(!folio_test_swapcache(folio) ||
390262306a36Sopenharmony_ci			     page_swap_entry(page).val != entry.val))
390362306a36Sopenharmony_ci			goto out_page;
390462306a36Sopenharmony_ci
390562306a36Sopenharmony_ci		/*
390662306a36Sopenharmony_ci		 * KSM sometimes has to copy on read faults, for example, if
390762306a36Sopenharmony_ci		 * page->index of !PageKSM() pages would be nonlinear inside the
390862306a36Sopenharmony_ci		 * anon VMA -- PageKSM() is lost on actual swapout.
390962306a36Sopenharmony_ci		 */
391062306a36Sopenharmony_ci		page = ksm_might_need_to_copy(page, vma, vmf->address);
391162306a36Sopenharmony_ci		if (unlikely(!page)) {
391262306a36Sopenharmony_ci			ret = VM_FAULT_OOM;
391362306a36Sopenharmony_ci			goto out_page;
391462306a36Sopenharmony_ci		} else if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
391562306a36Sopenharmony_ci			ret = VM_FAULT_HWPOISON;
391662306a36Sopenharmony_ci			goto out_page;
391762306a36Sopenharmony_ci		}
391862306a36Sopenharmony_ci		folio = page_folio(page);
391962306a36Sopenharmony_ci
392062306a36Sopenharmony_ci		/*
392162306a36Sopenharmony_ci		 * If we want to map a page that's in the swapcache writable, we
392262306a36Sopenharmony_ci		 * have to detect via the refcount if we're really the exclusive
392362306a36Sopenharmony_ci		 * owner. Try removing the extra reference from the local LRU
392462306a36Sopenharmony_ci		 * caches if required.
392562306a36Sopenharmony_ci		 */
392662306a36Sopenharmony_ci		if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
392762306a36Sopenharmony_ci		    !folio_test_ksm(folio) && !folio_test_lru(folio))
392862306a36Sopenharmony_ci			lru_add_drain();
392962306a36Sopenharmony_ci	}
393062306a36Sopenharmony_ci
393162306a36Sopenharmony_ci	folio_throttle_swaprate(folio, GFP_KERNEL);
393262306a36Sopenharmony_ci
393362306a36Sopenharmony_ci	/*
393462306a36Sopenharmony_ci	 * Back out if somebody else already faulted in this pte.
393562306a36Sopenharmony_ci	 */
393662306a36Sopenharmony_ci	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
393762306a36Sopenharmony_ci			&vmf->ptl);
393862306a36Sopenharmony_ci	if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
393962306a36Sopenharmony_ci		goto out_nomap;
394062306a36Sopenharmony_ci
394162306a36Sopenharmony_ci	if (unlikely(!folio_test_uptodate(folio))) {
394262306a36Sopenharmony_ci		ret = VM_FAULT_SIGBUS;
394362306a36Sopenharmony_ci		goto out_nomap;
394462306a36Sopenharmony_ci	}
394562306a36Sopenharmony_ci
394662306a36Sopenharmony_ci	/*
394762306a36Sopenharmony_ci	 * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
394862306a36Sopenharmony_ci	 * must never point at an anonymous page in the swapcache that is
394962306a36Sopenharmony_ci	 * PG_anon_exclusive. Sanity check that this holds and especially, that
395062306a36Sopenharmony_ci	 * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
395162306a36Sopenharmony_ci	 * check after taking the PT lock and making sure that nobody
395262306a36Sopenharmony_ci	 * concurrently faulted in this page and set PG_anon_exclusive.
395362306a36Sopenharmony_ci	 */
395462306a36Sopenharmony_ci	BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
395562306a36Sopenharmony_ci	BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
395662306a36Sopenharmony_ci
395762306a36Sopenharmony_ci	/*
395862306a36Sopenharmony_ci	 * Check under PT lock (to protect against concurrent fork() sharing
395962306a36Sopenharmony_ci	 * the swap entry concurrently) for certainly exclusive pages.
396062306a36Sopenharmony_ci	 */
396162306a36Sopenharmony_ci	if (!folio_test_ksm(folio)) {
396262306a36Sopenharmony_ci		exclusive = pte_swp_exclusive(vmf->orig_pte);
396362306a36Sopenharmony_ci		if (folio != swapcache) {
396462306a36Sopenharmony_ci			/*
396562306a36Sopenharmony_ci			 * We have a fresh page that is not exposed to the
396662306a36Sopenharmony_ci			 * swapcache -> certainly exclusive.
396762306a36Sopenharmony_ci			 */
396862306a36Sopenharmony_ci			exclusive = true;
396962306a36Sopenharmony_ci		} else if (exclusive && folio_test_writeback(folio) &&
397062306a36Sopenharmony_ci			  data_race(si->flags & SWP_STABLE_WRITES)) {
397162306a36Sopenharmony_ci			/*
397262306a36Sopenharmony_ci			 * This is tricky: not all swap backends support
397362306a36Sopenharmony_ci			 * concurrent page modifications while under writeback.
397462306a36Sopenharmony_ci			 *
397562306a36Sopenharmony_ci			 * So if we stumble over such a page in the swapcache
397662306a36Sopenharmony_ci			 * we must not set the page exclusive, otherwise we can
397762306a36Sopenharmony_ci			 * map it writable without further checks and modify it
397862306a36Sopenharmony_ci			 * while still under writeback.
397962306a36Sopenharmony_ci			 *
398062306a36Sopenharmony_ci			 * For these problematic swap backends, simply drop the
398162306a36Sopenharmony_ci			 * exclusive marker: this is perfectly fine as we start
398262306a36Sopenharmony_ci			 * writeback only if we fully unmapped the page and
398362306a36Sopenharmony_ci			 * there are no unexpected references on the page after
398462306a36Sopenharmony_ci			 * unmapping succeeded. After fully unmapped, no
398562306a36Sopenharmony_ci			 * further GUP references (FOLL_GET and FOLL_PIN) can
398662306a36Sopenharmony_ci			 * appear, so dropping the exclusive marker and mapping
398762306a36Sopenharmony_ci			 * it only R/O is fine.
398862306a36Sopenharmony_ci			 */
398962306a36Sopenharmony_ci			exclusive = false;
399062306a36Sopenharmony_ci		}
399162306a36Sopenharmony_ci	}
399262306a36Sopenharmony_ci
399362306a36Sopenharmony_ci	/*
399462306a36Sopenharmony_ci	 * Some architectures may have to restore extra metadata to the page
399562306a36Sopenharmony_ci	 * when reading from swap. This metadata may be indexed by swap entry
399662306a36Sopenharmony_ci	 * so this must be called before swap_free().
399762306a36Sopenharmony_ci	 */
399862306a36Sopenharmony_ci	arch_swap_restore(entry, folio);
399962306a36Sopenharmony_ci
400062306a36Sopenharmony_ci	/*
400162306a36Sopenharmony_ci	 * Remove the swap entry and conditionally try to free up the swapcache.
400262306a36Sopenharmony_ci	 * We're already holding a reference on the page but haven't mapped it
400362306a36Sopenharmony_ci	 * yet.
400462306a36Sopenharmony_ci	 */
400562306a36Sopenharmony_ci	swap_free(entry);
400662306a36Sopenharmony_ci	if (should_try_to_free_swap(folio, vma, vmf->flags))
400762306a36Sopenharmony_ci		folio_free_swap(folio);
400862306a36Sopenharmony_ci
400962306a36Sopenharmony_ci	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
401062306a36Sopenharmony_ci	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
401162306a36Sopenharmony_ci	pte = mk_pte(page, vma->vm_page_prot);
401262306a36Sopenharmony_ci
401362306a36Sopenharmony_ci	/*
401462306a36Sopenharmony_ci	 * Same logic as in do_wp_page(); however, optimize for pages that are
401562306a36Sopenharmony_ci	 * certainly not shared either because we just allocated them without
401662306a36Sopenharmony_ci	 * exposing them to the swapcache or because the swap entry indicates
401762306a36Sopenharmony_ci	 * exclusivity.
401862306a36Sopenharmony_ci	 */
401962306a36Sopenharmony_ci	if (!folio_test_ksm(folio) &&
402062306a36Sopenharmony_ci	    (exclusive || folio_ref_count(folio) == 1)) {
402162306a36Sopenharmony_ci		if (vmf->flags & FAULT_FLAG_WRITE) {
402262306a36Sopenharmony_ci			pte = maybe_mkwrite(pte_mkdirty(pte), vma);
402362306a36Sopenharmony_ci			vmf->flags &= ~FAULT_FLAG_WRITE;
402462306a36Sopenharmony_ci		}
402562306a36Sopenharmony_ci		rmap_flags |= RMAP_EXCLUSIVE;
402662306a36Sopenharmony_ci	}
402762306a36Sopenharmony_ci	flush_icache_page(vma, page);
402862306a36Sopenharmony_ci	if (pte_swp_soft_dirty(vmf->orig_pte))
402962306a36Sopenharmony_ci		pte = pte_mksoft_dirty(pte);
403062306a36Sopenharmony_ci	if (pte_swp_uffd_wp(vmf->orig_pte))
403162306a36Sopenharmony_ci		pte = pte_mkuffd_wp(pte);
403262306a36Sopenharmony_ci	vmf->orig_pte = pte;
403362306a36Sopenharmony_ci
403462306a36Sopenharmony_ci	/* ksm created a completely new copy */
403562306a36Sopenharmony_ci	if (unlikely(folio != swapcache && swapcache)) {
403662306a36Sopenharmony_ci		page_add_new_anon_rmap(page, vma, vmf->address);
403762306a36Sopenharmony_ci		folio_add_lru_vma(folio, vma);
403862306a36Sopenharmony_ci	} else {
403962306a36Sopenharmony_ci		page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
404062306a36Sopenharmony_ci	}
404162306a36Sopenharmony_ci
404262306a36Sopenharmony_ci	VM_BUG_ON(!folio_test_anon(folio) ||
404362306a36Sopenharmony_ci			(pte_write(pte) && !PageAnonExclusive(page)));
404462306a36Sopenharmony_ci	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
404562306a36Sopenharmony_ci	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
404662306a36Sopenharmony_ci
404762306a36Sopenharmony_ci	folio_unlock(folio);
404862306a36Sopenharmony_ci	if (folio != swapcache && swapcache) {
404962306a36Sopenharmony_ci		/*
405062306a36Sopenharmony_ci		 * Hold the lock to prevent the swap entry from being reused
405162306a36Sopenharmony_ci		 * until we take the PT lock for the pte_same() check
405262306a36Sopenharmony_ci		 * (to avoid false positives from pte_same). For further
405362306a36Sopenharmony_ci		 * safety, release the lock after the swap_free so that the
405462306a36Sopenharmony_ci		 * swap count won't change under a parallel locked swapcache.
405662306a36Sopenharmony_ci		 */
405762306a36Sopenharmony_ci		folio_unlock(swapcache);
405862306a36Sopenharmony_ci		folio_put(swapcache);
405962306a36Sopenharmony_ci	}
406062306a36Sopenharmony_ci
406162306a36Sopenharmony_ci	if (vmf->flags & FAULT_FLAG_WRITE) {
406262306a36Sopenharmony_ci		ret |= do_wp_page(vmf);
406362306a36Sopenharmony_ci		if (ret & VM_FAULT_ERROR)
406462306a36Sopenharmony_ci			ret &= VM_FAULT_ERROR;
406562306a36Sopenharmony_ci		goto out;
406662306a36Sopenharmony_ci	}
406762306a36Sopenharmony_ci
406862306a36Sopenharmony_ci	/* No need to invalidate - it was non-present before */
406962306a36Sopenharmony_ci	update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
407062306a36Sopenharmony_ciunlock:
407162306a36Sopenharmony_ci	if (vmf->pte)
407262306a36Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
407362306a36Sopenharmony_ciout:
407462306a36Sopenharmony_ci	/* Clear the swap cache pin for direct swapin after PTL unlock */
407562306a36Sopenharmony_ci	if (need_clear_cache)
407662306a36Sopenharmony_ci		swapcache_clear(si, entry);
407762306a36Sopenharmony_ci	if (si)
407862306a36Sopenharmony_ci		put_swap_device(si);
407962306a36Sopenharmony_ci	return ret;
408062306a36Sopenharmony_ciout_nomap:
408162306a36Sopenharmony_ci	if (vmf->pte)
408262306a36Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
408362306a36Sopenharmony_ciout_page:
408462306a36Sopenharmony_ci	folio_unlock(folio);
408562306a36Sopenharmony_ciout_release:
408662306a36Sopenharmony_ci	folio_put(folio);
408762306a36Sopenharmony_ci	if (folio != swapcache && swapcache) {
408862306a36Sopenharmony_ci		folio_unlock(swapcache);
408962306a36Sopenharmony_ci		folio_put(swapcache);
409062306a36Sopenharmony_ci	}
409162306a36Sopenharmony_ci	if (need_clear_cache)
409262306a36Sopenharmony_ci		swapcache_clear(si, entry);
409362306a36Sopenharmony_ci	if (si)
409462306a36Sopenharmony_ci		put_swap_device(si);
409562306a36Sopenharmony_ci	return ret;
409662306a36Sopenharmony_ci}
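
/*
 * Illustrative summary (sketch only) of the swapcache-bypass fast path that
 * do_swap_page() uses above for SWP_SYNCHRONOUS_IO devices when the swap
 * count is one:
 *
 *	if (swapcache_prepare(entry))		// pin the entry against reuse
 *		retry later;
 *	folio = vma_alloc_folio(...);		// private folio, never in swapcache
 *	swap_readpage(page, true, NULL);	// synchronous read from swap
 *	... map it under the PT lock ...
 *	swapcache_clear(si, entry);		// drop the pin after PTL unlock
 *
 * The swapcache_prepare()/swapcache_clear() pair is what prevents a parallel
 * fault from freeing and reusing the same swap entry while the read is in
 * flight.
 */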
409762306a36Sopenharmony_ci
409862306a36Sopenharmony_ci/*
409962306a36Sopenharmony_ci * We enter with non-exclusive mmap_lock (to exclude vma changes,
410062306a36Sopenharmony_ci * but allow concurrent faults), and pte mapped but not yet locked.
410162306a36Sopenharmony_ci * We return with mmap_lock still held, but pte unmapped and unlocked.
410262306a36Sopenharmony_ci */
410362306a36Sopenharmony_cistatic vm_fault_t do_anonymous_page(struct vm_fault *vmf)
410462306a36Sopenharmony_ci{
410562306a36Sopenharmony_ci	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
410662306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
410762306a36Sopenharmony_ci	struct folio *folio;
410862306a36Sopenharmony_ci	vm_fault_t ret = 0;
410962306a36Sopenharmony_ci	pte_t entry;
411062306a36Sopenharmony_ci
411162306a36Sopenharmony_ci	/* File mapping without ->vm_ops ? */
411262306a36Sopenharmony_ci	if (vma->vm_flags & VM_SHARED)
411362306a36Sopenharmony_ci		return VM_FAULT_SIGBUS;
411462306a36Sopenharmony_ci
411562306a36Sopenharmony_ci	/*
411662306a36Sopenharmony_ci	 * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
411762306a36Sopenharmony_ci	 * be distinguished from a transient failure of pte_offset_map().
411862306a36Sopenharmony_ci	 */
411962306a36Sopenharmony_ci	if (pte_alloc(vma->vm_mm, vmf->pmd))
412062306a36Sopenharmony_ci		return VM_FAULT_OOM;
412162306a36Sopenharmony_ci
412262306a36Sopenharmony_ci#ifdef CONFIG_MEM_PURGEABLE
412362306a36Sopenharmony_ci	/* use extra page table for userexpte */
412462306a36Sopenharmony_ci	if (vma->vm_flags & VM_USEREXPTE) {
412562306a36Sopenharmony_ci		if (do_uxpte_page_fault(vmf, &entry))
412662306a36Sopenharmony_ci			goto oom;
412762306a36Sopenharmony_ci		else
412862306a36Sopenharmony_ci			goto got_page;
412962306a36Sopenharmony_ci	}
413062306a36Sopenharmony_ci#endif
413162306a36Sopenharmony_ci	/* Use the zero-page for reads */
413262306a36Sopenharmony_ci	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
413362306a36Sopenharmony_ci			!mm_forbids_zeropage(vma->vm_mm)) {
413462306a36Sopenharmony_ci		entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
413562306a36Sopenharmony_ci						vma->vm_page_prot));
413662306a36Sopenharmony_ci#ifdef CONFIG_MEM_PURGEABLE
413762306a36Sopenharmony_cigot_page:
413862306a36Sopenharmony_ci#endif
413962306a36Sopenharmony_ci		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
414062306a36Sopenharmony_ci				vmf->address, &vmf->ptl);
414162306a36Sopenharmony_ci		if (!vmf->pte)
414262306a36Sopenharmony_ci			goto unlock;
414362306a36Sopenharmony_ci		if (vmf_pte_changed(vmf)) {
414462306a36Sopenharmony_ci			update_mmu_tlb(vma, vmf->address, vmf->pte);
414562306a36Sopenharmony_ci			goto unlock;
414662306a36Sopenharmony_ci		}
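		/* Don't install a PTE if the address space has become unstable (OOM reaper). */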
414762306a36Sopenharmony_ci		ret = check_stable_address_space(vma->vm_mm);
414862306a36Sopenharmony_ci		if (ret)
414962306a36Sopenharmony_ci			goto unlock;
415062306a36Sopenharmony_ci		/* Deliver the page fault to userland, check inside PT lock */
415162306a36Sopenharmony_ci		if (userfaultfd_missing(vma)) {
415262306a36Sopenharmony_ci			pte_unmap_unlock(vmf->pte, vmf->ptl);
415362306a36Sopenharmony_ci			return handle_userfault(vmf, VM_UFFD_MISSING);
415462306a36Sopenharmony_ci		}
415562306a36Sopenharmony_ci		goto setpte;
415662306a36Sopenharmony_ci	}
415762306a36Sopenharmony_ci
415862306a36Sopenharmony_ci	/* Allocate our own private page. */
415962306a36Sopenharmony_ci	if (unlikely(anon_vma_prepare(vma)))
416062306a36Sopenharmony_ci		goto oom;
416162306a36Sopenharmony_ci	folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
416262306a36Sopenharmony_ci	if (!folio)
416362306a36Sopenharmony_ci		goto oom;
416462306a36Sopenharmony_ci
416562306a36Sopenharmony_ci	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
416662306a36Sopenharmony_ci		goto oom_free_page;
416762306a36Sopenharmony_ci	folio_throttle_swaprate(folio, GFP_KERNEL);
416862306a36Sopenharmony_ci
416962306a36Sopenharmony_ci	/*
417062306a36Sopenharmony_ci	 * The memory barrier inside __folio_mark_uptodate makes sure that
417162306a36Sopenharmony_ci	 * preceding stores to the page contents become visible before
417262306a36Sopenharmony_ci	 * the set_pte_at() write.
417362306a36Sopenharmony_ci	 */
417462306a36Sopenharmony_ci	__folio_mark_uptodate(folio);
417562306a36Sopenharmony_ci
417662306a36Sopenharmony_ci	entry = mk_pte(&folio->page, vma->vm_page_prot);
417762306a36Sopenharmony_ci	entry = pte_sw_mkyoung(entry);
417862306a36Sopenharmony_ci	if (vma->vm_flags & VM_WRITE)
417962306a36Sopenharmony_ci		entry = pte_mkwrite(pte_mkdirty(entry), vma);
418062306a36Sopenharmony_ci
418162306a36Sopenharmony_ci	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
418262306a36Sopenharmony_ci			&vmf->ptl);
418362306a36Sopenharmony_ci	if (!vmf->pte)
418462306a36Sopenharmony_ci		goto release;
418562306a36Sopenharmony_ci	if (vmf_pte_changed(vmf)) {
418662306a36Sopenharmony_ci		update_mmu_tlb(vma, vmf->address, vmf->pte);
418762306a36Sopenharmony_ci		goto release;
418862306a36Sopenharmony_ci	}
418962306a36Sopenharmony_ci
419062306a36Sopenharmony_ci	ret = check_stable_address_space(vma->vm_mm);
419162306a36Sopenharmony_ci	if (ret)
419262306a36Sopenharmony_ci		goto release;
419362306a36Sopenharmony_ci
419462306a36Sopenharmony_ci	/* Deliver the page fault to userland, check inside PT lock */
419562306a36Sopenharmony_ci	if (userfaultfd_missing(vma)) {
419662306a36Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
419762306a36Sopenharmony_ci		folio_put(folio);
419862306a36Sopenharmony_ci		return handle_userfault(vmf, VM_UFFD_MISSING);
419962306a36Sopenharmony_ci	}
420062306a36Sopenharmony_ci
420162306a36Sopenharmony_ci	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
420262306a36Sopenharmony_ci	folio_add_new_anon_rmap(folio, vma, vmf->address);
420362306a36Sopenharmony_ci#ifdef CONFIG_MEM_PURGEABLE
420462306a36Sopenharmony_ci	if (vma->vm_flags & VM_PURGEABLE)
420562306a36Sopenharmony_ci		folio_set_purgeable(folio);
420662306a36Sopenharmony_ci#endif
420762306a36Sopenharmony_ci	folio_add_lru_vma(folio, vma);
420862306a36Sopenharmony_cisetpte:
420962306a36Sopenharmony_ci#ifdef CONFIG_MEM_PURGEABLE
421062306a36Sopenharmony_ci	if (vma->vm_flags & VM_PURGEABLE)
421162306a36Sopenharmony_ci		uxpte_set_present(vma, vmf->address);
421262306a36Sopenharmony_ci#endif
421362306a36Sopenharmony_ci	if (uffd_wp)
421462306a36Sopenharmony_ci		entry = pte_mkuffd_wp(entry);
421562306a36Sopenharmony_ci	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
421662306a36Sopenharmony_ci
421762306a36Sopenharmony_ci	/* No need to invalidate - it was non-present before */
421862306a36Sopenharmony_ci	update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
421962306a36Sopenharmony_ciunlock:
422062306a36Sopenharmony_ci	if (vmf->pte)
422162306a36Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
422262306a36Sopenharmony_ci	return ret;
422362306a36Sopenharmony_cirelease:
422462306a36Sopenharmony_ci	folio_put(folio);
422562306a36Sopenharmony_ci	goto unlock;
422662306a36Sopenharmony_cioom_free_page:
422762306a36Sopenharmony_ci	folio_put(folio);
422862306a36Sopenharmony_cioom:
422962306a36Sopenharmony_ci	return VM_FAULT_OOM;
423062306a36Sopenharmony_ci}
423162306a36Sopenharmony_ci
423262306a36Sopenharmony_ci/*
423362306a36Sopenharmony_ci * The mmap_lock must have been held on entry, and may have been
423462306a36Sopenharmony_ci * released depending on flags and vma->vm_ops->fault() return value.
423562306a36Sopenharmony_ci * See filemap_fault() and __folio_lock_or_retry().
423662306a36Sopenharmony_ci */
423762306a36Sopenharmony_cistatic vm_fault_t __do_fault(struct vm_fault *vmf)
423862306a36Sopenharmony_ci{
423962306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
424062306a36Sopenharmony_ci	vm_fault_t ret;
424162306a36Sopenharmony_ci
424262306a36Sopenharmony_ci	/*
424362306a36Sopenharmony_ci	 * Preallocate pte before we take page_lock because this might lead to
424462306a36Sopenharmony_ci	 * deadlocks for memcg reclaim which waits for pages under writeback:
424562306a36Sopenharmony_ci	 *				lock_page(A)
424662306a36Sopenharmony_ci	 *				SetPageWriteback(A)
424762306a36Sopenharmony_ci	 *				unlock_page(A)
424862306a36Sopenharmony_ci	 * lock_page(B)
424962306a36Sopenharmony_ci	 *				lock_page(B)
425062306a36Sopenharmony_ci	 * pte_alloc_one
425162306a36Sopenharmony_ci	 *   shrink_page_list
425262306a36Sopenharmony_ci	 *     wait_on_page_writeback(A)
425362306a36Sopenharmony_ci	 *				SetPageWriteback(B)
425462306a36Sopenharmony_ci	 *				unlock_page(B)
425562306a36Sopenharmony_ci	 *				# flush A, B to clear the writeback
425662306a36Sopenharmony_ci	 */
425762306a36Sopenharmony_ci	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
425862306a36Sopenharmony_ci		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
425962306a36Sopenharmony_ci		if (!vmf->prealloc_pte)
426062306a36Sopenharmony_ci			return VM_FAULT_OOM;
426162306a36Sopenharmony_ci	}
426262306a36Sopenharmony_ci
426362306a36Sopenharmony_ci	ret = vma->vm_ops->fault(vmf);
426462306a36Sopenharmony_ci	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
426562306a36Sopenharmony_ci			    VM_FAULT_DONE_COW)))
426662306a36Sopenharmony_ci		return ret;
426762306a36Sopenharmony_ci
426862306a36Sopenharmony_ci	if (unlikely(PageHWPoison(vmf->page))) {
426962306a36Sopenharmony_ci		struct page *page = vmf->page;
427062306a36Sopenharmony_ci		vm_fault_t poisonret = VM_FAULT_HWPOISON;
427162306a36Sopenharmony_ci		if (ret & VM_FAULT_LOCKED) {
427262306a36Sopenharmony_ci			if (page_mapped(page))
427362306a36Sopenharmony_ci				unmap_mapping_pages(page_mapping(page),
427462306a36Sopenharmony_ci						    page->index, 1, false);
427562306a36Sopenharmony_ci			/* Retry if a clean page was removed from the cache. */
427662306a36Sopenharmony_ci			if (invalidate_inode_page(page))
427762306a36Sopenharmony_ci				poisonret = VM_FAULT_NOPAGE;
427862306a36Sopenharmony_ci			unlock_page(page);
427962306a36Sopenharmony_ci		}
428062306a36Sopenharmony_ci		put_page(page);
428162306a36Sopenharmony_ci		vmf->page = NULL;
428262306a36Sopenharmony_ci		return poisonret;
428362306a36Sopenharmony_ci	}
428462306a36Sopenharmony_ci
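	/* ->fault() may not have locked the page; lock it now, or sanity-check that it did. */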
428562306a36Sopenharmony_ci	if (unlikely(!(ret & VM_FAULT_LOCKED)))
428662306a36Sopenharmony_ci		lock_page(vmf->page);
428762306a36Sopenharmony_ci	else
428862306a36Sopenharmony_ci		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
428962306a36Sopenharmony_ci
429062306a36Sopenharmony_ci	return ret;
429162306a36Sopenharmony_ci}
429262306a36Sopenharmony_ci
429362306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
429462306a36Sopenharmony_cistatic void deposit_prealloc_pte(struct vm_fault *vmf)
429562306a36Sopenharmony_ci{
429662306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
429762306a36Sopenharmony_ci
429862306a36Sopenharmony_ci	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
429962306a36Sopenharmony_ci	/*
430062306a36Sopenharmony_ci	 * We are going to consume the prealloc table,
430162306a36Sopenharmony_ci	 * count that as nr_ptes.
430262306a36Sopenharmony_ci	 */
430362306a36Sopenharmony_ci	mm_inc_nr_ptes(vma->vm_mm);
430462306a36Sopenharmony_ci	vmf->prealloc_pte = NULL;
430562306a36Sopenharmony_ci}
430662306a36Sopenharmony_ci
430762306a36Sopenharmony_civm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
430862306a36Sopenharmony_ci{
430962306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
431062306a36Sopenharmony_ci	bool write = vmf->flags & FAULT_FLAG_WRITE;
431162306a36Sopenharmony_ci	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
431262306a36Sopenharmony_ci	pmd_t entry;
431362306a36Sopenharmony_ci	vm_fault_t ret = VM_FAULT_FALLBACK;
431462306a36Sopenharmony_ci
431562306a36Sopenharmony_ci	if (!transhuge_vma_suitable(vma, haddr))
431662306a36Sopenharmony_ci		return ret;
431762306a36Sopenharmony_ci
431862306a36Sopenharmony_ci	page = compound_head(page);
431962306a36Sopenharmony_ci	if (compound_order(page) != HPAGE_PMD_ORDER)
432062306a36Sopenharmony_ci		return ret;
432162306a36Sopenharmony_ci
432262306a36Sopenharmony_ci	/*
432362306a36Sopenharmony_ci	 * Just back off if any subpage of a THP is corrupted; otherwise
432462306a36Sopenharmony_ci	 * the corrupted page may be mapped by a PMD silently and escape the
432562306a36Sopenharmony_ci	 * check.  This kind of THP can only be PTE mapped.  Access to
432662306a36Sopenharmony_ci	 * the corrupted subpage should trigger SIGBUS as expected.
432762306a36Sopenharmony_ci	 */
432862306a36Sopenharmony_ci	if (unlikely(PageHasHWPoisoned(page)))
432962306a36Sopenharmony_ci		return ret;
433062306a36Sopenharmony_ci
433162306a36Sopenharmony_ci	/*
433262306a36Sopenharmony_ci	 * Archs like ppc64 need additional space to store information
433362306a36Sopenharmony_ci	 * related to pte entry. Use the preallocated table for that.
433462306a36Sopenharmony_ci	 */
433562306a36Sopenharmony_ci	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
433662306a36Sopenharmony_ci		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
433762306a36Sopenharmony_ci		if (!vmf->prealloc_pte)
433862306a36Sopenharmony_ci			return VM_FAULT_OOM;
433962306a36Sopenharmony_ci	}
434062306a36Sopenharmony_ci
434162306a36Sopenharmony_ci	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
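	/* Re-check under the PMD lock: another thread may have populated the PMD. */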
434262306a36Sopenharmony_ci	if (unlikely(!pmd_none(*vmf->pmd)))
434362306a36Sopenharmony_ci		goto out;
434462306a36Sopenharmony_ci
434562306a36Sopenharmony_ci	flush_icache_pages(vma, page, HPAGE_PMD_NR);
434662306a36Sopenharmony_ci
434762306a36Sopenharmony_ci	entry = mk_huge_pmd(page, vma->vm_page_prot);
434862306a36Sopenharmony_ci	if (write)
434962306a36Sopenharmony_ci		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
435062306a36Sopenharmony_ci
435162306a36Sopenharmony_ci	add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
435262306a36Sopenharmony_ci	page_add_file_rmap(page, vma, true);
435362306a36Sopenharmony_ci
435462306a36Sopenharmony_ci	/*
435562306a36Sopenharmony_ci	 * deposit and withdraw with pmd lock held
435662306a36Sopenharmony_ci	 */
435762306a36Sopenharmony_ci	if (arch_needs_pgtable_deposit())
435862306a36Sopenharmony_ci		deposit_prealloc_pte(vmf);
435962306a36Sopenharmony_ci
436062306a36Sopenharmony_ci	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
436162306a36Sopenharmony_ci
436262306a36Sopenharmony_ci	update_mmu_cache_pmd(vma, haddr, vmf->pmd);
436362306a36Sopenharmony_ci
436462306a36Sopenharmony_ci	/* fault is handled */
436562306a36Sopenharmony_ci	ret = 0;
436662306a36Sopenharmony_ci	count_vm_event(THP_FILE_MAPPED);
436762306a36Sopenharmony_ciout:
436862306a36Sopenharmony_ci	spin_unlock(vmf->ptl);
436962306a36Sopenharmony_ci	return ret;
437062306a36Sopenharmony_ci}
437162306a36Sopenharmony_ci#else
437262306a36Sopenharmony_civm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
437362306a36Sopenharmony_ci{
437462306a36Sopenharmony_ci	return VM_FAULT_FALLBACK;
437562306a36Sopenharmony_ci}
437662306a36Sopenharmony_ci#endif
437762306a36Sopenharmony_ci
437862306a36Sopenharmony_ci/**
437962306a36Sopenharmony_ci * set_pte_range - Set a range of PTEs to point to pages in a folio.
438062306a36Sopenharmony_ci * @vmf: Fault description.
438162306a36Sopenharmony_ci * @folio: The folio that contains @page.
438262306a36Sopenharmony_ci * @page: The first page to create a PTE for.
438362306a36Sopenharmony_ci * @nr: The number of PTEs to create.
438462306a36Sopenharmony_ci * @addr: The first address to create a PTE for.
438562306a36Sopenharmony_ci */
438662306a36Sopenharmony_civoid set_pte_range(struct vm_fault *vmf, struct folio *folio,
438762306a36Sopenharmony_ci		struct page *page, unsigned int nr, unsigned long addr)
438862306a36Sopenharmony_ci{
438962306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
439062306a36Sopenharmony_ci	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
439162306a36Sopenharmony_ci	bool write = vmf->flags & FAULT_FLAG_WRITE;
439262306a36Sopenharmony_ci	bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE);
439362306a36Sopenharmony_ci	pte_t entry;
439462306a36Sopenharmony_ci
439562306a36Sopenharmony_ci	flush_icache_pages(vma, page, nr);
439662306a36Sopenharmony_ci	entry = mk_pte(page, vma->vm_page_prot);
439762306a36Sopenharmony_ci
439862306a36Sopenharmony_ci	if (prefault && arch_wants_old_prefaulted_pte())
439962306a36Sopenharmony_ci		entry = pte_mkold(entry);
440062306a36Sopenharmony_ci	else
440162306a36Sopenharmony_ci		entry = pte_sw_mkyoung(entry);
440262306a36Sopenharmony_ci
440362306a36Sopenharmony_ci	if (write)
440462306a36Sopenharmony_ci		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
440562306a36Sopenharmony_ci	if (unlikely(uffd_wp))
440662306a36Sopenharmony_ci		entry = pte_mkuffd_wp(entry);
440762306a36Sopenharmony_ci	/* copy-on-write page */
440862306a36Sopenharmony_ci	if (write && !(vma->vm_flags & VM_SHARED)) {
440962306a36Sopenharmony_ci		add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
441062306a36Sopenharmony_ci		VM_BUG_ON_FOLIO(nr != 1, folio);
441162306a36Sopenharmony_ci		folio_add_new_anon_rmap(folio, vma, addr);
441262306a36Sopenharmony_ci		folio_add_lru_vma(folio, vma);
441362306a36Sopenharmony_ci	} else {
441462306a36Sopenharmony_ci		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
441562306a36Sopenharmony_ci		folio_add_file_rmap_range(folio, page, nr, vma, false);
441662306a36Sopenharmony_ci	}
441762306a36Sopenharmony_ci	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
441862306a36Sopenharmony_ci
441962306a36Sopenharmony_ci	/* no need to invalidate: a not-present page won't be cached */
442062306a36Sopenharmony_ci	update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr);
442162306a36Sopenharmony_ci}
442262306a36Sopenharmony_ci
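/*
 * Has the PTE changed under us?  If the original PTE was sampled
 * (FAULT_FLAG_ORIG_PTE_VALID), compare against it; otherwise any
 * non-none PTE counts as a change.
 */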
442362306a36Sopenharmony_cistatic bool vmf_pte_changed(struct vm_fault *vmf)
442462306a36Sopenharmony_ci{
442562306a36Sopenharmony_ci	if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
442662306a36Sopenharmony_ci		return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);
442762306a36Sopenharmony_ci
442862306a36Sopenharmony_ci	return !pte_none(ptep_get(vmf->pte));
442962306a36Sopenharmony_ci}
443062306a36Sopenharmony_ci
443162306a36Sopenharmony_ci/**
443262306a36Sopenharmony_ci * finish_fault - finish page fault once we have prepared the page to fault
443362306a36Sopenharmony_ci *
443462306a36Sopenharmony_ci * @vmf: structure describing the fault
443562306a36Sopenharmony_ci *
443662306a36Sopenharmony_ci * This function handles all that is needed to finish a page fault once the
443762306a36Sopenharmony_ci * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
443862306a36Sopenharmony_ci * given page, adds reverse page mapping, handles memcg charges and LRU
443962306a36Sopenharmony_ci * addition.
444062306a36Sopenharmony_ci *
444162306a36Sopenharmony_ci * The function expects the page to be locked and on success it consumes a
444262306a36Sopenharmony_ci * reference of a page being mapped (for the PTE which maps it).
444362306a36Sopenharmony_ci *
444462306a36Sopenharmony_ci * Return: %0 on success, %VM_FAULT_ code in case of error.
444562306a36Sopenharmony_ci */
444662306a36Sopenharmony_civm_fault_t finish_fault(struct vm_fault *vmf)
444762306a36Sopenharmony_ci{
444862306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
444962306a36Sopenharmony_ci	struct page *page;
445062306a36Sopenharmony_ci	vm_fault_t ret;
445162306a36Sopenharmony_ci
445262306a36Sopenharmony_ci	/* Did we COW the page? */
445362306a36Sopenharmony_ci	if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
445462306a36Sopenharmony_ci		page = vmf->cow_page;
445562306a36Sopenharmony_ci	else
445662306a36Sopenharmony_ci		page = vmf->page;
445762306a36Sopenharmony_ci
445862306a36Sopenharmony_ci	/*
445962306a36Sopenharmony_ci	 * check even for read faults because we might have lost our CoWed
446062306a36Sopenharmony_ci	 * page
446162306a36Sopenharmony_ci	 */
446262306a36Sopenharmony_ci	if (!(vma->vm_flags & VM_SHARED)) {
446362306a36Sopenharmony_ci		ret = check_stable_address_space(vma->vm_mm);
446462306a36Sopenharmony_ci		if (ret)
446562306a36Sopenharmony_ci			return ret;
446662306a36Sopenharmony_ci	}
446762306a36Sopenharmony_ci
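	/*
	 * No page table yet: try a PMD mapping for compound pages; if that
	 * is not possible, install the preallocated (or a fresh) PTE page
	 * table.
	 */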
446862306a36Sopenharmony_ci	if (pmd_none(*vmf->pmd)) {
446962306a36Sopenharmony_ci		if (PageTransCompound(page)) {
447062306a36Sopenharmony_ci			ret = do_set_pmd(vmf, page);
447162306a36Sopenharmony_ci			if (ret != VM_FAULT_FALLBACK)
447262306a36Sopenharmony_ci				return ret;
447362306a36Sopenharmony_ci		}
447462306a36Sopenharmony_ci
447562306a36Sopenharmony_ci		if (vmf->prealloc_pte)
447662306a36Sopenharmony_ci			pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
447762306a36Sopenharmony_ci		else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
447862306a36Sopenharmony_ci			return VM_FAULT_OOM;
447962306a36Sopenharmony_ci	}
448062306a36Sopenharmony_ci
448162306a36Sopenharmony_ci	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
448262306a36Sopenharmony_ci				      vmf->address, &vmf->ptl);
448362306a36Sopenharmony_ci	if (!vmf->pte)
448462306a36Sopenharmony_ci		return VM_FAULT_NOPAGE;
448562306a36Sopenharmony_ci
448662306a36Sopenharmony_ci	/* Re-check under ptl */
448762306a36Sopenharmony_ci	if (likely(!vmf_pte_changed(vmf))) {
448862306a36Sopenharmony_ci		struct folio *folio = page_folio(page);
448962306a36Sopenharmony_ci
449062306a36Sopenharmony_ci		set_pte_range(vmf, folio, page, 1, vmf->address);
449162306a36Sopenharmony_ci		ret = 0;
449262306a36Sopenharmony_ci	} else {
449362306a36Sopenharmony_ci		update_mmu_tlb(vma, vmf->address, vmf->pte);
449462306a36Sopenharmony_ci		ret = VM_FAULT_NOPAGE;
449562306a36Sopenharmony_ci	}
449662306a36Sopenharmony_ci
449762306a36Sopenharmony_ci	pte_unmap_unlock(vmf->pte, vmf->ptl);
449862306a36Sopenharmony_ci	return ret;
449962306a36Sopenharmony_ci}
450062306a36Sopenharmony_ci
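/* Fault-around window size; the default is 64KB worth of pages. */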
450162306a36Sopenharmony_cistatic unsigned long fault_around_pages __read_mostly =
450262306a36Sopenharmony_ci	65536 >> PAGE_SHIFT;
450362306a36Sopenharmony_ci
450462306a36Sopenharmony_ci#ifdef CONFIG_DEBUG_FS
450562306a36Sopenharmony_cistatic int fault_around_bytes_get(void *data, u64 *val)
450662306a36Sopenharmony_ci{
450762306a36Sopenharmony_ci	*val = fault_around_pages << PAGE_SHIFT;
450862306a36Sopenharmony_ci	return 0;
450962306a36Sopenharmony_ci}
451062306a36Sopenharmony_ci
451162306a36Sopenharmony_ci/*
451262306a36Sopenharmony_ci * fault_around_bytes must be rounded down to the nearest page order as it's
451362306a36Sopenharmony_ci * what do_fault_around() expects to see.
451462306a36Sopenharmony_ci */
451562306a36Sopenharmony_cistatic int fault_around_bytes_set(void *data, u64 val)
451662306a36Sopenharmony_ci{
451762306a36Sopenharmony_ci	if (val / PAGE_SIZE > PTRS_PER_PTE)
451862306a36Sopenharmony_ci		return -EINVAL;
451962306a36Sopenharmony_ci
452062306a36Sopenharmony_ci	/*
452162306a36Sopenharmony_ci	 * The minimum value is 1 page, however this results in no fault-around
452262306a36Sopenharmony_ci	 * at all. See should_fault_around().
452362306a36Sopenharmony_ci	 */
452462306a36Sopenharmony_ci	fault_around_pages = max(rounddown_pow_of_two(val) >> PAGE_SHIFT, 1UL);
452562306a36Sopenharmony_ci
452662306a36Sopenharmony_ci	return 0;
452762306a36Sopenharmony_ci}
452862306a36Sopenharmony_ciDEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
452962306a36Sopenharmony_ci		fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
453062306a36Sopenharmony_ci
453162306a36Sopenharmony_cistatic int __init fault_around_debugfs(void)
453262306a36Sopenharmony_ci{
453362306a36Sopenharmony_ci	debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
453462306a36Sopenharmony_ci				   &fault_around_bytes_fops);
453562306a36Sopenharmony_ci	return 0;
453662306a36Sopenharmony_ci}
453762306a36Sopenharmony_cilate_initcall(fault_around_debugfs);
453862306a36Sopenharmony_ci#endif
453962306a36Sopenharmony_ci
454062306a36Sopenharmony_ci/*
454162306a36Sopenharmony_ci * do_fault_around() tries to map a few pages around the fault address. The hope
454262306a36Sopenharmony_ci * is that the pages will be needed soon and this will lower the number of
454362306a36Sopenharmony_ci * faults to handle.
454462306a36Sopenharmony_ci *
454562306a36Sopenharmony_ci * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
454662306a36Sopenharmony_ci * not ready to be mapped: not up-to-date, locked, etc.
454762306a36Sopenharmony_ci *
454862306a36Sopenharmony_ci * This function doesn't cross VMA or page table boundaries, in order to call
454962306a36Sopenharmony_ci * map_pages() and acquire a PTE lock only once.
455062306a36Sopenharmony_ci *
455162306a36Sopenharmony_ci * fault_around_pages defines how many pages we'll try to map.
455262306a36Sopenharmony_ci * do_fault_around() expects it to be set to a power of two less than or equal
455362306a36Sopenharmony_ci * to PTRS_PER_PTE.
455462306a36Sopenharmony_ci *
455562306a36Sopenharmony_ci * The virtual address of the area that we map is naturally aligned to
455662306a36Sopenharmony_ci * fault_around_pages * PAGE_SIZE rounded down to the machine page size
455762306a36Sopenharmony_ci * (and therefore to page order).  This way it's easier to guarantee
455862306a36Sopenharmony_ci * that we don't cross page table boundaries.
455962306a36Sopenharmony_ci */
456062306a36Sopenharmony_cistatic vm_fault_t do_fault_around(struct vm_fault *vmf)
456162306a36Sopenharmony_ci{
456262306a36Sopenharmony_ci	pgoff_t nr_pages = READ_ONCE(fault_around_pages);
456362306a36Sopenharmony_ci	pgoff_t pte_off = pte_index(vmf->address);
456462306a36Sopenharmony_ci	/* The page offset of vmf->address within the VMA. */
456562306a36Sopenharmony_ci	pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
456662306a36Sopenharmony_ci	pgoff_t from_pte, to_pte;
456762306a36Sopenharmony_ci	vm_fault_t ret;
456862306a36Sopenharmony_ci
456962306a36Sopenharmony_ci	/* The PTE offset of the start address, clamped to the VMA. */
457062306a36Sopenharmony_ci	from_pte = max(ALIGN_DOWN(pte_off, nr_pages),
457162306a36Sopenharmony_ci		       pte_off - min(pte_off, vma_off));
457262306a36Sopenharmony_ci
457362306a36Sopenharmony_ci	/* The PTE offset of the end address, clamped to the VMA and PTE. */
457462306a36Sopenharmony_ci	to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE,
457562306a36Sopenharmony_ci		      pte_off + vma_pages(vmf->vma) - vma_off) - 1;
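	/*
	 * For example, with nr_pages == 16 and a fault at PTE index 35, a
	 * VMA covering the whole aligned window yields PTE indices 32..47.
	 */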
457662306a36Sopenharmony_ci
457762306a36Sopenharmony_ci	if (pmd_none(*vmf->pmd)) {
457862306a36Sopenharmony_ci		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
457962306a36Sopenharmony_ci		if (!vmf->prealloc_pte)
458062306a36Sopenharmony_ci			return VM_FAULT_OOM;
458162306a36Sopenharmony_ci	}
458262306a36Sopenharmony_ci
458362306a36Sopenharmony_ci	rcu_read_lock();
458462306a36Sopenharmony_ci	ret = vmf->vma->vm_ops->map_pages(vmf,
458562306a36Sopenharmony_ci			vmf->pgoff + from_pte - pte_off,
458662306a36Sopenharmony_ci			vmf->pgoff + to_pte - pte_off);
458762306a36Sopenharmony_ci	rcu_read_unlock();
458862306a36Sopenharmony_ci
458962306a36Sopenharmony_ci	return ret;
459062306a36Sopenharmony_ci}
459162306a36Sopenharmony_ci
459262306a36Sopenharmony_ci/* Return true if we should do read fault-around, false otherwise */
459362306a36Sopenharmony_cistatic inline bool should_fault_around(struct vm_fault *vmf)
459462306a36Sopenharmony_ci{
459562306a36Sopenharmony_ci	/* No ->map_pages?  No way to fault around... */
459662306a36Sopenharmony_ci	if (!vmf->vma->vm_ops->map_pages)
459762306a36Sopenharmony_ci		return false;
459862306a36Sopenharmony_ci
459962306a36Sopenharmony_ci	if (uffd_disable_fault_around(vmf->vma))
460062306a36Sopenharmony_ci		return false;
460162306a36Sopenharmony_ci
460262306a36Sopenharmony_ci	/* A single page implies no faulting 'around' at all. */
460362306a36Sopenharmony_ci	return fault_around_pages > 1;
460462306a36Sopenharmony_ci}
460562306a36Sopenharmony_ci
460662306a36Sopenharmony_cistatic vm_fault_t do_read_fault(struct vm_fault *vmf)
460762306a36Sopenharmony_ci{
460862306a36Sopenharmony_ci	vm_fault_t ret = 0;
460962306a36Sopenharmony_ci	struct folio *folio;
461062306a36Sopenharmony_ci
461162306a36Sopenharmony_ci	/*
461262306a36Sopenharmony_ci	 * Let's call ->map_pages() first and use ->fault() as fallback
461362306a36Sopenharmony_ci	 * if the page at that offset is not ready to be mapped (cold cache or
461462306a36Sopenharmony_ci	 * something).
461562306a36Sopenharmony_ci	 */
461662306a36Sopenharmony_ci	if (should_fault_around(vmf)) {
461762306a36Sopenharmony_ci		ret = do_fault_around(vmf);
461862306a36Sopenharmony_ci		if (ret)
461962306a36Sopenharmony_ci			return ret;
462062306a36Sopenharmony_ci	}
462162306a36Sopenharmony_ci
462262306a36Sopenharmony_ci	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
462362306a36Sopenharmony_ci		vma_end_read(vmf->vma);
462462306a36Sopenharmony_ci		return VM_FAULT_RETRY;
462562306a36Sopenharmony_ci	}
462662306a36Sopenharmony_ci
462762306a36Sopenharmony_ci	ret = __do_fault(vmf);
462862306a36Sopenharmony_ci	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
462962306a36Sopenharmony_ci		return ret;
463062306a36Sopenharmony_ci
463162306a36Sopenharmony_ci	ret |= finish_fault(vmf);
463262306a36Sopenharmony_ci	folio = page_folio(vmf->page);
463362306a36Sopenharmony_ci	folio_unlock(folio);
463462306a36Sopenharmony_ci	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
463562306a36Sopenharmony_ci		folio_put(folio);
463662306a36Sopenharmony_ci	return ret;
463762306a36Sopenharmony_ci}
463862306a36Sopenharmony_ci
463962306a36Sopenharmony_cistatic vm_fault_t do_cow_fault(struct vm_fault *vmf)
464062306a36Sopenharmony_ci{
464162306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
464262306a36Sopenharmony_ci	vm_fault_t ret;
464362306a36Sopenharmony_ci
464462306a36Sopenharmony_ci	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
464562306a36Sopenharmony_ci		vma_end_read(vma);
464662306a36Sopenharmony_ci		return VM_FAULT_RETRY;
464762306a36Sopenharmony_ci	}
464862306a36Sopenharmony_ci
464962306a36Sopenharmony_ci	if (unlikely(anon_vma_prepare(vma)))
465062306a36Sopenharmony_ci		return VM_FAULT_OOM;
465162306a36Sopenharmony_ci
465262306a36Sopenharmony_ci	vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
465362306a36Sopenharmony_ci	if (!vmf->cow_page)
465462306a36Sopenharmony_ci		return VM_FAULT_OOM;
465562306a36Sopenharmony_ci
465662306a36Sopenharmony_ci	if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm,
465762306a36Sopenharmony_ci				GFP_KERNEL)) {
465862306a36Sopenharmony_ci		put_page(vmf->cow_page);
465962306a36Sopenharmony_ci		return VM_FAULT_OOM;
466062306a36Sopenharmony_ci	}
466162306a36Sopenharmony_ci	folio_throttle_swaprate(page_folio(vmf->cow_page), GFP_KERNEL);
466262306a36Sopenharmony_ci
466362306a36Sopenharmony_ci	ret = __do_fault(vmf);
466462306a36Sopenharmony_ci	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
466562306a36Sopenharmony_ci		goto uncharge_out;
466662306a36Sopenharmony_ci	if (ret & VM_FAULT_DONE_COW)
466762306a36Sopenharmony_ci		return ret;
466862306a36Sopenharmony_ci
466962306a36Sopenharmony_ci	copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
467062306a36Sopenharmony_ci	__SetPageUptodate(vmf->cow_page);
467162306a36Sopenharmony_ci
467262306a36Sopenharmony_ci	ret |= finish_fault(vmf);
467362306a36Sopenharmony_ci	unlock_page(vmf->page);
467462306a36Sopenharmony_ci	put_page(vmf->page);
467562306a36Sopenharmony_ci	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
467662306a36Sopenharmony_ci		goto uncharge_out;
467762306a36Sopenharmony_ci	return ret;
467862306a36Sopenharmony_ciuncharge_out:
467962306a36Sopenharmony_ci	put_page(vmf->cow_page);
468062306a36Sopenharmony_ci	return ret;
468162306a36Sopenharmony_ci}
468262306a36Sopenharmony_ci
468362306a36Sopenharmony_cistatic vm_fault_t do_shared_fault(struct vm_fault *vmf)
468462306a36Sopenharmony_ci{
468562306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
468662306a36Sopenharmony_ci	vm_fault_t ret, tmp;
468762306a36Sopenharmony_ci	struct folio *folio;
468862306a36Sopenharmony_ci
468962306a36Sopenharmony_ci	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
469062306a36Sopenharmony_ci		vma_end_read(vma);
469162306a36Sopenharmony_ci		return VM_FAULT_RETRY;
469262306a36Sopenharmony_ci	}
469362306a36Sopenharmony_ci
469462306a36Sopenharmony_ci	ret = __do_fault(vmf);
469562306a36Sopenharmony_ci	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
469662306a36Sopenharmony_ci		return ret;
469762306a36Sopenharmony_ci
469862306a36Sopenharmony_ci	folio = page_folio(vmf->page);
469962306a36Sopenharmony_ci
470062306a36Sopenharmony_ci	/*
470162306a36Sopenharmony_ci	 * Check if the backing address space wants to know that the page is
470262306a36Sopenharmony_ci	 * about to become writable
470362306a36Sopenharmony_ci	 */
470462306a36Sopenharmony_ci	if (vma->vm_ops->page_mkwrite) {
470562306a36Sopenharmony_ci		folio_unlock(folio);
470662306a36Sopenharmony_ci		tmp = do_page_mkwrite(vmf, folio);
470762306a36Sopenharmony_ci		if (unlikely(!tmp ||
470862306a36Sopenharmony_ci				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
470962306a36Sopenharmony_ci			folio_put(folio);
471062306a36Sopenharmony_ci			return tmp;
471162306a36Sopenharmony_ci		}
471262306a36Sopenharmony_ci	}
471362306a36Sopenharmony_ci
471462306a36Sopenharmony_ci	ret |= finish_fault(vmf);
471562306a36Sopenharmony_ci	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
471662306a36Sopenharmony_ci					VM_FAULT_RETRY))) {
471762306a36Sopenharmony_ci		folio_unlock(folio);
471862306a36Sopenharmony_ci		folio_put(folio);
471962306a36Sopenharmony_ci		return ret;
472062306a36Sopenharmony_ci	}
472162306a36Sopenharmony_ci
472262306a36Sopenharmony_ci	ret |= fault_dirty_shared_page(vmf);
472362306a36Sopenharmony_ci	return ret;
472462306a36Sopenharmony_ci}
472562306a36Sopenharmony_ci
472662306a36Sopenharmony_ci/*
472762306a36Sopenharmony_ci * We enter with non-exclusive mmap_lock (to exclude vma changes,
472862306a36Sopenharmony_ci * but allow concurrent faults).
472962306a36Sopenharmony_ci * The mmap_lock may have been released depending on flags and our
473062306a36Sopenharmony_ci * return value.  See filemap_fault() and __folio_lock_or_retry().
473162306a36Sopenharmony_ci * If mmap_lock is released, vma may become invalid (for example
473262306a36Sopenharmony_ci * by other thread calling munmap()).
473362306a36Sopenharmony_ci */
473462306a36Sopenharmony_cistatic vm_fault_t do_fault(struct vm_fault *vmf)
473562306a36Sopenharmony_ci{
473662306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
473762306a36Sopenharmony_ci	struct mm_struct *vm_mm = vma->vm_mm;
473862306a36Sopenharmony_ci	vm_fault_t ret;
473962306a36Sopenharmony_ci
474062306a36Sopenharmony_ci	/*
474162306a36Sopenharmony_ci	 * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
474262306a36Sopenharmony_ci	 */
474362306a36Sopenharmony_ci	if (!vma->vm_ops->fault) {
474462306a36Sopenharmony_ci		vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
474562306a36Sopenharmony_ci					       vmf->address, &vmf->ptl);
474662306a36Sopenharmony_ci		if (unlikely(!vmf->pte))
474762306a36Sopenharmony_ci			ret = VM_FAULT_SIGBUS;
474862306a36Sopenharmony_ci		else {
474962306a36Sopenharmony_ci			/*
475062306a36Sopenharmony_ci			 * Make sure this is not a temporary clearing of the pte
475162306a36Sopenharmony_ci			 * by holding the ptl and checking again. A R/M/W update
475262306a36Sopenharmony_ci			 * of the pte involves: taking the ptl, clearing the pte
475362306a36Sopenharmony_ci			 * so that we don't have concurrent modification by
475462306a36Sopenharmony_ci			 * hardware, followed by an update.
475562306a36Sopenharmony_ci			 */
475662306a36Sopenharmony_ci			if (unlikely(pte_none(ptep_get(vmf->pte))))
475762306a36Sopenharmony_ci				ret = VM_FAULT_SIGBUS;
475862306a36Sopenharmony_ci			else
475962306a36Sopenharmony_ci				ret = VM_FAULT_NOPAGE;
476062306a36Sopenharmony_ci
476162306a36Sopenharmony_ci			pte_unmap_unlock(vmf->pte, vmf->ptl);
476262306a36Sopenharmony_ci		}
476362306a36Sopenharmony_ci	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
476462306a36Sopenharmony_ci		ret = do_read_fault(vmf);
476562306a36Sopenharmony_ci	else if (!(vma->vm_flags & VM_SHARED))
476662306a36Sopenharmony_ci		ret = do_cow_fault(vmf);
476762306a36Sopenharmony_ci	else
476862306a36Sopenharmony_ci		ret = do_shared_fault(vmf);
476962306a36Sopenharmony_ci
477062306a36Sopenharmony_ci	/* preallocated pagetable is unused: free it */
477162306a36Sopenharmony_ci	if (vmf->prealloc_pte) {
477262306a36Sopenharmony_ci		pte_free(vm_mm, vmf->prealloc_pte);
477362306a36Sopenharmony_ci		vmf->prealloc_pte = NULL;
477462306a36Sopenharmony_ci	}
477562306a36Sopenharmony_ci	return ret;
477662306a36Sopenharmony_ci}
477762306a36Sopenharmony_ci
477862306a36Sopenharmony_ciint numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
477962306a36Sopenharmony_ci		      unsigned long addr, int page_nid, int *flags)
478062306a36Sopenharmony_ci{
478162306a36Sopenharmony_ci	get_page(page);
478262306a36Sopenharmony_ci
478362306a36Sopenharmony_ci	/* Record the current PID accessing the VMA */
478462306a36Sopenharmony_ci	vma_set_access_pid_bit(vma);
478562306a36Sopenharmony_ci
478662306a36Sopenharmony_ci	count_vm_numa_event(NUMA_HINT_FAULTS);
478762306a36Sopenharmony_ci	if (page_nid == numa_node_id()) {
478862306a36Sopenharmony_ci		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
478962306a36Sopenharmony_ci		*flags |= TNF_FAULT_LOCAL;
479062306a36Sopenharmony_ci	}
479162306a36Sopenharmony_ci
479262306a36Sopenharmony_ci	return mpol_misplaced(page, vma, addr);
479362306a36Sopenharmony_ci}
479462306a36Sopenharmony_ci
479562306a36Sopenharmony_cistatic vm_fault_t do_numa_page(struct vm_fault *vmf)
479662306a36Sopenharmony_ci{
479762306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
479862306a36Sopenharmony_ci	struct page *page = NULL;
479962306a36Sopenharmony_ci	int page_nid = NUMA_NO_NODE;
480062306a36Sopenharmony_ci	bool writable = false;
480162306a36Sopenharmony_ci	int last_cpupid;
480262306a36Sopenharmony_ci	int target_nid;
480362306a36Sopenharmony_ci	pte_t pte, old_pte;
480462306a36Sopenharmony_ci	int flags = 0;
480562306a36Sopenharmony_ci
480662306a36Sopenharmony_ci	/*
480762306a36Sopenharmony_ci	 * The "pte" at this point cannot be used safely without
480862306a36Sopenharmony_ci	 * validation through pte_unmap_same(). It's of NUMA type but
480962306a36Sopenharmony_ci	 * the pfn may be screwed if the read is non-atomic.
481062306a36Sopenharmony_ci	 */
481162306a36Sopenharmony_ci	spin_lock(vmf->ptl);
481262306a36Sopenharmony_ci	if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
481362306a36Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
481462306a36Sopenharmony_ci		goto out;
481562306a36Sopenharmony_ci	}
481662306a36Sopenharmony_ci
481762306a36Sopenharmony_ci	/* Get the normal PTE  */
481862306a36Sopenharmony_ci	old_pte = ptep_get(vmf->pte);
481962306a36Sopenharmony_ci	pte = pte_modify(old_pte, vma->vm_page_prot);
482062306a36Sopenharmony_ci
482162306a36Sopenharmony_ci	/*
482262306a36Sopenharmony_ci	 * Detect now whether the PTE could be writable; this information
482362306a36Sopenharmony_ci	 * is only valid while holding the PT lock.
482462306a36Sopenharmony_ci	 */
482562306a36Sopenharmony_ci	writable = pte_write(pte);
482662306a36Sopenharmony_ci	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
482762306a36Sopenharmony_ci	    can_change_pte_writable(vma, vmf->address, pte))
482862306a36Sopenharmony_ci		writable = true;
482962306a36Sopenharmony_ci
483062306a36Sopenharmony_ci	page = vm_normal_page(vma, vmf->address, pte);
483162306a36Sopenharmony_ci	if (!page || is_zone_device_page(page))
483262306a36Sopenharmony_ci		goto out_map;
483362306a36Sopenharmony_ci
483462306a36Sopenharmony_ci	/* TODO: handle PTE-mapped THP */
483562306a36Sopenharmony_ci	if (PageCompound(page))
483662306a36Sopenharmony_ci		goto out_map;
483762306a36Sopenharmony_ci
483862306a36Sopenharmony_ci	/*
483962306a36Sopenharmony_ci	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
484062306a36Sopenharmony_ci	 * much anyway since they can be in shared cache state. This misses
484162306a36Sopenharmony_ci	 * the case where a mapping is writable but the process never writes
484262306a36Sopenharmony_ci	 * to it but pte_write gets cleared during protection updates and
484362306a36Sopenharmony_ci	 * pte_dirty has unpredictable behaviour between PTE scan updates,
484462306a36Sopenharmony_ci	 * background writeback, dirty balancing and application behaviour.
484562306a36Sopenharmony_ci	 */
484662306a36Sopenharmony_ci	if (!writable)
484762306a36Sopenharmony_ci		flags |= TNF_NO_GROUP;
484862306a36Sopenharmony_ci
484962306a36Sopenharmony_ci	/*
485062306a36Sopenharmony_ci	 * Flag if the page is shared between multiple address spaces. This
485162306a36Sopenharmony_ci	 * is later used when determining whether to group tasks together
485262306a36Sopenharmony_ci	 */
485362306a36Sopenharmony_ci	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
485462306a36Sopenharmony_ci		flags |= TNF_SHARED;
485562306a36Sopenharmony_ci
485662306a36Sopenharmony_ci	page_nid = page_to_nid(page);
485762306a36Sopenharmony_ci	/*
485862306a36Sopenharmony_ci	 * For memory tiering mode, cpupid of slow memory page is used
485962306a36Sopenharmony_ci	 * to record page access time.  So use default value.
486062306a36Sopenharmony_ci	 */
486162306a36Sopenharmony_ci	if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
486262306a36Sopenharmony_ci	    !node_is_toptier(page_nid))
486362306a36Sopenharmony_ci		last_cpupid = (-1 & LAST_CPUPID_MASK);
486462306a36Sopenharmony_ci	else
486562306a36Sopenharmony_ci		last_cpupid = page_cpupid_last(page);
486662306a36Sopenharmony_ci	target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
486762306a36Sopenharmony_ci			&flags);
486862306a36Sopenharmony_ci	if (target_nid == NUMA_NO_NODE) {
486962306a36Sopenharmony_ci		put_page(page);
487062306a36Sopenharmony_ci		goto out_map;
487162306a36Sopenharmony_ci	}
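	/* The writable hint is only valid under the PT lock, which we drop here. */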
487262306a36Sopenharmony_ci	pte_unmap_unlock(vmf->pte, vmf->ptl);
487362306a36Sopenharmony_ci	writable = false;
487462306a36Sopenharmony_ci
487562306a36Sopenharmony_ci	/* Migrate to the requested node */
487662306a36Sopenharmony_ci	if (migrate_misplaced_page(page, vma, target_nid)) {
487762306a36Sopenharmony_ci		page_nid = target_nid;
487862306a36Sopenharmony_ci		flags |= TNF_MIGRATED;
487962306a36Sopenharmony_ci	} else {
488062306a36Sopenharmony_ci		flags |= TNF_MIGRATE_FAIL;
488162306a36Sopenharmony_ci		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
488262306a36Sopenharmony_ci					       vmf->address, &vmf->ptl);
488362306a36Sopenharmony_ci		if (unlikely(!vmf->pte))
488462306a36Sopenharmony_ci			goto out;
488562306a36Sopenharmony_ci		if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
488662306a36Sopenharmony_ci			pte_unmap_unlock(vmf->pte, vmf->ptl);
488762306a36Sopenharmony_ci			goto out;
488862306a36Sopenharmony_ci		}
488962306a36Sopenharmony_ci		goto out_map;
489062306a36Sopenharmony_ci	}
489162306a36Sopenharmony_ci
489262306a36Sopenharmony_ciout:
489362306a36Sopenharmony_ci	if (page_nid != NUMA_NO_NODE)
489462306a36Sopenharmony_ci		task_numa_fault(last_cpupid, page_nid, 1, flags);
489562306a36Sopenharmony_ci	return 0;
489662306a36Sopenharmony_ciout_map:
489762306a36Sopenharmony_ci	/*
489862306a36Sopenharmony_ci	 * Make it present again. Depending on how the arch implements
489962306a36Sopenharmony_ci	 * non-accessible ptes, some may still allow access from kernel mode.
490062306a36Sopenharmony_ci	 */
490162306a36Sopenharmony_ci	old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
490262306a36Sopenharmony_ci	pte = pte_modify(old_pte, vma->vm_page_prot);
490362306a36Sopenharmony_ci	pte = pte_mkyoung(pte);
490462306a36Sopenharmony_ci	if (writable)
490562306a36Sopenharmony_ci		pte = pte_mkwrite(pte, vma);
490662306a36Sopenharmony_ci	ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
490762306a36Sopenharmony_ci	update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
490862306a36Sopenharmony_ci	pte_unmap_unlock(vmf->pte, vmf->ptl);
490962306a36Sopenharmony_ci	goto out;
491062306a36Sopenharmony_ci}
491162306a36Sopenharmony_ci
491262306a36Sopenharmony_cistatic inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
491362306a36Sopenharmony_ci{
491462306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
491562306a36Sopenharmony_ci	if (vma_is_anonymous(vma))
491662306a36Sopenharmony_ci		return do_huge_pmd_anonymous_page(vmf);
491762306a36Sopenharmony_ci	if (vma->vm_ops->huge_fault)
491862306a36Sopenharmony_ci		return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
491962306a36Sopenharmony_ci	return VM_FAULT_FALLBACK;
492062306a36Sopenharmony_ci}
492162306a36Sopenharmony_ci
492262306a36Sopenharmony_ci/* `inline' is required to avoid gcc 4.1.2 build error */
492362306a36Sopenharmony_cistatic inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
492462306a36Sopenharmony_ci{
492562306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
492662306a36Sopenharmony_ci	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
492762306a36Sopenharmony_ci	vm_fault_t ret;
492862306a36Sopenharmony_ci
492962306a36Sopenharmony_ci	if (vma_is_anonymous(vma)) {
493062306a36Sopenharmony_ci		if (likely(!unshare) &&
493162306a36Sopenharmony_ci		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd))
493262306a36Sopenharmony_ci			return handle_userfault(vmf, VM_UFFD_WP);
493362306a36Sopenharmony_ci		return do_huge_pmd_wp_page(vmf);
493462306a36Sopenharmony_ci	}
493562306a36Sopenharmony_ci
493662306a36Sopenharmony_ci	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
493762306a36Sopenharmony_ci		if (vma->vm_ops->huge_fault) {
493862306a36Sopenharmony_ci			ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
493962306a36Sopenharmony_ci			if (!(ret & VM_FAULT_FALLBACK))
494062306a36Sopenharmony_ci				return ret;
494162306a36Sopenharmony_ci		}
494262306a36Sopenharmony_ci	}
494362306a36Sopenharmony_ci
494462306a36Sopenharmony_ci	/* COW or write-notify handled on pte level: split pmd. */
494562306a36Sopenharmony_ci	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
494662306a36Sopenharmony_ci
494762306a36Sopenharmony_ci	return VM_FAULT_FALLBACK;
494862306a36Sopenharmony_ci}
494962306a36Sopenharmony_ci
495062306a36Sopenharmony_cistatic vm_fault_t create_huge_pud(struct vm_fault *vmf)
495162306a36Sopenharmony_ci{
495262306a36Sopenharmony_ci#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&			\
495362306a36Sopenharmony_ci	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
495462306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
495562306a36Sopenharmony_ci	/* No support for anonymous transparent PUD pages yet */
495662306a36Sopenharmony_ci	if (vma_is_anonymous(vma))
495762306a36Sopenharmony_ci		return VM_FAULT_FALLBACK;
495862306a36Sopenharmony_ci	if (vma->vm_ops->huge_fault)
495962306a36Sopenharmony_ci		return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
496062306a36Sopenharmony_ci#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
496162306a36Sopenharmony_ci	return VM_FAULT_FALLBACK;
496262306a36Sopenharmony_ci}
496362306a36Sopenharmony_ci
496462306a36Sopenharmony_cistatic vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
496562306a36Sopenharmony_ci{
496662306a36Sopenharmony_ci#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&			\
496762306a36Sopenharmony_ci	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
496862306a36Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
496962306a36Sopenharmony_ci	vm_fault_t ret;
497062306a36Sopenharmony_ci
497162306a36Sopenharmony_ci	/* No support for anonymous transparent PUD pages yet */
497262306a36Sopenharmony_ci	if (vma_is_anonymous(vma))
497362306a36Sopenharmony_ci		goto split;
497462306a36Sopenharmony_ci	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
497562306a36Sopenharmony_ci		if (vma->vm_ops->huge_fault) {
497662306a36Sopenharmony_ci			ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
497762306a36Sopenharmony_ci			if (!(ret & VM_FAULT_FALLBACK))
497862306a36Sopenharmony_ci				return ret;
497962306a36Sopenharmony_ci		}
498062306a36Sopenharmony_ci	}
498162306a36Sopenharmony_cisplit:
498262306a36Sopenharmony_ci	/* COW or write-notify not handled on PUD level: split pud.*/
498362306a36Sopenharmony_ci	__split_huge_pud(vma, vmf->pud, vmf->address);
498462306a36Sopenharmony_ci#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
498562306a36Sopenharmony_ci	return VM_FAULT_FALLBACK;
498662306a36Sopenharmony_ci}
498762306a36Sopenharmony_ci
498862306a36Sopenharmony_ci/*
498962306a36Sopenharmony_ci * These routines also need to handle stuff like marking pages dirty
499062306a36Sopenharmony_ci * and/or accessed for architectures that don't do it in hardware (most
499162306a36Sopenharmony_ci * RISC architectures).  The early dirtying is also good on the i386.
499262306a36Sopenharmony_ci *
499362306a36Sopenharmony_ci * There is also a hook called "update_mmu_cache()" that architectures
499462306a36Sopenharmony_ci * with external mmu caches can use to update those (ie the Sparc or
499562306a36Sopenharmony_ci * PowerPC hashed page tables that act as extended TLBs).
499662306a36Sopenharmony_ci *
499762306a36Sopenharmony_ci * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
499862306a36Sopenharmony_ci * concurrent faults).
499962306a36Sopenharmony_ci *
500062306a36Sopenharmony_ci * The mmap_lock may have been released depending on flags and our return value.
500162306a36Sopenharmony_ci * See filemap_fault() and __folio_lock_or_retry().
500262306a36Sopenharmony_ci */
500362306a36Sopenharmony_cistatic vm_fault_t handle_pte_fault(struct vm_fault *vmf)
500462306a36Sopenharmony_ci{
500562306a36Sopenharmony_ci	pte_t entry;
500662306a36Sopenharmony_ci
500762306a36Sopenharmony_ci	if (unlikely(pmd_none(*vmf->pmd))) {
500862306a36Sopenharmony_ci		/*
500962306a36Sopenharmony_ci		 * Leave __pte_alloc() until later: because vm_ops->fault may
501062306a36Sopenharmony_ci		 * want to allocate huge page, and if we expose page table
501162306a36Sopenharmony_ci		 * for an instant, it will be difficult to retract from
501262306a36Sopenharmony_ci		 * concurrent faults and from rmap lookups.
501362306a36Sopenharmony_ci		 */
501462306a36Sopenharmony_ci		vmf->pte = NULL;
501562306a36Sopenharmony_ci		vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
501662306a36Sopenharmony_ci	} else {
501762306a36Sopenharmony_ci		/*
501862306a36Sopenharmony_ci		 * A regular pmd is established and it can't morph into a huge
501962306a36Sopenharmony_ci		 * pmd by anon khugepaged, since that takes mmap_lock in write
502062306a36Sopenharmony_ci		 * mode; but shmem or file collapse to THP could still morph
502162306a36Sopenharmony_ci		 * it into a huge pmd: just retry later if so.
502262306a36Sopenharmony_ci		 */
502362306a36Sopenharmony_ci		vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
502462306a36Sopenharmony_ci						 vmf->address, &vmf->ptl);
502562306a36Sopenharmony_ci		if (unlikely(!vmf->pte))
502662306a36Sopenharmony_ci			return 0;
502762306a36Sopenharmony_ci		vmf->orig_pte = ptep_get_lockless(vmf->pte);
502862306a36Sopenharmony_ci		vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
502962306a36Sopenharmony_ci
503062306a36Sopenharmony_ci		if (pte_none(vmf->orig_pte)) {
503162306a36Sopenharmony_ci			pte_unmap(vmf->pte);
503262306a36Sopenharmony_ci			vmf->pte = NULL;
503362306a36Sopenharmony_ci		}
503462306a36Sopenharmony_ci	}
503562306a36Sopenharmony_ci
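	/*
	 * Dispatch: a missing PTE faults in a fresh page, a non-present PTE
	 * goes through swap-in, and a prot-none PTE in an accessible VMA is
	 * a NUMA hinting fault.  Anything else is handled below.
	 */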
503662306a36Sopenharmony_ci	if (!vmf->pte)
503762306a36Sopenharmony_ci		return do_pte_missing(vmf);
503862306a36Sopenharmony_ci
503962306a36Sopenharmony_ci	if (!pte_present(vmf->orig_pte))
504062306a36Sopenharmony_ci		return do_swap_page(vmf);
504162306a36Sopenharmony_ci
504262306a36Sopenharmony_ci	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
504362306a36Sopenharmony_ci		return do_numa_page(vmf);
504462306a36Sopenharmony_ci
504562306a36Sopenharmony_ci	spin_lock(vmf->ptl);
504662306a36Sopenharmony_ci	entry = vmf->orig_pte;
504762306a36Sopenharmony_ci	if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
504862306a36Sopenharmony_ci		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
504962306a36Sopenharmony_ci		goto unlock;
505062306a36Sopenharmony_ci	}
505162306a36Sopenharmony_ci	if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
505262306a36Sopenharmony_ci		if (!pte_write(entry))
505362306a36Sopenharmony_ci			return do_wp_page(vmf);
505462306a36Sopenharmony_ci		else if (likely(vmf->flags & FAULT_FLAG_WRITE))
505562306a36Sopenharmony_ci			entry = pte_mkdirty(entry);
505662306a36Sopenharmony_ci	}
505762306a36Sopenharmony_ci	entry = pte_mkyoung(entry);
505862306a36Sopenharmony_ci	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
505962306a36Sopenharmony_ci				vmf->flags & FAULT_FLAG_WRITE)) {
506062306a36Sopenharmony_ci		update_mmu_cache_range(vmf, vmf->vma, vmf->address,
506162306a36Sopenharmony_ci				vmf->pte, 1);
506262306a36Sopenharmony_ci	} else {
506362306a36Sopenharmony_ci		/* Skip spurious TLB flush for retried page fault */
506462306a36Sopenharmony_ci		if (vmf->flags & FAULT_FLAG_TRIED)
506562306a36Sopenharmony_ci			goto unlock;
506662306a36Sopenharmony_ci		/*
506762306a36Sopenharmony_ci		 * This is needed only for protection faults but the arch code
506862306a36Sopenharmony_ci		 * is not yet telling us if this is a protection fault or not.
506962306a36Sopenharmony_ci		 * This still avoids useless tlb flushes for .text page faults
507062306a36Sopenharmony_ci		 * with threads.
507162306a36Sopenharmony_ci		 */
507262306a36Sopenharmony_ci		if (vmf->flags & FAULT_FLAG_WRITE)
507362306a36Sopenharmony_ci			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
507462306a36Sopenharmony_ci						     vmf->pte);
507562306a36Sopenharmony_ci	}
507662306a36Sopenharmony_ciunlock:
507762306a36Sopenharmony_ci	pte_unmap_unlock(vmf->pte, vmf->ptl);
507862306a36Sopenharmony_ci	return 0;
507962306a36Sopenharmony_ci}
508062306a36Sopenharmony_ci
508162306a36Sopenharmony_ci/*
508262306a36Sopenharmony_ci * On entry, we hold either the VMA lock or the mmap_lock
508362306a36Sopenharmony_ci * (FAULT_FLAG_VMA_LOCK tells you which).  If VM_FAULT_RETRY is set in
508462306a36Sopenharmony_ci * the result, the mmap_lock is not held on exit.  See filemap_fault()
508562306a36Sopenharmony_ci * and __folio_lock_or_retry().
508662306a36Sopenharmony_ci */
508762306a36Sopenharmony_cistatic vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
508862306a36Sopenharmony_ci		unsigned long address, unsigned int flags)
508962306a36Sopenharmony_ci{
509062306a36Sopenharmony_ci	struct vm_fault vmf = {
509162306a36Sopenharmony_ci		.vma = vma,
509262306a36Sopenharmony_ci		.address = address & PAGE_MASK,
509362306a36Sopenharmony_ci		.real_address = address,
509462306a36Sopenharmony_ci		.flags = flags,
509562306a36Sopenharmony_ci		.pgoff = linear_page_index(vma, address),
509662306a36Sopenharmony_ci		.gfp_mask = __get_fault_gfp_mask(vma),
509762306a36Sopenharmony_ci	};
509862306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
509962306a36Sopenharmony_ci	unsigned long vm_flags = vma->vm_flags;
510062306a36Sopenharmony_ci	pgd_t *pgd;
510162306a36Sopenharmony_ci	p4d_t *p4d;
510262306a36Sopenharmony_ci	vm_fault_t ret;
510362306a36Sopenharmony_ci
510462306a36Sopenharmony_ci	pgd = pgd_offset(mm, address);
510562306a36Sopenharmony_ci	p4d = p4d_alloc(mm, pgd, address);
510662306a36Sopenharmony_ci	if (!p4d)
510762306a36Sopenharmony_ci		return VM_FAULT_OOM;
510862306a36Sopenharmony_ci
510962306a36Sopenharmony_ci	vmf.pud = pud_alloc(mm, p4d, address);
511062306a36Sopenharmony_ci	if (!vmf.pud)
511162306a36Sopenharmony_ci		return VM_FAULT_OOM;
511262306a36Sopenharmony_ciretry_pud:
511362306a36Sopenharmony_ci	if (pud_none(*vmf.pud) &&
511462306a36Sopenharmony_ci	    hugepage_vma_check(vma, vm_flags, false, true, true)) {
511562306a36Sopenharmony_ci		ret = create_huge_pud(&vmf);
511662306a36Sopenharmony_ci		if (!(ret & VM_FAULT_FALLBACK))
511762306a36Sopenharmony_ci			return ret;
511862306a36Sopenharmony_ci	} else {
511962306a36Sopenharmony_ci		pud_t orig_pud = *vmf.pud;
512062306a36Sopenharmony_ci
512162306a36Sopenharmony_ci		barrier();
512262306a36Sopenharmony_ci		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
512362306a36Sopenharmony_ci
512462306a36Sopenharmony_ci			/*
512562306a36Sopenharmony_ci			 * TODO once we support anonymous PUDs: NUMA case and
512662306a36Sopenharmony_ci			 * FAULT_FLAG_UNSHARE handling.
512762306a36Sopenharmony_ci			 */
512862306a36Sopenharmony_ci			if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) {
512962306a36Sopenharmony_ci				ret = wp_huge_pud(&vmf, orig_pud);
513062306a36Sopenharmony_ci				if (!(ret & VM_FAULT_FALLBACK))
513162306a36Sopenharmony_ci					return ret;
513262306a36Sopenharmony_ci			} else {
513362306a36Sopenharmony_ci				huge_pud_set_accessed(&vmf, orig_pud);
513462306a36Sopenharmony_ci				return 0;
513562306a36Sopenharmony_ci			}
513662306a36Sopenharmony_ci		}
513762306a36Sopenharmony_ci	}
513862306a36Sopenharmony_ci
513962306a36Sopenharmony_ci	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
514062306a36Sopenharmony_ci	if (!vmf.pmd)
514162306a36Sopenharmony_ci		return VM_FAULT_OOM;
514262306a36Sopenharmony_ci
514362306a36Sopenharmony_ci	/* Huge pud page fault raced with pmd_alloc? */
514462306a36Sopenharmony_ci	if (pud_trans_unstable(vmf.pud))
514562306a36Sopenharmony_ci		goto retry_pud;
514662306a36Sopenharmony_ci
514762306a36Sopenharmony_ci	if (pmd_none(*vmf.pmd) &&
514862306a36Sopenharmony_ci	    hugepage_vma_check(vma, vm_flags, false, true, true)) {
514962306a36Sopenharmony_ci		ret = create_huge_pmd(&vmf);
515062306a36Sopenharmony_ci		if (!(ret & VM_FAULT_FALLBACK))
515162306a36Sopenharmony_ci			return ret;
515262306a36Sopenharmony_ci	} else {
515362306a36Sopenharmony_ci		vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
515462306a36Sopenharmony_ci
515562306a36Sopenharmony_ci		if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
515662306a36Sopenharmony_ci			VM_BUG_ON(thp_migration_supported() &&
515762306a36Sopenharmony_ci					  !is_pmd_migration_entry(vmf.orig_pmd));
515862306a36Sopenharmony_ci			if (is_pmd_migration_entry(vmf.orig_pmd))
515962306a36Sopenharmony_ci				pmd_migration_entry_wait(mm, vmf.pmd);
516062306a36Sopenharmony_ci			return 0;
516162306a36Sopenharmony_ci		}
516262306a36Sopenharmony_ci		if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
516362306a36Sopenharmony_ci			if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
516462306a36Sopenharmony_ci				return do_huge_pmd_numa_page(&vmf);
516562306a36Sopenharmony_ci
516662306a36Sopenharmony_ci			if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
516762306a36Sopenharmony_ci			    !pmd_write(vmf.orig_pmd)) {
516862306a36Sopenharmony_ci				ret = wp_huge_pmd(&vmf);
516962306a36Sopenharmony_ci				if (!(ret & VM_FAULT_FALLBACK))
517062306a36Sopenharmony_ci					return ret;
517162306a36Sopenharmony_ci			} else {
517262306a36Sopenharmony_ci				huge_pmd_set_accessed(&vmf);
517362306a36Sopenharmony_ci				return 0;
517462306a36Sopenharmony_ci			}
517562306a36Sopenharmony_ci		}
517662306a36Sopenharmony_ci	}
517762306a36Sopenharmony_ci
517862306a36Sopenharmony_ci	return handle_pte_fault(&vmf);
517962306a36Sopenharmony_ci}
518062306a36Sopenharmony_ci
518162306a36Sopenharmony_ci/**
518262306a36Sopenharmony_ci * mm_account_fault - Do page fault accounting
518362306a36Sopenharmony_ci * @mm: mm from which memcg should be extracted. It can be NULL.
518462306a36Sopenharmony_ci * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
518562306a36Sopenharmony_ci *        of perf event counters, but we'll still do the per-task accounting
518662306a36Sopenharmony_ci *        for the task that triggered this page fault.
518762306a36Sopenharmony_ci * @address: the faulted address.
518862306a36Sopenharmony_ci * @flags: the fault flags.
518962306a36Sopenharmony_ci * @ret: the fault retcode.
519062306a36Sopenharmony_ci *
519162306a36Sopenharmony_ci * This will take care of most of the page fault accounting.  Meanwhile, it
519262306a36Sopenharmony_ci * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
519362306a36Sopenharmony_ci * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
519462306a36Sopenharmony_ci * still be done in the per-arch page fault handlers, at the entry of each fault.
519562306a36Sopenharmony_ci */
519662306a36Sopenharmony_cistatic inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs,
519762306a36Sopenharmony_ci				    unsigned long address, unsigned int flags,
519862306a36Sopenharmony_ci				    vm_fault_t ret)
519962306a36Sopenharmony_ci{
520062306a36Sopenharmony_ci	bool major;
520162306a36Sopenharmony_ci
520262306a36Sopenharmony_ci	/* Incomplete faults will be accounted upon completion. */
520362306a36Sopenharmony_ci	if (ret & VM_FAULT_RETRY)
520462306a36Sopenharmony_ci		return;
520562306a36Sopenharmony_ci
520662306a36Sopenharmony_ci	/*
520762306a36Sopenharmony_ci	 * To preserve the behavior of older kernels, PGFAULT counters record
520862306a36Sopenharmony_ci	 * both successful and failed faults, as opposed to perf counters,
520962306a36Sopenharmony_ci	 * which ignore failed cases.
521062306a36Sopenharmony_ci	 */
521162306a36Sopenharmony_ci	count_vm_event(PGFAULT);
521262306a36Sopenharmony_ci	count_memcg_event_mm(mm, PGFAULT);
521362306a36Sopenharmony_ci
521462306a36Sopenharmony_ci	/*
521562306a36Sopenharmony_ci	 * Do not account for unsuccessful faults (e.g. when the address wasn't
521662306a36Sopenharmony_ci	 * valid).  That includes arch_vma_access_permitted() failing before
521762306a36Sopenharmony_ci	 * reaching here. So this is not a "this many hardware page faults"
521862306a36Sopenharmony_ci	 * counter.  Use hardware profiling for that instead.
521962306a36Sopenharmony_ci	 */
522062306a36Sopenharmony_ci	if (ret & VM_FAULT_ERROR)
522162306a36Sopenharmony_ci		return;
522262306a36Sopenharmony_ci
522362306a36Sopenharmony_ci	/*
522462306a36Sopenharmony_ci	 * We define the fault as a major fault when the final successful fault
522562306a36Sopenharmony_ci	 * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
522662306a36Sopenharmony_ci	 * handle it immediately previously).
522762306a36Sopenharmony_ci	 */
522862306a36Sopenharmony_ci	major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
522962306a36Sopenharmony_ci
523062306a36Sopenharmony_ci	if (major)
523162306a36Sopenharmony_ci		current->maj_flt++;
523262306a36Sopenharmony_ci	else
523362306a36Sopenharmony_ci		current->min_flt++;
523462306a36Sopenharmony_ci
523562306a36Sopenharmony_ci	/*
523662306a36Sopenharmony_ci	 * If the fault is done for GUP, regs will be NULL.  We only do the
523762306a36Sopenharmony_ci	 * per-thread fault accounting for the task that triggered the
523862306a36Sopenharmony_ci	 * fault, and we skip the perf event updates.
523962306a36Sopenharmony_ci	 */
524062306a36Sopenharmony_ci	if (!regs)
524162306a36Sopenharmony_ci		return;
524262306a36Sopenharmony_ci
524362306a36Sopenharmony_ci	if (major)
524462306a36Sopenharmony_ci		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
524562306a36Sopenharmony_ci	else
524662306a36Sopenharmony_ci		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
524762306a36Sopenharmony_ci}
524862306a36Sopenharmony_ci
524962306a36Sopenharmony_ci#ifdef CONFIG_LRU_GEN
525062306a36Sopenharmony_cistatic void lru_gen_enter_fault(struct vm_area_struct *vma)
525162306a36Sopenharmony_ci{
525262306a36Sopenharmony_ci	/* the LRU algorithm only applies to accesses with recency */
525362306a36Sopenharmony_ci	current->in_lru_fault = vma_has_recency(vma);
525462306a36Sopenharmony_ci}
525562306a36Sopenharmony_ci
525662306a36Sopenharmony_cistatic void lru_gen_exit_fault(void)
525762306a36Sopenharmony_ci{
525862306a36Sopenharmony_ci	current->in_lru_fault = false;
525962306a36Sopenharmony_ci}
526062306a36Sopenharmony_ci#else
526162306a36Sopenharmony_cistatic void lru_gen_enter_fault(struct vm_area_struct *vma)
526262306a36Sopenharmony_ci{
526362306a36Sopenharmony_ci}
526462306a36Sopenharmony_ci
526562306a36Sopenharmony_cistatic void lru_gen_exit_fault(void)
526662306a36Sopenharmony_ci{
526762306a36Sopenharmony_ci}
526862306a36Sopenharmony_ci#endif /* CONFIG_LRU_GEN */
526962306a36Sopenharmony_ci
527062306a36Sopenharmony_cistatic vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
527162306a36Sopenharmony_ci				       unsigned int *flags)
527262306a36Sopenharmony_ci{
527362306a36Sopenharmony_ci	if (unlikely(*flags & FAULT_FLAG_UNSHARE)) {
527462306a36Sopenharmony_ci		if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE))
527562306a36Sopenharmony_ci			return VM_FAULT_SIGSEGV;
527662306a36Sopenharmony_ci		/*
527762306a36Sopenharmony_ci		 * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's
527862306a36Sopenharmony_ci		 * just treat it like an ordinary read-fault otherwise.
527962306a36Sopenharmony_ci		 */
528062306a36Sopenharmony_ci		if (!is_cow_mapping(vma->vm_flags))
528162306a36Sopenharmony_ci			*flags &= ~FAULT_FLAG_UNSHARE;
528262306a36Sopenharmony_ci	} else if (*flags & FAULT_FLAG_WRITE) {
528362306a36Sopenharmony_ci		/* Write faults on read-only mappings are impossible ... */
528462306a36Sopenharmony_ci		if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)))
528562306a36Sopenharmony_ci			return VM_FAULT_SIGSEGV;
528662306a36Sopenharmony_ci		/* ... and FOLL_FORCE only applies to COW mappings. */
528762306a36Sopenharmony_ci		if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) &&
528862306a36Sopenharmony_ci				 !is_cow_mapping(vma->vm_flags)))
528962306a36Sopenharmony_ci			return VM_FAULT_SIGSEGV;
529062306a36Sopenharmony_ci	}
529162306a36Sopenharmony_ci#ifdef CONFIG_PER_VMA_LOCK
529262306a36Sopenharmony_ci	/*
529362306a36Sopenharmony_ci	 * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of
529462306a36Sopenharmony_ci	 * the assumption that the lock is dropped on VM_FAULT_RETRY.
529562306a36Sopenharmony_ci	 */
529662306a36Sopenharmony_ci	if (WARN_ON_ONCE((*flags &
529762306a36Sopenharmony_ci			(FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) ==
529862306a36Sopenharmony_ci			(FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)))
529962306a36Sopenharmony_ci		return VM_FAULT_SIGSEGV;
530062306a36Sopenharmony_ci#endif
530162306a36Sopenharmony_ci
530262306a36Sopenharmony_ci	return 0;
530362306a36Sopenharmony_ci}
530462306a36Sopenharmony_ci
530562306a36Sopenharmony_ci/*
530662306a36Sopenharmony_ci * By the time we get here, we already hold the mmap_lock.
530762306a36Sopenharmony_ci *
530862306a36Sopenharmony_ci * The mmap_lock may have been released depending on flags and our
530962306a36Sopenharmony_ci * return value.  See filemap_fault() and __folio_lock_or_retry().
531062306a36Sopenharmony_ci */
531162306a36Sopenharmony_civm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
531262306a36Sopenharmony_ci			   unsigned int flags, struct pt_regs *regs)
531362306a36Sopenharmony_ci{
531462306a36Sopenharmony_ci	/* If the fault handler drops the mmap_lock, vma may be freed */
531562306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
531662306a36Sopenharmony_ci	vm_fault_t ret;
531762306a36Sopenharmony_ci
531862306a36Sopenharmony_ci	__set_current_state(TASK_RUNNING);
531962306a36Sopenharmony_ci
532062306a36Sopenharmony_ci	ret = sanitize_fault_flags(vma, &flags);
532162306a36Sopenharmony_ci	if (ret)
532262306a36Sopenharmony_ci		goto out;
532362306a36Sopenharmony_ci
532462306a36Sopenharmony_ci	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
532562306a36Sopenharmony_ci					    flags & FAULT_FLAG_INSTRUCTION,
532662306a36Sopenharmony_ci					    flags & FAULT_FLAG_REMOTE)) {
532762306a36Sopenharmony_ci		ret = VM_FAULT_SIGSEGV;
532862306a36Sopenharmony_ci		goto out;
532962306a36Sopenharmony_ci	}
533062306a36Sopenharmony_ci
533162306a36Sopenharmony_ci	/*
533262306a36Sopenharmony_ci	 * Enable the memcg OOM handling for faults triggered in user
533362306a36Sopenharmony_ci	 * space.  Kernel faults are handled more gracefully.
533462306a36Sopenharmony_ci	 */
533562306a36Sopenharmony_ci	if (flags & FAULT_FLAG_USER)
533662306a36Sopenharmony_ci		mem_cgroup_enter_user_fault();
533762306a36Sopenharmony_ci
533862306a36Sopenharmony_ci	lru_gen_enter_fault(vma);
533962306a36Sopenharmony_ci
534062306a36Sopenharmony_ci	if (unlikely(is_vm_hugetlb_page(vma)))
534162306a36Sopenharmony_ci		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
534262306a36Sopenharmony_ci	else
534362306a36Sopenharmony_ci		ret = __handle_mm_fault(vma, address, flags);
534462306a36Sopenharmony_ci
534562306a36Sopenharmony_ci	lru_gen_exit_fault();
534662306a36Sopenharmony_ci
534762306a36Sopenharmony_ci	if (flags & FAULT_FLAG_USER) {
534862306a36Sopenharmony_ci		mem_cgroup_exit_user_fault();
534962306a36Sopenharmony_ci		/*
535062306a36Sopenharmony_ci		 * The task may have entered a memcg OOM situation but
535162306a36Sopenharmony_ci		 * if the allocation error was handled gracefully (no
535262306a36Sopenharmony_ci		 * VM_FAULT_OOM), there is no need to kill anything.
535362306a36Sopenharmony_ci		 * Just clean up the OOM state peacefully.
535462306a36Sopenharmony_ci		 */
535562306a36Sopenharmony_ci		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
535662306a36Sopenharmony_ci			mem_cgroup_oom_synchronize(false);
535762306a36Sopenharmony_ci	}
535862306a36Sopenharmony_ciout:
535962306a36Sopenharmony_ci	mm_account_fault(mm, regs, address, flags, ret);
536062306a36Sopenharmony_ci
536162306a36Sopenharmony_ci	return ret;
536262306a36Sopenharmony_ci}
536362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(handle_mm_fault);
536462306a36Sopenharmony_ci
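/*
 * Editor's note: an illustrative sketch, not part of this file, of how an
 * architecture's fault handler typically drives handle_mm_fault() (together
 * with lock_mm_and_find_vma() below, when available).  The key contract is
 * that VM_FAULT_RETRY and VM_FAULT_COMPLETED mean the mmap_lock has already
 * been dropped, so a retry must retake it with FAULT_FLAG_TRIED set.  The
 * helper bad_area_nosemaphore() is an assumed arch-local error path, not a
 * real declaration here:
 *
 *	retry:
 *		vma = lock_mm_and_find_vma(mm, address, regs);
 *		if (!vma) {
 *			bad_area_nosemaphore(regs, address);
 *			return;
 *		}
 *
 *		fault = handle_mm_fault(vma, address, flags, regs);
 *
 *		// Both of these mean the mmap_lock was already released.
 *		if (fault & VM_FAULT_COMPLETED)
 *			return;
 *		if (fault & VM_FAULT_RETRY) {
 *			flags |= FAULT_FLAG_TRIED;
 *			goto retry;
 *		}
 *
 *		mmap_read_unlock(mm);
 *		if (fault & VM_FAULT_ERROR)
 *			bad_area_nosemaphore(regs, address);
 */
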
536562306a36Sopenharmony_ci#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
536662306a36Sopenharmony_ci#include <linux/extable.h>
536762306a36Sopenharmony_ci
536862306a36Sopenharmony_cistatic inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
536962306a36Sopenharmony_ci{
537062306a36Sopenharmony_ci	if (likely(mmap_read_trylock(mm)))
537162306a36Sopenharmony_ci		return true;
537262306a36Sopenharmony_ci
537362306a36Sopenharmony_ci	if (regs && !user_mode(regs)) {
537462306a36Sopenharmony_ci		unsigned long ip = exception_ip(regs);
537562306a36Sopenharmony_ci		if (!search_exception_tables(ip))
537662306a36Sopenharmony_ci			return false;
537762306a36Sopenharmony_ci	}
537862306a36Sopenharmony_ci
537962306a36Sopenharmony_ci	return !mmap_read_lock_killable(mm);
538062306a36Sopenharmony_ci}
538162306a36Sopenharmony_ci
538262306a36Sopenharmony_cistatic inline bool mmap_upgrade_trylock(struct mm_struct *mm)
538362306a36Sopenharmony_ci{
538462306a36Sopenharmony_ci	/*
538562306a36Sopenharmony_ci	 * We don't have this operation yet.
538662306a36Sopenharmony_ci	 *
538762306a36Sopenharmony_ci	 * It should be easy enough to do: it's basically an
538862306a36Sopenharmony_ci	 *    atomic_long_try_cmpxchg_acquire()
538962306a36Sopenharmony_ci	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
539062306a36Sopenharmony_ci	 * it also needs the proper lockdep magic etc.
539162306a36Sopenharmony_ci	 */
539262306a36Sopenharmony_ci	return false;
539362306a36Sopenharmony_ci}
539462306a36Sopenharmony_ci
539562306a36Sopenharmony_cistatic inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
539662306a36Sopenharmony_ci{
539762306a36Sopenharmony_ci	mmap_read_unlock(mm);
539862306a36Sopenharmony_ci	if (regs && !user_mode(regs)) {
539962306a36Sopenharmony_ci		unsigned long ip = exception_ip(regs);
540062306a36Sopenharmony_ci		if (!search_exception_tables(ip))
540162306a36Sopenharmony_ci			return false;
540262306a36Sopenharmony_ci	}
540362306a36Sopenharmony_ci	return !mmap_write_lock_killable(mm);
540462306a36Sopenharmony_ci}
540562306a36Sopenharmony_ci
540662306a36Sopenharmony_ci/*
540762306a36Sopenharmony_ci * Helper for page fault handling.
540862306a36Sopenharmony_ci *
540962306a36Sopenharmony_ci * This is kind of equivalent to "mmap_read_lock()" followed
541062306a36Sopenharmony_ci * by "find_extend_vma()", except it's a lot more careful about
541162306a36Sopenharmony_ci * the locking (and will drop the lock on failure).
541262306a36Sopenharmony_ci *
541362306a36Sopenharmony_ci * For example, if we have a kernel bug that causes a page
541462306a36Sopenharmony_ci * fault, we don't want to just use mmap_read_lock() to get
541562306a36Sopenharmony_ci * the mm lock, because that would deadlock if the bug were
541662306a36Sopenharmony_ci * to happen while we're holding the mm lock for writing.
541762306a36Sopenharmony_ci *
541862306a36Sopenharmony_ci * So this checks the exception tables on kernel faults in
541962306a36Sopenharmony_ci * order to only do all of this for instructions that are actually
542062306a36Sopenharmony_ci * expected to fault.
542162306a36Sopenharmony_ci *
542262306a36Sopenharmony_ci * We can also actually take the mm lock for writing if we
542362306a36Sopenharmony_ci * need to extend the vma, which helps the VM layer a lot.
542462306a36Sopenharmony_ci */
542562306a36Sopenharmony_cistruct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
542662306a36Sopenharmony_ci			unsigned long addr, struct pt_regs *regs)
542762306a36Sopenharmony_ci{
542862306a36Sopenharmony_ci	struct vm_area_struct *vma;
542962306a36Sopenharmony_ci
543062306a36Sopenharmony_ci	if (!get_mmap_lock_carefully(mm, regs))
543162306a36Sopenharmony_ci		return NULL;
543262306a36Sopenharmony_ci
543362306a36Sopenharmony_ci	vma = find_vma(mm, addr);
543462306a36Sopenharmony_ci	if (likely(vma && (vma->vm_start <= addr)))
543562306a36Sopenharmony_ci		return vma;
543662306a36Sopenharmony_ci
543762306a36Sopenharmony_ci	/*
543862306a36Sopenharmony_ci	 * Well, dang. We might still be successful, but only
543962306a36Sopenharmony_ci	 * if we can extend a vma to do so.
544062306a36Sopenharmony_ci	 */
544162306a36Sopenharmony_ci	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
544262306a36Sopenharmony_ci		mmap_read_unlock(mm);
544362306a36Sopenharmony_ci		return NULL;
544462306a36Sopenharmony_ci	}
544562306a36Sopenharmony_ci
544662306a36Sopenharmony_ci	/*
544762306a36Sopenharmony_ci	 * We can try to upgrade the mmap lock atomically,
544862306a36Sopenharmony_ci	 * in which case we can continue to use the vma
544962306a36Sopenharmony_ci	 * we already looked up.
545062306a36Sopenharmony_ci	 *
545162306a36Sopenharmony_ci	 * Otherwise we'll have to drop the mmap lock and
545262306a36Sopenharmony_ci	 * re-take it, and also look up the vma again,
545362306a36Sopenharmony_ci	 * re-checking it.
545462306a36Sopenharmony_ci	 */
545562306a36Sopenharmony_ci	if (!mmap_upgrade_trylock(mm)) {
545662306a36Sopenharmony_ci		if (!upgrade_mmap_lock_carefully(mm, regs))
545762306a36Sopenharmony_ci			return NULL;
545862306a36Sopenharmony_ci
545962306a36Sopenharmony_ci		vma = find_vma(mm, addr);
546062306a36Sopenharmony_ci		if (!vma)
546162306a36Sopenharmony_ci			goto fail;
546262306a36Sopenharmony_ci		if (vma->vm_start <= addr)
546362306a36Sopenharmony_ci			goto success;
546462306a36Sopenharmony_ci		if (!(vma->vm_flags & VM_GROWSDOWN))
546562306a36Sopenharmony_ci			goto fail;
546662306a36Sopenharmony_ci	}
546762306a36Sopenharmony_ci
546862306a36Sopenharmony_ci	if (expand_stack_locked(vma, addr))
546962306a36Sopenharmony_ci		goto fail;
547062306a36Sopenharmony_ci
547162306a36Sopenharmony_cisuccess:
547262306a36Sopenharmony_ci	mmap_write_downgrade(mm);
547362306a36Sopenharmony_ci	return vma;
547462306a36Sopenharmony_ci
547562306a36Sopenharmony_cifail:
547662306a36Sopenharmony_ci	mmap_write_unlock(mm);
547762306a36Sopenharmony_ci	return NULL;
547862306a36Sopenharmony_ci}
547962306a36Sopenharmony_ci#endif
548062306a36Sopenharmony_ci
548162306a36Sopenharmony_ci#ifdef CONFIG_PER_VMA_LOCK
548262306a36Sopenharmony_ci/*
548362306a36Sopenharmony_ci * Look up and lock a VMA under RCU protection. The returned VMA is guaranteed
548462306a36Sopenharmony_ci * to be stable and not isolated. If the VMA is not found or is being modified,
548562306a36Sopenharmony_ci * the function returns NULL.
548662306a36Sopenharmony_ci */
548762306a36Sopenharmony_cistruct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
548862306a36Sopenharmony_ci					  unsigned long address)
548962306a36Sopenharmony_ci{
549062306a36Sopenharmony_ci	MA_STATE(mas, &mm->mm_mt, address, address);
549162306a36Sopenharmony_ci	struct vm_area_struct *vma;
549262306a36Sopenharmony_ci
549362306a36Sopenharmony_ci	rcu_read_lock();
549462306a36Sopenharmony_ciretry:
549562306a36Sopenharmony_ci	vma = mas_walk(&mas);
549662306a36Sopenharmony_ci	if (!vma)
549762306a36Sopenharmony_ci		goto inval;
549862306a36Sopenharmony_ci
549962306a36Sopenharmony_ci	if (!vma_start_read(vma))
550062306a36Sopenharmony_ci		goto inval;
550162306a36Sopenharmony_ci
550262306a36Sopenharmony_ci	/*
550362306a36Sopenharmony_ci	 * find_mergeable_anon_vma uses adjacent vmas which are not locked.
550462306a36Sopenharmony_ci	 * This check must happen after vma_start_read(); otherwise, a
550562306a36Sopenharmony_ci	 * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA
550662306a36Sopenharmony_ci	 * from its anon_vma.
550762306a36Sopenharmony_ci	 */
550862306a36Sopenharmony_ci	if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
550962306a36Sopenharmony_ci		goto inval_end_read;
551062306a36Sopenharmony_ci
551162306a36Sopenharmony_ci	/* Check since vm_start/vm_end might change before we lock the VMA */
551262306a36Sopenharmony_ci	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
551362306a36Sopenharmony_ci		goto inval_end_read;
551462306a36Sopenharmony_ci
551562306a36Sopenharmony_ci	/* Check if the VMA got isolated after we found it */
551662306a36Sopenharmony_ci	if (vma->detached) {
551762306a36Sopenharmony_ci		vma_end_read(vma);
551862306a36Sopenharmony_ci		count_vm_vma_lock_event(VMA_LOCK_MISS);
551962306a36Sopenharmony_ci		/* The area was replaced with another one */
552062306a36Sopenharmony_ci		goto retry;
552162306a36Sopenharmony_ci	}
552262306a36Sopenharmony_ci
552362306a36Sopenharmony_ci	rcu_read_unlock();
552462306a36Sopenharmony_ci	return vma;
552562306a36Sopenharmony_ci
552662306a36Sopenharmony_ciinval_end_read:
552762306a36Sopenharmony_ci	vma_end_read(vma);
552862306a36Sopenharmony_ciinval:
552962306a36Sopenharmony_ci	rcu_read_unlock();
553062306a36Sopenharmony_ci	count_vm_vma_lock_event(VMA_LOCK_ABORT);
553162306a36Sopenharmony_ci	return NULL;
553262306a36Sopenharmony_ci}
553362306a36Sopenharmony_ci#endif /* CONFIG_PER_VMA_LOCK */
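
/*
 * Editor's note: an illustrative sketch, not part of this file, of the
 * per-VMA-lock fast path as arch fault handlers typically use
 * lock_vma_under_rcu() before falling back to the mmap_lock path.  The
 * "lock_mmap" and "done" labels are assumptions standing in for the arch's
 * slow path and exit code:
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (!vma)
 *		goto lock_mmap;
 *
 *	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
 *	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 *		vma_end_read(vma);
 *	if (!(fault & VM_FAULT_RETRY))
 *		goto done;
 *
 *	// Could not be handled under the VMA lock: retry under mmap_lock.
 *	lock_mmap:
 *		...
 */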
553462306a36Sopenharmony_ci
553562306a36Sopenharmony_ci#ifndef __PAGETABLE_P4D_FOLDED
553662306a36Sopenharmony_ci/*
553762306a36Sopenharmony_ci * Allocate p4d page table.
553862306a36Sopenharmony_ci * We've already handled the fast-path in-line.
553962306a36Sopenharmony_ci */
554062306a36Sopenharmony_ciint __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
554162306a36Sopenharmony_ci{
554262306a36Sopenharmony_ci	p4d_t *new = p4d_alloc_one(mm, address);
554362306a36Sopenharmony_ci	if (!new)
554462306a36Sopenharmony_ci		return -ENOMEM;
554562306a36Sopenharmony_ci
554662306a36Sopenharmony_ci	spin_lock(&mm->page_table_lock);
554762306a36Sopenharmony_ci	if (pgd_present(*pgd)) {	/* Another has populated it */
554862306a36Sopenharmony_ci		p4d_free(mm, new);
554962306a36Sopenharmony_ci	} else {
555062306a36Sopenharmony_ci		smp_wmb(); /* See comment in pmd_install() */
555162306a36Sopenharmony_ci		pgd_populate(mm, pgd, new);
555262306a36Sopenharmony_ci	}
555362306a36Sopenharmony_ci	spin_unlock(&mm->page_table_lock);
555462306a36Sopenharmony_ci	return 0;
555562306a36Sopenharmony_ci}
555662306a36Sopenharmony_ci#endif /* __PAGETABLE_P4D_FOLDED */
555762306a36Sopenharmony_ci
555862306a36Sopenharmony_ci#ifndef __PAGETABLE_PUD_FOLDED
555962306a36Sopenharmony_ci/*
556062306a36Sopenharmony_ci * Allocate page upper directory.
556162306a36Sopenharmony_ci * We've already handled the fast-path in-line.
556262306a36Sopenharmony_ci */
556362306a36Sopenharmony_ciint __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
556462306a36Sopenharmony_ci{
556562306a36Sopenharmony_ci	pud_t *new = pud_alloc_one(mm, address);
556662306a36Sopenharmony_ci	if (!new)
556762306a36Sopenharmony_ci		return -ENOMEM;
556862306a36Sopenharmony_ci
556962306a36Sopenharmony_ci	spin_lock(&mm->page_table_lock);
557062306a36Sopenharmony_ci	if (!p4d_present(*p4d)) {
557162306a36Sopenharmony_ci		mm_inc_nr_puds(mm);
557262306a36Sopenharmony_ci		smp_wmb(); /* See comment in pmd_install() */
557362306a36Sopenharmony_ci		p4d_populate(mm, p4d, new);
557462306a36Sopenharmony_ci	} else	/* Another has populated it */
557562306a36Sopenharmony_ci		pud_free(mm, new);
557662306a36Sopenharmony_ci	spin_unlock(&mm->page_table_lock);
557762306a36Sopenharmony_ci	return 0;
557862306a36Sopenharmony_ci}
557962306a36Sopenharmony_ci#endif /* __PAGETABLE_PUD_FOLDED */
558062306a36Sopenharmony_ci
558162306a36Sopenharmony_ci#ifndef __PAGETABLE_PMD_FOLDED
558262306a36Sopenharmony_ci/*
558362306a36Sopenharmony_ci * Allocate page middle directory.
558462306a36Sopenharmony_ci * We've already handled the fast-path in-line.
558562306a36Sopenharmony_ci */
558662306a36Sopenharmony_ciint __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
558762306a36Sopenharmony_ci{
558862306a36Sopenharmony_ci	spinlock_t *ptl;
558962306a36Sopenharmony_ci	pmd_t *new = pmd_alloc_one(mm, address);
559062306a36Sopenharmony_ci	if (!new)
559162306a36Sopenharmony_ci		return -ENOMEM;
559262306a36Sopenharmony_ci
559362306a36Sopenharmony_ci	ptl = pud_lock(mm, pud);
559462306a36Sopenharmony_ci	if (!pud_present(*pud)) {
559562306a36Sopenharmony_ci		mm_inc_nr_pmds(mm);
559662306a36Sopenharmony_ci		smp_wmb(); /* See comment in pmd_install() */
559762306a36Sopenharmony_ci		pud_populate(mm, pud, new);
559862306a36Sopenharmony_ci	} else {	/* Another has populated it */
559962306a36Sopenharmony_ci		pmd_free(mm, new);
560062306a36Sopenharmony_ci	}
560162306a36Sopenharmony_ci	spin_unlock(ptl);
560262306a36Sopenharmony_ci	return 0;
560362306a36Sopenharmony_ci}
560462306a36Sopenharmony_ci#endif /* __PAGETABLE_PMD_FOLDED */
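
/*
 * Editor's note: an illustrative sketch, not part of this file, of how the
 * __p4d_alloc()/__pud_alloc()/__pmd_alloc() slow paths above are reached.
 * Callers go through the inline p4d_alloc()/pud_alloc()/pmd_alloc() wrappers,
 * which only fall back to these allocators when the entry is still empty,
 * mirroring the walk done by __handle_mm_fault() earlier in this file:
 *
 *	pgd = pgd_offset(mm, addr);
 *	p4d = p4d_alloc(mm, pgd, addr);
 *	if (!p4d)
 *		return -ENOMEM;
 *	pud = pud_alloc(mm, p4d, addr);
 *	if (!pud)
 *		return -ENOMEM;
 *	pmd = pmd_alloc(mm, pud, addr);
 *	if (!pmd)
 *		return -ENOMEM;
 *	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 */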
560562306a36Sopenharmony_ci
560662306a36Sopenharmony_ci/**
560762306a36Sopenharmony_ci * follow_pte - look up PTE at a user virtual address
560862306a36Sopenharmony_ci * @mm: the mm_struct of the target address space
560962306a36Sopenharmony_ci * @address: user virtual address
561062306a36Sopenharmony_ci * @ptepp: location to store found PTE
561162306a36Sopenharmony_ci * @ptlp: location to store the lock for the PTE
561262306a36Sopenharmony_ci *
561362306a36Sopenharmony_ci * On a successful return, the pointer to the PTE is stored in @ptepp;
561462306a36Sopenharmony_ci * the corresponding lock is taken and its location is stored in @ptlp.
561562306a36Sopenharmony_ci * The contents of the PTE are only stable until @ptlp is released;
561662306a36Sopenharmony_ci * any further use, if any, must be protected against invalidation
561762306a36Sopenharmony_ci * with MMU notifiers.
561862306a36Sopenharmony_ci *
561962306a36Sopenharmony_ci * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
562062306a36Sopenharmony_ci * should be taken for read.
562162306a36Sopenharmony_ci *
562262306a36Sopenharmony_ci * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
562362306a36Sopenharmony_ci * it is not a good general-purpose API.
562462306a36Sopenharmony_ci *
562562306a36Sopenharmony_ci * Return: zero on success, -ve otherwise.
562662306a36Sopenharmony_ci */
562762306a36Sopenharmony_ciint follow_pte(struct mm_struct *mm, unsigned long address,
562862306a36Sopenharmony_ci	       pte_t **ptepp, spinlock_t **ptlp)
562962306a36Sopenharmony_ci{
563062306a36Sopenharmony_ci	pgd_t *pgd;
563162306a36Sopenharmony_ci	p4d_t *p4d;
563262306a36Sopenharmony_ci	pud_t *pud;
563362306a36Sopenharmony_ci	pmd_t *pmd;
563462306a36Sopenharmony_ci	pte_t *ptep;
563562306a36Sopenharmony_ci
563662306a36Sopenharmony_ci	pgd = pgd_offset(mm, address);
563762306a36Sopenharmony_ci	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
563862306a36Sopenharmony_ci		goto out;
563962306a36Sopenharmony_ci
564062306a36Sopenharmony_ci	p4d = p4d_offset(pgd, address);
564162306a36Sopenharmony_ci	if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
564262306a36Sopenharmony_ci		goto out;
564362306a36Sopenharmony_ci
564462306a36Sopenharmony_ci	pud = pud_offset(p4d, address);
564562306a36Sopenharmony_ci	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
564662306a36Sopenharmony_ci		goto out;
564762306a36Sopenharmony_ci
564862306a36Sopenharmony_ci	pmd = pmd_offset(pud, address);
564962306a36Sopenharmony_ci	VM_BUG_ON(pmd_trans_huge(*pmd));
565062306a36Sopenharmony_ci
565162306a36Sopenharmony_ci	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
565262306a36Sopenharmony_ci	if (!ptep)
565362306a36Sopenharmony_ci		goto out;
565462306a36Sopenharmony_ci	if (!pte_present(ptep_get(ptep)))
565562306a36Sopenharmony_ci		goto unlock;
565662306a36Sopenharmony_ci	*ptepp = ptep;
565762306a36Sopenharmony_ci	return 0;
565862306a36Sopenharmony_ciunlock:
565962306a36Sopenharmony_ci	pte_unmap_unlock(ptep, *ptlp);
566062306a36Sopenharmony_ciout:
566162306a36Sopenharmony_ci	return -EINVAL;
566262306a36Sopenharmony_ci}
566362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(follow_pte);
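
/*
 * Editor's note: an illustrative sketch, not part of this file, of the
 * intended follow_pte() calling pattern.  The PTE value has to be copied out
 * while the returned lock is still held; once pte_unmap_unlock() runs, the
 * entry can change or be zapped at any time (follow_pfn() below is the
 * in-tree example of exactly this pattern):
 *
 *	pte_t *ptep, pte;
 *	spinlock_t *ptl;
 *
 *	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
 *		return -EINVAL;
 *	pte = ptep_get(ptep);		// snapshot while holding ptl
 *	pte_unmap_unlock(ptep, ptl);
 *	pfn = pte_pfn(pte);		// safe: uses the snapshot, not *ptep
 */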
566462306a36Sopenharmony_ci
566562306a36Sopenharmony_ci/**
566662306a36Sopenharmony_ci * follow_pfn - look up PFN at a user virtual address
566762306a36Sopenharmony_ci * @vma: memory mapping
566862306a36Sopenharmony_ci * @address: user virtual address
566962306a36Sopenharmony_ci * @pfn: location to store found PFN
567062306a36Sopenharmony_ci *
567162306a36Sopenharmony_ci * Only IO mappings and raw PFN mappings are allowed.
567262306a36Sopenharmony_ci *
567362306a36Sopenharmony_ci * This function does not allow the caller to read the permissions
567462306a36Sopenharmony_ci * of the PTE.  Do not use it.
567562306a36Sopenharmony_ci *
567662306a36Sopenharmony_ci * Return: zero and the pfn at @pfn on success, -ve otherwise.
567762306a36Sopenharmony_ci */
567862306a36Sopenharmony_ciint follow_pfn(struct vm_area_struct *vma, unsigned long address,
567962306a36Sopenharmony_ci	unsigned long *pfn)
568062306a36Sopenharmony_ci{
568162306a36Sopenharmony_ci	int ret = -EINVAL;
568262306a36Sopenharmony_ci	spinlock_t *ptl;
568362306a36Sopenharmony_ci	pte_t *ptep;
568462306a36Sopenharmony_ci
568562306a36Sopenharmony_ci	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
568662306a36Sopenharmony_ci		return ret;
568762306a36Sopenharmony_ci
568862306a36Sopenharmony_ci	ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
568962306a36Sopenharmony_ci	if (ret)
569062306a36Sopenharmony_ci		return ret;
569162306a36Sopenharmony_ci	*pfn = pte_pfn(ptep_get(ptep));
569262306a36Sopenharmony_ci	pte_unmap_unlock(ptep, ptl);
569362306a36Sopenharmony_ci	return 0;
569462306a36Sopenharmony_ci}
569562306a36Sopenharmony_ciEXPORT_SYMBOL(follow_pfn);
569662306a36Sopenharmony_ci
569762306a36Sopenharmony_ci#ifdef CONFIG_HAVE_IOREMAP_PROT
569862306a36Sopenharmony_ciint follow_phys(struct vm_area_struct *vma,
569962306a36Sopenharmony_ci		unsigned long address, unsigned int flags,
570062306a36Sopenharmony_ci		unsigned long *prot, resource_size_t *phys)
570162306a36Sopenharmony_ci{
570262306a36Sopenharmony_ci	int ret = -EINVAL;
570362306a36Sopenharmony_ci	pte_t *ptep, pte;
570462306a36Sopenharmony_ci	spinlock_t *ptl;
570562306a36Sopenharmony_ci
570662306a36Sopenharmony_ci	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
570762306a36Sopenharmony_ci		goto out;
570862306a36Sopenharmony_ci
570962306a36Sopenharmony_ci	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
571062306a36Sopenharmony_ci		goto out;
571162306a36Sopenharmony_ci	pte = ptep_get(ptep);
571262306a36Sopenharmony_ci
571362306a36Sopenharmony_ci	if ((flags & FOLL_WRITE) && !pte_write(pte))
571462306a36Sopenharmony_ci		goto unlock;
571562306a36Sopenharmony_ci
571662306a36Sopenharmony_ci	*prot = pgprot_val(pte_pgprot(pte));
571762306a36Sopenharmony_ci	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
571862306a36Sopenharmony_ci
571962306a36Sopenharmony_ci	ret = 0;
572062306a36Sopenharmony_ciunlock:
572162306a36Sopenharmony_ci	pte_unmap_unlock(ptep, ptl);
572262306a36Sopenharmony_ciout:
572362306a36Sopenharmony_ci	return ret;
572462306a36Sopenharmony_ci}
572562306a36Sopenharmony_ci
572662306a36Sopenharmony_ci/**
572762306a36Sopenharmony_ci * generic_access_phys - generic implementation for iomem mmap access
572862306a36Sopenharmony_ci * @vma: the vma to access
572962306a36Sopenharmony_ci * @addr: userspace address, not relative offset within @vma
573062306a36Sopenharmony_ci * @buf: buffer to read/write
573162306a36Sopenharmony_ci * @len: length of transfer
573262306a36Sopenharmony_ci * @write: set to FOLL_WRITE when writing, otherwise reading
573362306a36Sopenharmony_ci *
573462306a36Sopenharmony_ci * This is a generic implementation for &vm_operations_struct.access for an
573562306a36Sopenharmony_ci * iomem mapping. This callback is used by access_process_vm() when the @vma is
573662306a36Sopenharmony_ci * not page based.
573762306a36Sopenharmony_ci */
573862306a36Sopenharmony_ciint generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
573962306a36Sopenharmony_ci			void *buf, int len, int write)
574062306a36Sopenharmony_ci{
574162306a36Sopenharmony_ci	resource_size_t phys_addr;
574262306a36Sopenharmony_ci	unsigned long prot = 0;
574362306a36Sopenharmony_ci	void __iomem *maddr;
574462306a36Sopenharmony_ci	pte_t *ptep, pte;
574562306a36Sopenharmony_ci	spinlock_t *ptl;
574662306a36Sopenharmony_ci	int offset = offset_in_page(addr);
574762306a36Sopenharmony_ci	int ret = -EINVAL;
574862306a36Sopenharmony_ci
574962306a36Sopenharmony_ci	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
575062306a36Sopenharmony_ci		return -EINVAL;
575162306a36Sopenharmony_ci
575262306a36Sopenharmony_ciretry:
575362306a36Sopenharmony_ci	if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
575462306a36Sopenharmony_ci		return -EINVAL;
575562306a36Sopenharmony_ci	pte = ptep_get(ptep);
575662306a36Sopenharmony_ci	pte_unmap_unlock(ptep, ptl);
575762306a36Sopenharmony_ci
575862306a36Sopenharmony_ci	prot = pgprot_val(pte_pgprot(pte));
575962306a36Sopenharmony_ci	phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
576062306a36Sopenharmony_ci
576162306a36Sopenharmony_ci	if ((write & FOLL_WRITE) && !pte_write(pte))
576262306a36Sopenharmony_ci		return -EINVAL;
576362306a36Sopenharmony_ci
576462306a36Sopenharmony_ci	maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
576562306a36Sopenharmony_ci	if (!maddr)
576662306a36Sopenharmony_ci		return -ENOMEM;
576762306a36Sopenharmony_ci
576862306a36Sopenharmony_ci	if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
576962306a36Sopenharmony_ci		goto out_unmap;
577062306a36Sopenharmony_ci
577162306a36Sopenharmony_ci	if (!pte_same(pte, ptep_get(ptep))) {
577262306a36Sopenharmony_ci		pte_unmap_unlock(ptep, ptl);
577362306a36Sopenharmony_ci		iounmap(maddr);
577462306a36Sopenharmony_ci
577562306a36Sopenharmony_ci		goto retry;
577662306a36Sopenharmony_ci	}
577762306a36Sopenharmony_ci
577862306a36Sopenharmony_ci	if (write)
577962306a36Sopenharmony_ci		memcpy_toio(maddr + offset, buf, len);
578062306a36Sopenharmony_ci	else
578162306a36Sopenharmony_ci		memcpy_fromio(buf, maddr + offset, len);
578262306a36Sopenharmony_ci	ret = len;
578362306a36Sopenharmony_ci	pte_unmap_unlock(ptep, ptl);
578462306a36Sopenharmony_ciout_unmap:
578562306a36Sopenharmony_ci	iounmap(maddr);
578662306a36Sopenharmony_ci
578762306a36Sopenharmony_ci	return ret;
578862306a36Sopenharmony_ci}
578962306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(generic_access_phys);
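
/*
 * Editor's note: an illustrative sketch, not part of this file, of how a
 * driver would wire generic_access_phys() up so that access_process_vm()
 * (and therefore ptrace/gdb) can read an iomem mapping.  The names
 * my_iomem_vm_ops and my_mmap are assumptions:
 *
 *	static const struct vm_operations_struct my_iomem_vm_ops = {
 *		.access = generic_access_phys,
 *	};
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		vma->vm_ops = &my_iomem_vm_ops;
 *		return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
 *					  vma->vm_end - vma->vm_start,
 *					  vma->vm_page_prot);
 *	}
 */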
579062306a36Sopenharmony_ci#endif
579162306a36Sopenharmony_ci
579262306a36Sopenharmony_ci/*
579362306a36Sopenharmony_ci * Access another process' address space as given in mm.
579462306a36Sopenharmony_ci */
579562306a36Sopenharmony_ciint __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
579662306a36Sopenharmony_ci		       int len, unsigned int gup_flags)
579762306a36Sopenharmony_ci{
579862306a36Sopenharmony_ci	void *old_buf = buf;
579962306a36Sopenharmony_ci	int write = gup_flags & FOLL_WRITE;
580062306a36Sopenharmony_ci
580162306a36Sopenharmony_ci	if (mmap_read_lock_killable(mm))
580262306a36Sopenharmony_ci		return 0;
580362306a36Sopenharmony_ci
580462306a36Sopenharmony_ci	/* Untag the address before looking up the VMA */
580562306a36Sopenharmony_ci	addr = untagged_addr_remote(mm, addr);
580662306a36Sopenharmony_ci
580762306a36Sopenharmony_ci	/* Avoid triggering the temporary warning in __get_user_pages */
580862306a36Sopenharmony_ci	if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
580962306a36Sopenharmony_ci		return 0;
581062306a36Sopenharmony_ci
581162306a36Sopenharmony_ci	/* ignore errors, just check how much was successfully transferred */
581262306a36Sopenharmony_ci	while (len) {
581362306a36Sopenharmony_ci		int bytes, offset;
581462306a36Sopenharmony_ci		void *maddr;
581562306a36Sopenharmony_ci		struct vm_area_struct *vma = NULL;
581662306a36Sopenharmony_ci		struct page *page = get_user_page_vma_remote(mm, addr,
581762306a36Sopenharmony_ci							     gup_flags, &vma);
581862306a36Sopenharmony_ci
581962306a36Sopenharmony_ci		if (IS_ERR_OR_NULL(page)) {
582062306a36Sopenharmony_ci			/* We might need to expand the stack to access it */
582162306a36Sopenharmony_ci			vma = vma_lookup(mm, addr);
582262306a36Sopenharmony_ci			if (!vma) {
582362306a36Sopenharmony_ci				vma = expand_stack(mm, addr);
582462306a36Sopenharmony_ci
582562306a36Sopenharmony_ci				/* mmap_lock was dropped on failure */
582662306a36Sopenharmony_ci				if (!vma)
582762306a36Sopenharmony_ci					return buf - old_buf;
582862306a36Sopenharmony_ci
582962306a36Sopenharmony_ci				/* Try again if stack expansion worked */
583062306a36Sopenharmony_ci				continue;
583162306a36Sopenharmony_ci			}
583262306a36Sopenharmony_ci
583462306a36Sopenharmony_ci			/*
583562306a36Sopenharmony_ci			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
583662306a36Sopenharmony_ci			 * we can access using slightly different code.
583762306a36Sopenharmony_ci			 */
583862306a36Sopenharmony_ci			bytes = 0;
583962306a36Sopenharmony_ci#ifdef CONFIG_HAVE_IOREMAP_PROT
584062306a36Sopenharmony_ci			if (vma->vm_ops && vma->vm_ops->access)
584162306a36Sopenharmony_ci				bytes = vma->vm_ops->access(vma, addr, buf,
584262306a36Sopenharmony_ci							    len, write);
584362306a36Sopenharmony_ci#endif
584462306a36Sopenharmony_ci			if (bytes <= 0)
584562306a36Sopenharmony_ci				break;
584662306a36Sopenharmony_ci		} else {
584762306a36Sopenharmony_ci			bytes = len;
584862306a36Sopenharmony_ci			offset = addr & (PAGE_SIZE-1);
584962306a36Sopenharmony_ci			if (bytes > PAGE_SIZE-offset)
585062306a36Sopenharmony_ci				bytes = PAGE_SIZE-offset;
585162306a36Sopenharmony_ci
585262306a36Sopenharmony_ci			maddr = kmap(page);
585362306a36Sopenharmony_ci			if (write) {
585462306a36Sopenharmony_ci				copy_to_user_page(vma, page, addr,
585562306a36Sopenharmony_ci						  maddr + offset, buf, bytes);
585662306a36Sopenharmony_ci				set_page_dirty_lock(page);
585762306a36Sopenharmony_ci			} else {
585862306a36Sopenharmony_ci				copy_from_user_page(vma, page, addr,
585962306a36Sopenharmony_ci						    buf, maddr + offset, bytes);
586062306a36Sopenharmony_ci			}
586162306a36Sopenharmony_ci			kunmap(page);
586262306a36Sopenharmony_ci			put_page(page);
586362306a36Sopenharmony_ci		}
586462306a36Sopenharmony_ci		len -= bytes;
586562306a36Sopenharmony_ci		buf += bytes;
586662306a36Sopenharmony_ci		addr += bytes;
586762306a36Sopenharmony_ci	}
586862306a36Sopenharmony_ci	mmap_read_unlock(mm);
586962306a36Sopenharmony_ci
587062306a36Sopenharmony_ci	return buf - old_buf;
587162306a36Sopenharmony_ci}
587262306a36Sopenharmony_ci
587362306a36Sopenharmony_ci/**
587462306a36Sopenharmony_ci * access_remote_vm - access another process' address space
587562306a36Sopenharmony_ci * @mm:		the mm_struct of the target address space
587662306a36Sopenharmony_ci * @addr:	start address to access
587762306a36Sopenharmony_ci * @buf:	source or destination buffer
587862306a36Sopenharmony_ci * @len:	number of bytes to transfer
587962306a36Sopenharmony_ci * @gup_flags:	flags modifying lookup behaviour
588062306a36Sopenharmony_ci *
588162306a36Sopenharmony_ci * The caller must hold a reference on @mm.
588262306a36Sopenharmony_ci *
588362306a36Sopenharmony_ci * Return: number of bytes copied from source to destination.
588462306a36Sopenharmony_ci */
588562306a36Sopenharmony_ciint access_remote_vm(struct mm_struct *mm, unsigned long addr,
588662306a36Sopenharmony_ci		void *buf, int len, unsigned int gup_flags)
588762306a36Sopenharmony_ci{
588862306a36Sopenharmony_ci	return __access_remote_vm(mm, addr, buf, len, gup_flags);
588962306a36Sopenharmony_ci}
589062306a36Sopenharmony_ci
589162306a36Sopenharmony_ci/*
589262306a36Sopenharmony_ci * Access another process' address space.
589362306a36Sopenharmony_ci * Source/target buffer must be in kernel space.
589462306a36Sopenharmony_ci * Do not walk the page tables directly; use get_user_pages().
589562306a36Sopenharmony_ci */
589662306a36Sopenharmony_ciint access_process_vm(struct task_struct *tsk, unsigned long addr,
589762306a36Sopenharmony_ci		void *buf, int len, unsigned int gup_flags)
589862306a36Sopenharmony_ci{
589962306a36Sopenharmony_ci	struct mm_struct *mm;
590062306a36Sopenharmony_ci	int ret;
590162306a36Sopenharmony_ci
590262306a36Sopenharmony_ci	mm = get_task_mm(tsk);
590362306a36Sopenharmony_ci	if (!mm)
590462306a36Sopenharmony_ci		return 0;
590562306a36Sopenharmony_ci
590662306a36Sopenharmony_ci	ret = __access_remote_vm(mm, addr, buf, len, gup_flags);
590762306a36Sopenharmony_ci
590862306a36Sopenharmony_ci	mmput(mm);
590962306a36Sopenharmony_ci
591062306a36Sopenharmony_ci	return ret;
591162306a36Sopenharmony_ci}
591262306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(access_process_vm);
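
/*
 * Editor's note: an illustrative sketch, not part of this file, of a typical
 * access_process_vm() caller, in the style of the ptrace peek/poke helpers.
 * FOLL_FORCE permits debugger-style access to read-only private mappings;
 * "child" is an assumed task_struct pointer held by the caller:
 *
 *	unsigned long val;
 *	int copied;
 *
 *	copied = access_process_vm(child, addr, &val, sizeof(val), FOLL_FORCE);
 *	if (copied != sizeof(val))
 *		return -EIO;
 */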
591362306a36Sopenharmony_ci
591462306a36Sopenharmony_ci/*
591562306a36Sopenharmony_ci * Print the name of a VMA.
591662306a36Sopenharmony_ci */
591762306a36Sopenharmony_civoid print_vma_addr(char *prefix, unsigned long ip)
591862306a36Sopenharmony_ci{
591962306a36Sopenharmony_ci	struct mm_struct *mm = current->mm;
592062306a36Sopenharmony_ci	struct vm_area_struct *vma;
592162306a36Sopenharmony_ci
592262306a36Sopenharmony_ci	/*
592362306a36Sopenharmony_ci	 * We might be running from an atomic context, so we cannot sleep.
592462306a36Sopenharmony_ci	 */
592562306a36Sopenharmony_ci	if (!mmap_read_trylock(mm))
592662306a36Sopenharmony_ci		return;
592762306a36Sopenharmony_ci
592862306a36Sopenharmony_ci	vma = find_vma(mm, ip);
592962306a36Sopenharmony_ci	if (vma && vma->vm_file) {
593062306a36Sopenharmony_ci		struct file *f = vma->vm_file;
593162306a36Sopenharmony_ci		char *buf = (char *)__get_free_page(GFP_NOWAIT);
593262306a36Sopenharmony_ci		if (buf) {
593362306a36Sopenharmony_ci			char *p;
593462306a36Sopenharmony_ci
593562306a36Sopenharmony_ci			p = file_path(f, buf, PAGE_SIZE);
593662306a36Sopenharmony_ci			if (IS_ERR(p))
593762306a36Sopenharmony_ci				p = "?";
593862306a36Sopenharmony_ci			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
593962306a36Sopenharmony_ci					vma->vm_start,
594062306a36Sopenharmony_ci					vma->vm_end - vma->vm_start);
594162306a36Sopenharmony_ci			free_page((unsigned long)buf);
594262306a36Sopenharmony_ci		}
594362306a36Sopenharmony_ci	}
594462306a36Sopenharmony_ci	mmap_read_unlock(mm);
594562306a36Sopenharmony_ci}
594662306a36Sopenharmony_ci
594762306a36Sopenharmony_ci#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
594862306a36Sopenharmony_civoid __might_fault(const char *file, int line)
594962306a36Sopenharmony_ci{
595062306a36Sopenharmony_ci	if (pagefault_disabled())
595162306a36Sopenharmony_ci		return;
595262306a36Sopenharmony_ci	__might_sleep(file, line);
595362306a36Sopenharmony_ci#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
595462306a36Sopenharmony_ci	if (current->mm)
595562306a36Sopenharmony_ci		might_lock_read(&current->mm->mmap_lock);
595662306a36Sopenharmony_ci#endif
595762306a36Sopenharmony_ci}
595862306a36Sopenharmony_ciEXPORT_SYMBOL(__might_fault);
595962306a36Sopenharmony_ci#endif
596062306a36Sopenharmony_ci
596162306a36Sopenharmony_ci#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
596262306a36Sopenharmony_ci/*
596362306a36Sopenharmony_ci * Process all subpages of the specified huge page with the specified
596462306a36Sopenharmony_ci * operation.  The target subpage will be processed last to keep its
596562306a36Sopenharmony_ci * cache lines hot.
596662306a36Sopenharmony_ci */
596762306a36Sopenharmony_cistatic inline int process_huge_page(
596862306a36Sopenharmony_ci	unsigned long addr_hint, unsigned int pages_per_huge_page,
596962306a36Sopenharmony_ci	int (*process_subpage)(unsigned long addr, int idx, void *arg),
597062306a36Sopenharmony_ci	void *arg)
597162306a36Sopenharmony_ci{
597262306a36Sopenharmony_ci	int i, n, base, l, ret;
597362306a36Sopenharmony_ci	unsigned long addr = addr_hint &
597462306a36Sopenharmony_ci		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
597562306a36Sopenharmony_ci
597662306a36Sopenharmony_ci	/* Process target subpage last to keep its cache lines hot */
597762306a36Sopenharmony_ci	might_sleep();
597862306a36Sopenharmony_ci	n = (addr_hint - addr) / PAGE_SIZE;
597962306a36Sopenharmony_ci	if (2 * n <= pages_per_huge_page) {
598062306a36Sopenharmony_ci		/* If target subpage in first half of huge page */
598162306a36Sopenharmony_ci		base = 0;
598262306a36Sopenharmony_ci		l = n;
598362306a36Sopenharmony_ci		/* Process subpages at the end of huge page */
598462306a36Sopenharmony_ci		for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
598562306a36Sopenharmony_ci			cond_resched();
598662306a36Sopenharmony_ci			ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
598762306a36Sopenharmony_ci			if (ret)
598862306a36Sopenharmony_ci				return ret;
598962306a36Sopenharmony_ci		}
599062306a36Sopenharmony_ci	} else {
599162306a36Sopenharmony_ci		/* If target subpage in second half of huge page */
599262306a36Sopenharmony_ci		base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
599362306a36Sopenharmony_ci		l = pages_per_huge_page - n;
599462306a36Sopenharmony_ci		/* Process subpages at the beginning of the huge page */
599562306a36Sopenharmony_ci		for (i = 0; i < base; i++) {
599662306a36Sopenharmony_ci			cond_resched();
599762306a36Sopenharmony_ci			ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
599862306a36Sopenharmony_ci			if (ret)
599962306a36Sopenharmony_ci				return ret;
600062306a36Sopenharmony_ci		}
600162306a36Sopenharmony_ci	}
600262306a36Sopenharmony_ci	/*
600362306a36Sopenharmony_ci	 * Process remaining subpages in left-right-left-right pattern
600462306a36Sopenharmony_ci	 * towards the target subpage
600562306a36Sopenharmony_ci	 */
600662306a36Sopenharmony_ci	for (i = 0; i < l; i++) {
600762306a36Sopenharmony_ci		int left_idx = base + i;
600862306a36Sopenharmony_ci		int right_idx = base + 2 * l - 1 - i;
600962306a36Sopenharmony_ci
601062306a36Sopenharmony_ci		cond_resched();
601162306a36Sopenharmony_ci		ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
601262306a36Sopenharmony_ci		if (ret)
601362306a36Sopenharmony_ci			return ret;
601462306a36Sopenharmony_ci		cond_resched();
601562306a36Sopenharmony_ci		ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
601662306a36Sopenharmony_ci		if (ret)
601762306a36Sopenharmony_ci			return ret;
601862306a36Sopenharmony_ci	}
601962306a36Sopenharmony_ci	return 0;
602062306a36Sopenharmony_ci}
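
/*
 * Editor's note: a worked example, not part of this file, of the ordering
 * process_huge_page() produces.  With pages_per_huge_page == 8 and the hint
 * address in subpage 5 (second half), we get n = 5, base = 2, l = 3, and the
 * subpages are processed in the order
 *
 *	0, 1, 2, 7, 3, 6, 4, 5
 *
 * i.e. the far end first, then narrowing in left-right towards subpage 5,
 * which is touched last so that its cache lines stay hot for the caller.
 */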
602162306a36Sopenharmony_ci
602262306a36Sopenharmony_cistatic void clear_gigantic_page(struct page *page,
602362306a36Sopenharmony_ci				unsigned long addr,
602462306a36Sopenharmony_ci				unsigned int pages_per_huge_page)
602562306a36Sopenharmony_ci{
602662306a36Sopenharmony_ci	int i;
602762306a36Sopenharmony_ci	struct page *p;
602862306a36Sopenharmony_ci
602962306a36Sopenharmony_ci	might_sleep();
603062306a36Sopenharmony_ci	for (i = 0; i < pages_per_huge_page; i++) {
603162306a36Sopenharmony_ci		p = nth_page(page, i);
603262306a36Sopenharmony_ci		cond_resched();
603362306a36Sopenharmony_ci		clear_user_highpage(p, addr + i * PAGE_SIZE);
603462306a36Sopenharmony_ci	}
603562306a36Sopenharmony_ci}
603662306a36Sopenharmony_ci
603762306a36Sopenharmony_cistatic int clear_subpage(unsigned long addr, int idx, void *arg)
603862306a36Sopenharmony_ci{
603962306a36Sopenharmony_ci	struct page *page = arg;
604062306a36Sopenharmony_ci
604162306a36Sopenharmony_ci	clear_user_highpage(page + idx, addr);
604262306a36Sopenharmony_ci	return 0;
604362306a36Sopenharmony_ci}
604462306a36Sopenharmony_ci
604562306a36Sopenharmony_civoid clear_huge_page(struct page *page,
604662306a36Sopenharmony_ci		     unsigned long addr_hint, unsigned int pages_per_huge_page)
604762306a36Sopenharmony_ci{
604862306a36Sopenharmony_ci	unsigned long addr = addr_hint &
604962306a36Sopenharmony_ci		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
605062306a36Sopenharmony_ci
605162306a36Sopenharmony_ci	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
605262306a36Sopenharmony_ci		clear_gigantic_page(page, addr, pages_per_huge_page);
605362306a36Sopenharmony_ci		return;
605462306a36Sopenharmony_ci	}
605562306a36Sopenharmony_ci
605662306a36Sopenharmony_ci	process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
605762306a36Sopenharmony_ci}
605862306a36Sopenharmony_ci
605962306a36Sopenharmony_cistatic int copy_user_gigantic_page(struct folio *dst, struct folio *src,
606062306a36Sopenharmony_ci				     unsigned long addr,
606162306a36Sopenharmony_ci				     struct vm_area_struct *vma,
606262306a36Sopenharmony_ci				     unsigned int pages_per_huge_page)
606362306a36Sopenharmony_ci{
606462306a36Sopenharmony_ci	int i;
606562306a36Sopenharmony_ci	struct page *dst_page;
606662306a36Sopenharmony_ci	struct page *src_page;
606762306a36Sopenharmony_ci
606862306a36Sopenharmony_ci	for (i = 0; i < pages_per_huge_page; i++) {
606962306a36Sopenharmony_ci		dst_page = folio_page(dst, i);
607062306a36Sopenharmony_ci		src_page = folio_page(src, i);
607162306a36Sopenharmony_ci
607262306a36Sopenharmony_ci		cond_resched();
607362306a36Sopenharmony_ci		if (copy_mc_user_highpage(dst_page, src_page,
607462306a36Sopenharmony_ci					  addr + i*PAGE_SIZE, vma)) {
607562306a36Sopenharmony_ci			memory_failure_queue(page_to_pfn(src_page), 0);
607662306a36Sopenharmony_ci			return -EHWPOISON;
607762306a36Sopenharmony_ci		}
607862306a36Sopenharmony_ci	}
607962306a36Sopenharmony_ci	return 0;
608062306a36Sopenharmony_ci}
608162306a36Sopenharmony_ci
608262306a36Sopenharmony_cistruct copy_subpage_arg {
608362306a36Sopenharmony_ci	struct page *dst;
608462306a36Sopenharmony_ci	struct page *src;
608562306a36Sopenharmony_ci	struct vm_area_struct *vma;
608662306a36Sopenharmony_ci};
608762306a36Sopenharmony_ci
608862306a36Sopenharmony_cistatic int copy_subpage(unsigned long addr, int idx, void *arg)
608962306a36Sopenharmony_ci{
609062306a36Sopenharmony_ci	struct copy_subpage_arg *copy_arg = arg;
609162306a36Sopenharmony_ci
609262306a36Sopenharmony_ci	if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
609362306a36Sopenharmony_ci				  addr, copy_arg->vma)) {
609462306a36Sopenharmony_ci		memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
609562306a36Sopenharmony_ci		return -EHWPOISON;
609662306a36Sopenharmony_ci	}
609762306a36Sopenharmony_ci	return 0;
609862306a36Sopenharmony_ci}
609962306a36Sopenharmony_ci
610062306a36Sopenharmony_ciint copy_user_large_folio(struct folio *dst, struct folio *src,
610162306a36Sopenharmony_ci			  unsigned long addr_hint, struct vm_area_struct *vma)
610262306a36Sopenharmony_ci{
610362306a36Sopenharmony_ci	unsigned int pages_per_huge_page = folio_nr_pages(dst);
610462306a36Sopenharmony_ci	unsigned long addr = addr_hint &
610562306a36Sopenharmony_ci		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
610662306a36Sopenharmony_ci	struct copy_subpage_arg arg = {
610762306a36Sopenharmony_ci		.dst = &dst->page,
610862306a36Sopenharmony_ci		.src = &src->page,
610962306a36Sopenharmony_ci		.vma = vma,
611062306a36Sopenharmony_ci	};
611162306a36Sopenharmony_ci
611262306a36Sopenharmony_ci	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES))
611362306a36Sopenharmony_ci		return copy_user_gigantic_page(dst, src, addr, vma,
611462306a36Sopenharmony_ci					       pages_per_huge_page);
611562306a36Sopenharmony_ci
611662306a36Sopenharmony_ci	return process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
611762306a36Sopenharmony_ci}
611862306a36Sopenharmony_ci
611962306a36Sopenharmony_cilong copy_folio_from_user(struct folio *dst_folio,
612062306a36Sopenharmony_ci			   const void __user *usr_src,
612162306a36Sopenharmony_ci			   bool allow_pagefault)
612262306a36Sopenharmony_ci{
612362306a36Sopenharmony_ci	void *kaddr;
612462306a36Sopenharmony_ci	unsigned long i, rc = 0;
612562306a36Sopenharmony_ci	unsigned int nr_pages = folio_nr_pages(dst_folio);
612662306a36Sopenharmony_ci	unsigned long ret_val = nr_pages * PAGE_SIZE;
612762306a36Sopenharmony_ci	struct page *subpage;
612862306a36Sopenharmony_ci
612962306a36Sopenharmony_ci	for (i = 0; i < nr_pages; i++) {
613062306a36Sopenharmony_ci		subpage = folio_page(dst_folio, i);
613162306a36Sopenharmony_ci		kaddr = kmap_local_page(subpage);
613262306a36Sopenharmony_ci		if (!allow_pagefault)
613362306a36Sopenharmony_ci			pagefault_disable();
613462306a36Sopenharmony_ci		rc = copy_from_user(kaddr, usr_src + i * PAGE_SIZE, PAGE_SIZE);
613562306a36Sopenharmony_ci		if (!allow_pagefault)
613662306a36Sopenharmony_ci			pagefault_enable();
613762306a36Sopenharmony_ci		kunmap_local(kaddr);
613862306a36Sopenharmony_ci
613962306a36Sopenharmony_ci		ret_val -= (PAGE_SIZE - rc);
614062306a36Sopenharmony_ci		if (rc)
614162306a36Sopenharmony_ci			break;
614262306a36Sopenharmony_ci
614362306a36Sopenharmony_ci		flush_dcache_page(subpage);
614462306a36Sopenharmony_ci
614562306a36Sopenharmony_ci		cond_resched();
614662306a36Sopenharmony_ci	}
614762306a36Sopenharmony_ci	return ret_val;
614862306a36Sopenharmony_ci}
614962306a36Sopenharmony_ci#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
615062306a36Sopenharmony_ci
615162306a36Sopenharmony_ci#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
615262306a36Sopenharmony_ci
615362306a36Sopenharmony_cistatic struct kmem_cache *page_ptl_cachep;
615462306a36Sopenharmony_ci
615562306a36Sopenharmony_civoid __init ptlock_cache_init(void)
615662306a36Sopenharmony_ci{
615762306a36Sopenharmony_ci	page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
615862306a36Sopenharmony_ci			SLAB_PANIC, NULL);
615962306a36Sopenharmony_ci}
616062306a36Sopenharmony_ci
616162306a36Sopenharmony_cibool ptlock_alloc(struct ptdesc *ptdesc)
616262306a36Sopenharmony_ci{
616362306a36Sopenharmony_ci	spinlock_t *ptl;
616462306a36Sopenharmony_ci
616562306a36Sopenharmony_ci	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
616662306a36Sopenharmony_ci	if (!ptl)
616762306a36Sopenharmony_ci		return false;
616862306a36Sopenharmony_ci	ptdesc->ptl = ptl;
616962306a36Sopenharmony_ci	return true;
617062306a36Sopenharmony_ci}
617162306a36Sopenharmony_ci
617262306a36Sopenharmony_civoid ptlock_free(struct ptdesc *ptdesc)
617362306a36Sopenharmony_ci{
617462306a36Sopenharmony_ci	kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
617562306a36Sopenharmony_ci}
617662306a36Sopenharmony_ci#endif