// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/mm_purgeable.h>

#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"
#include <linux/xpm.h>

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
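 *
 * ( 0 disables randomization, 1 randomizes stacks, mmap base and VDSO
 *   but keeps a legacy brk, 2 additionally randomizes the brk base;
 *   see Documentation/admin-guide/sysctl/kernel.rst. )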
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

#ifndef arch_faults_on_old_pte
static inline bool arch_faults_on_old_pte(void)
{
	/*
	 * Architectures which don't have a hardware access flag need to
	 * implement their own helper. By default, "true" means a page
	 * fault will be taken on an old pte.
	 */
	return true;
}
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
early_initcall(init_zero_pfn);

void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
{
	trace_rss_stat(mm, member, count);
}

#if defined(SPLIT_RSS_COUNTING)

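/*
 * Fold the per-task rss deltas accumulated in current->rss_stat into
 * the mm-wide counters and reset them.
 */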
void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

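	/*
	 * The pmd table lies entirely within [floor, ceiling) and no
	 * other vma can still be using it, so unhook it from the pud
	 * and free it.
	 */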
	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= P4D_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= P4D_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(p4d, start);
	p4d_clear(p4d);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long start;

	start = addr;
	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
	} while (p4d++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	p4d = p4d_offset(pgd, start);
	pgd_clear(pgd);
	p4d_free_tlb(tlb, p4d, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? No, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;
	/*
	 * Page table pages are added to the mmu_gather with PAGE_SIZE
	 * granularity (see pte_free_tlb()), so make sure the TLB is
	 * flushed at that page size if a flush is needed.
	 */
	tlb_change_page_size(tlb, PAGE_SIZE);
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		}
		vma = next;
	}
}

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl;
	pgtable_t new = pte_alloc_one(mm);
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_rmb() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	ptl = pmd_lock(mm, pmd);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm_inc_nr_ptes(mm);
		pmd_populate(mm, pmd, new);
		new = NULL;
	}
	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	p4d_t *p4d = p4d_offset(pgd, addr);
	pud_t *pud = pud_offset(p4d, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
				 nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	pr_alert("BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
		 current->comm,
		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 mapping ? mapping->a_ops->readpage : NULL);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
			    pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (is_zero_pfn(pfn))
			return NULL;
		if (pte_devmap(pte))
			return NULL;

		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;

check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
				pmd_t pmd)
{
	unsigned long pfn = pmd_pfn(pmd);

	/*
	 * There is no pmd_special() but there may be special pmds, e.g.
	 * in a direct-access (dax) mapping, so let's just replicate the
	 * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
	 */
	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (pmd_devmap(pmd))
		return NULL;
	if (is_huge_zero_pmd(pmd))
		return NULL;
	if (unlikely(pfn > highest_memmap_pfn))
		return NULL;

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
#endif

/*
 * Copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task are cleared in the whole range
 * covered by this vma.
 */

static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
	unsigned long vm_flags = dst_vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;
	swp_entry_t entry = pte_to_swp_entry(pte);

	if (likely(!non_swap_entry(entry))) {
		if (swap_duplicate(entry) < 0)
			return entry.val;

		/* make sure dst_mm is on swapoff's mmlist. */
		if (unlikely(list_empty(&dst_mm->mmlist))) {
			spin_lock(&mmlist_lock);
			if (list_empty(&dst_mm->mmlist))
				list_add(&dst_mm->mmlist,
						&src_mm->mmlist);
			spin_unlock(&mmlist_lock);
		}
		rss[MM_SWAPENTS]++;
	} else if (is_migration_entry(entry)) {
		page = migration_entry_to_page(entry);

		rss[mm_counter(page)]++;

		if (is_write_migration_entry(entry) &&
				is_cow_mapping(vm_flags)) {
			/*
			 * COW mappings require pages in both
			 * parent and child to be set to read.
			 */
			make_migration_entry_read(&entry);
			pte = swp_entry_to_pte(entry);
			if (pte_swp_soft_dirty(*src_pte))
				pte = pte_swp_mksoft_dirty(pte);
			if (pte_swp_uffd_wp(*src_pte))
				pte = pte_swp_mkuffd_wp(pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
	} else if (is_device_private_entry(entry)) {
		page = device_private_entry_to_page(entry);

		/*
		 * Update rss count even for unaddressable pages, as
		 * they should be treated just like normal pages in this
		 * respect.
		 *
		 * We will likely want to have some new rss counters
		 * for unaddressable pages, at some point. But for now
		 * keep things as they are.
		 */
		get_page(page);
		rss[mm_counter(page)]++;
		page_dup_rmap(page, false);

		/*
		 * We do not preserve soft-dirty information, because so
		 * far, checkpoint/restore is the only feature that
		 * requires that. And checkpoint/restore does not work
		 * when a device driver is involved (you cannot easily
		 * save and restore device driver state).
		 */
		if (is_write_device_private_entry(entry) &&
		    is_cow_mapping(vm_flags)) {
			make_device_private_entry_read(&entry);
			pte = swp_entry_to_pte(entry);
			if (pte_swp_uffd_wp(*src_pte))
				pte = pte_swp_mkuffd_wp(pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
	}
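	/*
	 * Don't let the child inherit the uffd-wp marker unless the
	 * destination vma is registered for userfaultfd write-protect.
	 */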
	if (!userfaultfd_wp(dst_vma))
		pte = pte_swp_clear_uffd_wp(pte);
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}

/*
 * Copy a present and normal page if necessary.
 *
 * NOTE! The usual case is that this doesn't need to do
 * anything, and can just return a positive value. That
 * will let the caller know that it can just increase
 * the page refcount and re-use the pte the traditional
 * way.
 *
 * But _if_ we need to copy it because it needs to be
 * pinned in the parent (and the child should get its own
 * copy rather than just a reference to the same page),
 * we'll do that here and return zero to let the caller
 * know we're done.
 *
 * And if we need a pre-allocated page but don't yet have
 * one, return a negative error to let the preallocation
 * code know so that it can do so outside the page table
 * lock.
 */
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
		  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
		  struct page **prealloc, pte_t pte, struct page *page)
{
	struct mm_struct *src_mm = src_vma->vm_mm;
	struct page *new_page;

	if (!is_cow_mapping(src_vma->vm_flags))
		return 1;

	/*
	 * What we want to do is to check whether this page may
	 * have been pinned by the parent process.  If so,
	 * instead of write-protecting the pte on both sides, we
	 * copy the page immediately so that we'll always guarantee
	 * the pinned page won't be randomly replaced in the
	 * future.
	 *
	 * The page pinning checks are just "has this mm ever
	 * seen pinning", along with the (inexact) check of
	 * the page count. That might give false positives for
	 * pinning, but it will work correctly.
	 */
	if (likely(!atomic_read(&src_mm->has_pinned)))
		return 1;
	if (likely(!page_maybe_dma_pinned(page)))
		return 1;

	/*
	 * The vma->anon_vma of the child process may be NULL
	 * because the entire vma does not contain anonymous pages.
	 * A BUG would occur if copy_present_page() passed a copy of a
	 * non-anonymous page of that vma to page_add_new_anon_rmap()
	 * to set up a new anonymous rmap.
	 * Return 1 if the page is not an anonymous page.
	 */
	if (!PageAnon(page))
		return 1;

	new_page = *prealloc;
	if (!new_page)
		return -EAGAIN;

	/*
	 * We have a prealloc page, all good!  Take it
	 * over and copy the page & arm it.
	 */
	*prealloc = NULL;
	copy_user_highpage(new_page, page, addr, src_vma);
	__SetPageUptodate(new_page);
	page_add_new_anon_rmap(new_page, dst_vma, addr, false);
	lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
	rss[mm_counter(new_page)]++;

	/* All done, just insert the new page copy in the child */
	pte = mk_pte(new_page, dst_vma->vm_page_prot);
	pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
	if (userfaultfd_pte_wp(dst_vma, *src_pte))
		/* Uffd-wp needs to be delivered to dest pte as well */
		pte = pte_wrprotect(pte_mkuffd_wp(pte));
	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
	return 0;
}

/*
 * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
 * is required to copy this pte.
 */
static inline int
copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
		 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
		 struct page **prealloc)
{
	struct mm_struct *src_mm = src_vma->vm_mm;
	unsigned long vm_flags = src_vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	page = vm_normal_page(src_vma, addr, pte);
	if (page) {
		int retval;

		retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
					   addr, rss, prealloc, pte, page);
		if (retval <= 0)
			return retval;

		get_page(page);
		page_dup_rmap(page, false);
		rss[mm_counter(page)]++;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags) && pte_write(pte)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
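	/* Clear the accessed bit: the child starts with an old pte */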
	pte = pte_mkold(pte);

	if (!userfaultfd_wp(dst_vma))
		pte = pte_clear_uffd_wp(pte);

	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
	return 0;
}

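/*
 * Preallocate a page, charged to the source mm, for copy_present_page()
 * to consume outside the page table lock.
 */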
static inline struct page *
page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
		   unsigned long addr)
{
	struct page *new_page;

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
	if (!new_page)
		return NULL;

	if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
		put_page(new_page);
		return NULL;
	}
	cgroup_throttle_swaprate(new_page, GFP_KERNEL);

	return new_page;
}

static int
copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress, ret = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};
	struct page *prealloc = NULL;

again:
	progress = 0;
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte) {
		ret = -ENOMEM;
		goto out;
	}
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
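		/*
		 * Weight the work done towards the break check above:
		 * an empty pte counts 1, a copied pte counts 8.
		 */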
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		if (unlikely(!pte_present(*src_pte))) {
			entry.val = copy_nonpresent_pte(dst_mm, src_mm,
							dst_pte, src_pte,
							dst_vma, src_vma,
							addr, rss);
			if (entry.val)
				break;
			progress += 8;
			continue;
		}
		/* copy_present_pte() will clear `*prealloc' if consumed */
		ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
				       addr, rss, &prealloc);
		/*
		 * If we need a pre-allocated page for this pte, drop the
		 * locks, allocate, and try again.
		 */
		if (unlikely(ret == -EAGAIN))
			break;
		if (unlikely(prealloc)) {
			/*
			 * The preallocated page cannot be reused for the next
			 * address, so as to strictly follow mempolicy (e.g.,
			 * alloc_page_vma() allocates according to the address).
			 * This can only happen if a pinned pte changed.
			 */
			put_page(prealloc);
			prealloc = NULL;
		}
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

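	/*
	 * A non-zero entry.val means swap_duplicate() needs a swap count
	 * continuation: allocate one and then retry the copy.
	 */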
10208c2ecf20Sopenharmony_ci	if (entry.val) {
10218c2ecf20Sopenharmony_ci		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
10228c2ecf20Sopenharmony_ci			ret = -ENOMEM;
10238c2ecf20Sopenharmony_ci			goto out;
10248c2ecf20Sopenharmony_ci		}
10258c2ecf20Sopenharmony_ci		entry.val = 0;
10268c2ecf20Sopenharmony_ci	} else if (ret) {
10278c2ecf20Sopenharmony_ci		WARN_ON_ONCE(ret != -EAGAIN);
10288c2ecf20Sopenharmony_ci		prealloc = page_copy_prealloc(src_mm, src_vma, addr);
10298c2ecf20Sopenharmony_ci		if (!prealloc)
10308c2ecf20Sopenharmony_ci			return -ENOMEM;
10318c2ecf20Sopenharmony_ci		/* We've captured and resolved the error. Reset, try again. */
10328c2ecf20Sopenharmony_ci		ret = 0;
10338c2ecf20Sopenharmony_ci	}
10348c2ecf20Sopenharmony_ci	if (addr != end)
10358c2ecf20Sopenharmony_ci		goto again;
10368c2ecf20Sopenharmony_ciout:
10378c2ecf20Sopenharmony_ci	if (unlikely(prealloc))
10388c2ecf20Sopenharmony_ci		put_page(prealloc);
10398c2ecf20Sopenharmony_ci	return ret;
10408c2ecf20Sopenharmony_ci}
10418c2ecf20Sopenharmony_ci

static inline int
copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
			|| pmd_devmap(*src_pmd)) {
			int err;

			VM_BUG_ON_VMA(next - addr != HPAGE_PMD_SIZE, src_vma);
			err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
					    addr, dst_vma, src_vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
				   addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int
copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
			int err;

			VM_BUG_ON_VMA(next - addr != HPAGE_PUD_SIZE, src_vma);
			err = copy_huge_pud(dst_mm, src_mm,
					    dst_pud, src_pud, addr, src_vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
				   addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

static inline int
copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	p4d_t *src_p4d, *dst_p4d;
	unsigned long next;

	dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
	if (!dst_p4d)
		return -ENOMEM;
	src_p4d = p4d_offset(src_pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(src_p4d))
			continue;
		if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
				   addr, next))
			return -ENOMEM;
	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
	return 0;
}

int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = src_vma->vm_start;
	unsigned long end = src_vma->vm_end;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	struct mmu_notifier_range range;
	bool is_cow;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings.  The tradeoff is that, for the mappings we
	 * do copy, copy_page_range() is more efficient than faulting the
	 * pages in one by one later.
	 */
	if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
	    !src_vma->anon_vma)
		return 0;

	if (is_vm_hugetlb_page(src_vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);

	if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free the tracked range on the error paths below,
		 * as remove_vma() gets called on error from the higher-level
		 * routine.
		 */
		ret = track_pfn_copy(src_vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(src_vma->vm_flags);

	if (is_cow) {
		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
					0, src_vma, src_mm, addr, end);
		mmu_notifier_invalidate_range_start(&range);
		/*
		 * Disabling preemption is not needed for the write side, as
		 * the read side doesn't spin, but goes to the mmap_lock.
		 *
		 * Use the raw variant of the seqcount_t write API to avoid
		 * lockdep complaining about preemptibility.
		 */
		mmap_assert_write_locked(src_mm);
		raw_write_seqcount_begin(&src_mm->write_protect_seq);
	}

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
					    addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow) {
		raw_write_seqcount_end(&src_mm->write_protect_seq);
		mmu_notifier_invalidate_range_end(&range);
	}
	return ret;
}
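
/*
 * Illustrative sketch (not part of this file): the classic caller is the
 * fork path, where dup_mmap() walks the parent's VMA list with mmap_lock
 * held for write and, for each vma it has mirrored into the child, does
 * roughly:
 *
 *	if (!(tmp->vm_flags & VM_WIPEONFORK))
 *		retval = copy_page_range(tmp, mpnt);
 *
 * with "tmp" the child's new vma and "mpnt" the parent's.  This is a
 * paraphrase of kernel/fork.c for orientation, not a verbatim quote.
 */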

/* Whether we should zap all COWed (private) pages too */
static inline bool should_zap_cows(struct zap_details *details)
{
	/* By default, zap all pages */
	if (!details)
		return true;

	/* Or, we zap COWed pages only if the caller wants to */
	return !details->check_mapping;
}

static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	swp_entry_t entry;

	tlb_change_page_size(tlb, PAGE_SIZE);
again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent))
			continue;

		if (need_resched())
			break;

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (vma->vm_flags & VM_USEREXPTE)
				page = NULL;
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page_rmapping(page))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;
			if (vma->vm_flags & VM_PURGEABLE)
				uxpte_clear_present(vma, addr);
			if (!PageAnon(page)) {
				if (pte_dirty(ptent)) {
					force_flush = 1;
					set_page_dirty(page);
				}
				if (pte_young(ptent) &&
				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
			}
			rss[mm_counter(page)]--;
			page_remove_rmap(page, false);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			if (unlikely(__tlb_remove_page(tlb, page))) {
				force_flush = 1;
				addr += PAGE_SIZE;
				break;
			}
			continue;
		}

		entry = pte_to_swp_entry(ptent);
		if (is_device_private_entry(entry)) {
			struct page *page = device_private_entry_to_page(entry);

			if (unlikely(details && details->check_mapping)) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping !=
				    page_rmapping(page))
					continue;
			}

			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			rss[mm_counter(page)]--;
			page_remove_rmap(page, false);
			put_page(page);
			continue;
		}

		if (!non_swap_entry(entry)) {
			/* Genuine swap entry, hence a private anon page */
			if (!should_zap_cows(details))
				continue;
			rss[MM_SWAPENTS]--;
		} else if (is_migration_entry(entry)) {
			struct page *page;

			page = migration_entry_to_page(entry);
			if (details && details->check_mapping &&
			    details->check_mapping != page_rmapping(page))
				continue;
			rss[mm_counter(page)]--;
		}
		if (unlikely(!free_swap_and_cache(entry)))
			print_bad_pte(vma, addr, ptent, NULL);
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();

	/* Do the actual TLB flush before dropping ptl */
	if (force_flush)
		tlb_flush_mmu_tlbonly(tlb);
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Restart if we didn't do everything.
	 */
	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu(tlb);
	}

	if (addr != end) {
		cond_resched();
		goto again;
	}

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE)
				__split_huge_pmd(vma, pmd, addr, false, NULL);
			else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
		} else if (details && details->single_page &&
			   PageTransCompound(details->single_page) &&
			   next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
			spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
			/*
			 * Take and drop THP pmd lock so that we cannot return
			 * prematurely, while zap_huge_pmd() has cleared *pmd,
			 * but not yet decremented compound_mapcount().
			 */
			spin_unlock(ptl);
		}

		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is
		 * none or trans huge it can change under us. This is
		 * because MADV_DONTNEED holds the mmap_lock in read
		 * mode.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
			if (next - addr != HPAGE_PUD_SIZE) {
				mmap_assert_locked(tlb->mm);
				split_huge_pud(vma, pud, addr);
			} else if (zap_huge_pud(tlb, vma, pud, addr))
				goto next;
			/* fall through */
		}
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
		cond_resched();
	} while (pud++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		next = zap_pud_range(tlb, vma, p4d, addr, next, details);
	} while (p4d++, addr = next, addr != end);

	return addr;
}

void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
	tlb_end_vma(tlb, vma);
}

static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long start = max(vma->vm_start, start_addr);
	unsigned long end;

	if (start >= vma->vm_end)
		return;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return;

	if (vma->vm_file)
		uprobe_munmap(vma, start, end);

	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn(vma, 0, 0);

	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
			/*
			 * It is undesirable to test vma->vm_file as it
			 * should be non-NULL for a valid hugetlb area.
			 * However, vm_file will be NULL in the error
			 * cleanup path of mmap_region: when the
			 * hugetlbfs ->mmap method fails, mmap_region()
			 * nullifies vma->vm_file before calling this
			 * function to clean up.  Since no pte has
			 * actually been set up, it is safe to do
			 * nothing in this case.
			 */
			if (vma->vm_file) {
				i_mmap_lock_write(vma->vm_file->f_mapping);
				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
				i_mmap_unlock_write(vma->vm_file->f_mapping);
			}
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vmas
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between @start_addr and @end_addr will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr)
{
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
				start_addr, end_addr);
	mmu_notifier_invalidate_range_start(&range);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
	mmu_notifier_invalidate_range_end(&range);
}
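
/*
 * Illustrative sketch: process exit is the typical caller.  exit_mmap()
 * tears down the whole address space along these lines (paraphrased,
 * not a verbatim quote):
 *
 *	struct mmu_gather tlb;
 *
 *	tlb_gather_mmu(&tlb, mm, 0, -1);
 *	unmap_vmas(&tlb, vma, 0, -1);
 *	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
 *	tlb_finish_mmu(&tlb, 0, -1);
 *
 * The whole-range flush that the comment above demands is what the
 * final tlb_finish_mmu() provides.
 */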

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * The caller must protect the VMA list, typically by holding mmap_lock.
 */
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
		unsigned long size)
{
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	lru_add_drain();
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				start, start + size);
	tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
	update_hiwater_rss(vma->vm_mm);
	mmu_notifier_invalidate_range_start(&range);
	for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
		unmap_single_vma(&tlb, vma, start, range.end, NULL);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, start, range.end);
}
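
/*
 * Illustrative sketch: an MADV_DONTNEED-style caller discarding one
 * whole vma.  The caller must hold mmap_lock (read mode suffices for
 * madvise) so the vma cannot disappear underneath us.  The helper name
 * is hypothetical; only zap_page_range() is a real interface here.
 */
static inline void example_discard_vma(struct vm_area_struct *vma)
{
	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}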

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	lru_add_drain();
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				address, address + size);
	tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
	update_hiwater_rss(vma->vm_mm);
	mmu_notifier_invalidate_range_start(&range);
	unmap_single_vma(&tlb, vma, address, range.end, details);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, address, range.end);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 */
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (address < vma->vm_start || address + size > vma->vm_end ||
	    !(vma->vm_flags & VM_PFNMAP))
		return;

	zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
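
/*
 * Illustrative sketch: a driver that built a VM_PFNMAP mapping in its
 * mmap handler can revoke the ptes later, e.g. when the underlying
 * device memory goes away, and let subsequent faults repopulate (or
 * SIGBUS) as its vm_ops dictate.  The helper name is hypothetical.
 */
static inline void example_revoke_device_mapping(struct vm_area_struct *vma)
{
	/* The range must lie entirely inside this VM_PFNMAP vma. */
	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}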

static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return NULL;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));
	return pmd;
}

pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
			spinlock_t **ptl)
{
	pmd_t *pmd = walk_to_pmd(mm, addr);

	if (!pmd)
		return NULL;
	return pte_alloc_map_lock(mm, pmd, addr, ptl);
}

static int validate_page_before_insert(struct page *page)
{
	if (PageAnon(page) || PageSlab(page) || page_has_type(page))
		return -EINVAL;
	flush_dcache_page(page);
	return 0;
}

static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
			unsigned long addr, struct page *page, pgprot_t prot)
{
	if (!pte_none(*pte))
		return -EBUSY;
	/* Ok, finally just insert the thing.. */
	get_page(page);
	inc_mm_counter_fast(mm, mm_counter_file(page));
	page_add_file_rmap(page, false);
	set_pte_at(mm, addr, pte, mk_pte(page, prot));
	return 0;
}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte;
	spinlock_t *ptl;

	retval = validate_page_before_insert(page);
	if (retval)
		goto out;
	retval = -ENOMEM;
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

#ifdef pte_index
static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
			unsigned long addr, struct page *page, pgprot_t prot)
{
	int err;

	if (!page_count(page))
		return -EINVAL;
	err = validate_page_before_insert(page);
	if (err)
		return err;
	return insert_page_into_pte_locked(mm, pte, addr, page, prot);
}

/* insert_pages() amortizes the cost of spinlock operations
 * when inserting pages in a loop. Arch *must* define pte_index.
 */
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
			struct page **pages, unsigned long *num, pgprot_t prot)
{
	pmd_t *pmd = NULL;
	pte_t *start_pte, *pte;
	spinlock_t *pte_lock;
	struct mm_struct *const mm = vma->vm_mm;
	unsigned long curr_page_idx = 0;
	unsigned long remaining_pages_total = *num;
	unsigned long pages_to_write_in_pmd;
	int ret;
more:
	ret = -EFAULT;
	pmd = walk_to_pmd(mm, addr);
	if (!pmd)
		goto out;

	pages_to_write_in_pmd = min_t(unsigned long,
		remaining_pages_total, PTRS_PER_PTE - pte_index(addr));

	/* Allocate the PTE if necessary; takes PMD lock once only. */
	ret = -ENOMEM;
	if (pte_alloc(mm, pmd))
		goto out;

	while (pages_to_write_in_pmd) {
		int pte_idx = 0;
		const int batch_size = min_t(int, pages_to_write_in_pmd, 8);

		start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
		for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
			int err = insert_page_in_batch_locked(mm, pte,
				addr, pages[curr_page_idx], prot);
			if (unlikely(err)) {
				pte_unmap_unlock(start_pte, pte_lock);
				ret = err;
				remaining_pages_total -= pte_idx;
				goto out;
			}
			addr += PAGE_SIZE;
			++curr_page_idx;
		}
		pte_unmap_unlock(start_pte, pte_lock);
		pages_to_write_in_pmd -= batch_size;
		remaining_pages_total -= batch_size;
	}
	if (remaining_pages_total)
		goto more;
	ret = 0;
out:
	*num = remaining_pages_total;
	return ret;
}
#endif  /* ifdef pte_index */

/**
 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
 * @vma: user vma to map to
 * @addr: target start user address of these pages
 * @pages: source kernel pages
 * @num: in: number of pages to map. out: number of pages that were *not*
 * mapped. (0 means all pages were successfully mapped).
 *
 * Preferred over vm_insert_page() when inserting multiple pages.
 *
 * In case of error, we may have mapped a subset of the provided
 * pages. It is the caller's responsibility to account for this case.
 *
 * The same restrictions apply as in vm_insert_page().
 */
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
			struct page **pages, unsigned long *num)
{
#ifdef pte_index
	const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;

	if (addr < vma->vm_start || end_addr >= vma->vm_end)
		return -EFAULT;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(mmap_read_trylock(vma->vm_mm));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	/* Defer page refcount checking till we're about to map that page. */
	return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
#else
	unsigned long idx = 0, pgcount = *num;
	int err = -EINVAL;

	for (; idx < pgcount; ++idx) {
		err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
		if (err)
			break;
	}
	*num = pgcount - idx;
	return err;
#endif  /* ifdef pte_index */
}
EXPORT_SYMBOL(vm_insert_pages);
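
/*
 * Illustrative sketch: mapping a driver-owned page array in one call so
 * the pte lock is taken once per PTE batch rather than once per page.
 * "pages"/"npages" stand in for hypothetical driver state; only
 * vm_insert_pages() is a real interface here.
 */
static inline int example_mmap_batch(struct vm_area_struct *vma,
				     struct page **pages, unsigned long npages)
{
	unsigned long num = npages;
	int err = vm_insert_pages(vma, vma->vm_start, pages, &num);

	/* On return, "num" holds the number of pages that were NOT mapped. */
	if (err)
		pr_warn("example: mapped %lu of %lu pages, err %d\n",
			npages - num, npages, err);
	return err;
}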

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from the f_op->mmap() handler
 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
 * The caller must set VM_MIXEDMAP on the vma if it wants to call this
 * function from other places, for example from a page-fault handler.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(mmap_read_trylock(vma->vm_mm));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);
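
/*
 * Illustrative sketch: a minimal ->mmap handler backing the first page
 * of the vma with one zeroed kernel page.  vm_insert_page() takes its
 * own reference, so the driver keeps the allocation reference and must
 * free the page at teardown (not shown).  All names except
 * vm_insert_page() and alloc_page() are hypothetical.
 */
static inline int example_mmap_one_page(struct file *file,
					struct vm_area_struct *vma)
{
	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	int err;

	if (!page)
		return -ENOMEM;
	err = vm_insert_page(vma, vma->vm_start, page);
	if (err)
		put_page(page);	/* drop the allocation reference */
	return err;
}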

/*
 * __vm_map_pages - maps a range of kernel pages into a user vma
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 * @offset: user's requested vm_pgoff
 *
 * This allows drivers to map a range of kernel pages into a user vma.
 *
 * Return: 0 on success and error code otherwise.
 */
static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num, unsigned long offset)
{
	unsigned long count = vma_pages(vma);
	unsigned long uaddr = vma->vm_start;
	int ret, i;

	/* Fail if the user-requested offset is beyond the end of the object */
	if (offset >= num)
		return -ENXIO;

	/* Fail if the user-requested size exceeds the available object size */
	if (count > num - offset)
		return -ENXIO;

	for (i = 0; i < count; i++) {
		ret = vm_insert_page(vma, uaddr, pages[offset + i]);
		if (ret < 0)
			return ret;
		uaddr += PAGE_SIZE;
	}

	return 0;
}

/**
 * vm_map_pages - map a range of kernel pages starting at a non-zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object consisting of @num pages, catering for the user's
 * requested vm_pgoff.
 *
 * If we fail to insert any page into the vma, the function will return
 * immediately leaving any previously inserted pages present.  Callers
 * from the mmap handler may immediately return the error as their caller
 * will destroy the vma, removing any successfully inserted pages. Other
 * callers should make their own arrangements for calling unmap_region().
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
}
EXPORT_SYMBOL(vm_map_pages);
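
/*
 * Illustrative sketch: an mmap handler for a buffer made up of an array
 * of kernel pages.  Because vm_map_pages() honours the vm_pgoff the
 * user passed to mmap(), mapping a tail portion of the buffer works
 * without extra arithmetic in the driver.  "struct example_buf" is
 * hypothetical.
 */
struct example_buf {
	struct page **pages;
	unsigned long count;	/* number of entries in @pages */
};

static inline int example_buf_mmap(struct example_buf *buf,
				   struct vm_area_struct *vma)
{
	return vm_map_pages(vma, buf->pages, buf->count);
}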

/**
 * vm_map_pages_zero - map a range of kernel pages starting at offset zero
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that it explicitly sets the offset
 * to 0. This function is intended for drivers that do not consider
 * vm_pgoff.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	return __vm_map_pages(vma, pages, num, 0);
}
EXPORT_SYMBOL(vm_map_pages_zero);

static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn, pgprot_t prot, bool mkwrite)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte, entry;
	spinlock_t *ptl;

	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		return VM_FAULT_OOM;
	if (!pte_none(*pte)) {
		if (mkwrite) {
			/*
			 * For read faults on private mappings the PFN passed
			 * in may not match the PFN we have mapped if the
			 * mapped PFN is a writeable COW page.  In the mkwrite
			 * case we are creating a writable PTE for a shared
			 * mapping and we expect the PFNs to match. If they
			 * don't match, we are likely racing with block
			 * allocation and mapping invalidation so just skip the
			 * update.
			 */
			if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
				goto out_unlock;
			}
			entry = pte_mkyoung(*pte);
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
			if (ptep_set_access_flags(vma, addr, pte, entry, 1))
				update_mmu_cache(vma, addr, pte);
		}
		goto out_unlock;
	}

	/* Ok, finally just insert the thing.. */
	if (pfn_t_devmap(pfn))
		entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
	else
		entry = pte_mkspecial(pfn_t_pte(pfn, prot));

	if (mkwrite) {
		entry = pte_mkyoung(entry);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	}

	set_pte_at(mm, addr, pte, entry);
	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */

out_unlock:
	pte_unmap_unlock(pte, ptl);
	return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_pfn(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * This only makes sense for IO mappings, and it makes no sense for
 * COW mappings.  In general, using multiple VMAs is preferable;
 * vmf_insert_pfn_prot() should only be used if using multiple VMAs is
 * impractical.
 *
 * See vmf_insert_mixed_prot() for a discussion of the implication of using
 * a value of @pgprot different from that of @vma->vm_page_prot.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t pgprot)
{
	/*
	 * Technically, architectures with pte_special can avoid all these
	 * restrictions (same for remap_pfn_range).  However we would like
	 * consistency in testing and feature parity among all, so we should
	 * try to keep these invariants in place for everybody.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (!pfn_modify_allowed(pfn, pgprot))
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));

	return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
			false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);
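
/*
 * Illustrative sketch: a frame-buffer-style fault handler that maps its
 * pfns write-combined while leaving vma->vm_page_prot alone for other
 * users of the vma.  pgprot_writecombine() and vmf_insert_pfn_prot()
 * are real interfaces; the surrounding function is hypothetical.
 */
static inline vm_fault_t example_fb_fault_wc(struct vm_fault *vmf,
					     unsigned long pfn)
{
	return vmf_insert_pfn_prot(vmf->vma, vmf->address, pfn,
			pgprot_writecombine(vmf->vma->vm_page_prot));
}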

/**
 * vmf_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return the result of this function.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vmf_insert_pfn);
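
/*
 * Example (illustrative sketch only, not built as part of this file):
 * a minimal driver fault handler mapping device memory with
 * vmf_insert_pfn().  The mydev structure and its base_pfn field are
 * hypothetical.
 *
 *	static vm_fault_t mydev_fault(struct vm_fault *vmf)
 *	{
 *		struct mydev *dev = vmf->vma->vm_private_data;
 *		unsigned long pfn = dev->base_pfn + vmf->pgoff;
 *
 *		return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
 *	}
 *
 * The vma is expected to have been set up with VM_PFNMAP (typically in
 * the driver's mmap() method) so that the invariants checked above hold.
 */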

static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
	/* these checks mirror the abort conditions in vm_normal_page */
	if (vma->vm_flags & VM_MIXEDMAP)
		return true;
	if (pfn_t_devmap(pfn))
		return true;
	if (pfn_t_special(pfn))
		return true;
	if (is_zero_pfn(pfn_t_to_pfn(pfn)))
		return true;
	return false;
}

static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
		unsigned long addr, pfn_t pfn, pgprot_t pgprot,
		bool mkwrite)
{
	int err;

	BUG_ON(!vm_mixed_ok(vma, pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, pfn);

	if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
		return VM_FAULT_SIGBUS;

	/*
	 * If we don't have pte special, then we have to use the pfn_valid()
	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
	 * refcount the page if pfn_valid is true (hence insert_page rather
	 * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
	 * without pte special, it would then be refcounted as a normal page.
	 */
	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
	    !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
		struct page *page;

		/*
		 * At this point we are committed to insert_page()
		 * regardless of whether the caller specified flags that
		 * result in pfn_t_has_page() == false.
		 */
		page = pfn_to_page(pfn_t_to_pfn(pfn));
		err = insert_page(vma, addr, page, pgprot);
	} else {
		return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
	}

	if (err == -ENOMEM)
		return VM_FAULT_OOM;
	if (err < 0 && err != -EBUSY)
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_mixed(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * Typically this function should be used by drivers to set caching- and
 * encryption bits different from those of @vma->vm_page_prot, because
 * the caching- or encryption mode may not be known at mmap() time.
 * This is ok as long as @vma->vm_page_prot is not used by the core vm
 * to set caching and encryption bits for those vmas (except for COW pages).
 * This is ensured by core vm only modifying these page table entries using
 * functions that don't touch caching- or encryption bits, using pte_modify()
 * if needed. (See for example mprotect()).
 * Also when new page-table entries are created, this is only done using the
 * fault() callback, and never using the value of vma->vm_page_prot,
 * except for page-table entries that point to anonymous pages as the result
 * of COW.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
				 pfn_t pfn, pgprot_t pgprot)
{
	return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed_prot);

vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
		pfn_t pfn)
{
	return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed);
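
/*
 * Example (illustrative sketch only): a fault handler inserting a pfn
 * with a per-page write-combining pgprot, as a graphics driver might
 * when the caching mode is not known at mmap() time.  The pfn variable
 * is assumed to have been looked up by the driver beforehand.
 *
 *	pgprot_t prot = pgprot_writecombine(vmf->vma->vm_page_prot);
 *
 *	return vmf_insert_mixed_prot(vmf->vma, vmf->address,
 *				     __pfn_to_pfn_t(pfn, PFN_DEV), prot);
 */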

/*
 *  If the insertion of the PTE failed because someone else already added a
 *  different entry in the meantime, we treat that as success as we assume
 *  the same entry was actually inserted.
 */
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
		unsigned long addr, pfn_t pfn)
{
	return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);

/*
 * Maps a range of physical memory into the requested pages. The old
 * mappings are removed. Any references to nonexistent pages result
 * in null mappings (currently treated as "copy-on-access").
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pte_t *pte, *mapped_pte;
	spinlock_t *ptl;
	int err = 0;

	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	arch_enter_lazy_mmu_mode();
	do {
		BUG_ON(!pte_none(*pte));
		if (!pfn_modify_allowed(pfn, prot)) {
			err = -EACCES;
			break;
		}
		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(mapped_pte, ptl);
	return err;
}
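
/*
 * Note on the pfn arithmetic in the table walkers below: each level
 * first subtracts (addr >> PAGE_SHIFT) from @pfn, then adds the current
 * (addr >> PAGE_SHIFT) back when descending.  This keeps @pfn biased so
 * that "pfn + (addr >> PAGE_SHIFT)" always yields the page frame that
 * maps the current address, however the range gets split at each level.
 */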

static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pmd_t *pmd;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	do {
		next = pmd_addr_end(addr, end);
		err = remap_pte_range(mm, pmd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pud_t *pud;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		err = remap_pmd_range(mm, pud, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	p4d_t *p4d;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		err = remap_pud_range(mm, p4d, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page-aligned user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of mapping area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		    unsigned long pfn, unsigned long size, pgprot_t prot)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + PAGE_ALIGN(size);
	struct mm_struct *mm = vma->vm_mm;
	unsigned long remap_pfn = pfn;
	int err;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
		return -EINVAL;

	/*
	 * Physically remapped pages are special. Tell the
	 * rest of the world about it:
	 *   VM_IO tells people not to look at these pages
	 *	(accesses can have side effects).
	 *   VM_PFNMAP tells the core MM that the base pages are just
	 *	raw PFN mappings, and do not have a "struct page" associated
	 *	with them.
	 *   VM_DONTEXPAND
	 *      Disable vma merging and expanding with mremap().
	 *   VM_DONTDUMP
	 *      Omit vma from core dump, even when VM_IO is turned off.
	 *
	 * There's a horrible special case to handle copy-on-write
	 * behaviour that some programs depend on. We mark the "original"
	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
	 * See vm_normal_page() for details.
	 */
	if (is_cow_mapping(vma->vm_flags)) {
		if (addr != vma->vm_start || end != vma->vm_end)
			return -EINVAL;
		vma->vm_pgoff = pfn;
	}

	err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
	if (err)
		return -EINVAL;

	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;

	BUG_ON(addr >= end);
	pfn -= addr >> PAGE_SHIFT;
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		err = remap_p4d_range(mm, pgd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (err)
		untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));

	return err;
}
EXPORT_SYMBOL(remap_pfn_range);
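
/*
 * Example (illustrative sketch only): a typical driver mmap() method
 * built on remap_pfn_range().  The mydev structure and its phys_base
 * field are hypothetical.
 *
 *	static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct mydev *dev = file->private_data;
 *		unsigned long pfn = (dev->phys_base >> PAGE_SHIFT) +
 *					vma->vm_pgoff;
 *
 *		return remap_pfn_range(vma, vma->vm_start, pfn,
 *				       vma->vm_end - vma->vm_start,
 *				       vma->vm_page_prot);
 *	}
 */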

/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining or similar behaviour they need.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
	unsigned long vm_len, pfn, pages;

	/* Check that the physical memory area passed in looks valid */
	if (start + len < start)
		return -EINVAL;
	/*
	 * You *really* shouldn't map things that aren't page-aligned,
	 * but we've historically allowed it because IO memory might
	 * just have smaller alignment.
	 */
	len += start & ~PAGE_MASK;
	pfn = start >> PAGE_SHIFT;
	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
	if (pfn + pages < pfn)
		return -EINVAL;

	/* We start the mapping 'vm_pgoff' pages into the area */
	if (vma->vm_pgoff > pages)
		return -EINVAL;
	pfn += vma->vm_pgoff;
	pages -= vma->vm_pgoff;

	/* Can we fit all of the mapping? */
	vm_len = vma->vm_end - vma->vm_start;
	if (vm_len >> PAGE_SHIFT > pages)
		return -EINVAL;

	/* Ok, let it rip */
	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);
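
/*
 * Example (illustrative sketch only): the same hypothetical driver's
 * mmap() expressed with vm_iomap_memory(), which derives the pfn and
 * the length checks from the vma itself.
 *
 *	static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct mydev *dev = file->private_data;
 *
 *		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 *		return vm_iomap_memory(vma, dev->phys_base, dev->mmio_len);
 *	}
 */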

static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	pte_t *pte;
	int err = 0;
	spinlock_t *ptl;

	if (create) {
		pte = (mm == &init_mm) ?
			pte_alloc_kernel_track(pmd, addr, mask) :
			pte_alloc_map_lock(mm, pmd, addr, &ptl);
		if (!pte)
			return -ENOMEM;
	} else {
		pte = (mm == &init_mm) ?
			pte_offset_kernel(pmd, addr) :
			pte_offset_map_lock(mm, pmd, addr, &ptl);
	}

	BUG_ON(pmd_huge(*pmd));

	arch_enter_lazy_mmu_mode();

	if (fn) {
		do {
			if (create || !pte_none(*pte)) {
				err = fn(pte++, addr, data);
				if (err)
					break;
			}
		} while (addr += PAGE_SIZE, addr != end);
	}
	*mask |= PGTBL_PTE_MODIFIED;

	arch_leave_lazy_mmu_mode();

	if (mm != &init_mm)
		pte_unmap_unlock(pte-1, ptl);
	return err;
}

static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;
	int err = 0;

	BUG_ON(pud_huge(*pud));

	if (create) {
		pmd = pmd_alloc_track(mm, pud, addr, mask);
		if (!pmd)
			return -ENOMEM;
	} else {
		pmd = pmd_offset(pud, addr);
	}
	do {
		next = pmd_addr_end(addr, end);
		if (create || !pmd_none_or_clear_bad(pmd)) {
			err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
						 create, mask);
			if (err)
				break;
		}
	} while (pmd++, addr = next, addr != end);
	return err;
}

static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;
	int err = 0;

	if (create) {
		pud = pud_alloc_track(mm, p4d, addr, mask);
		if (!pud)
			return -ENOMEM;
	} else {
		pud = pud_offset(p4d, addr);
	}
	do {
		next = pud_addr_end(addr, end);
		if (create || !pud_none_or_clear_bad(pud)) {
			err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
						 create, mask);
			if (err)
				break;
		}
	} while (pud++, addr = next, addr != end);
	return err;
}

static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;
	int err = 0;

	if (create) {
		p4d = p4d_alloc_track(mm, pgd, addr, mask);
		if (!p4d)
			return -ENOMEM;
	} else {
		p4d = p4d_offset(pgd, addr);
	}
	do {
		next = p4d_addr_end(addr, end);
		if (create || !p4d_none_or_clear_bad(p4d)) {
			err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
						 create, mask);
			if (err)
				break;
		}
	} while (p4d++, addr = next, addr != end);
	return err;
}

static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
				 unsigned long size, pte_fn_t fn,
				 void *data, bool create)
{
	pgd_t *pgd;
	unsigned long start = addr, next;
	unsigned long end = addr + size;
	pgtbl_mod_mask mask = 0;
	int err = 0;

	if (WARN_ON(addr >= end))
		return -EINVAL;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (!create && pgd_none_or_clear_bad(pgd))
			continue;
		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, start + size);

	return err;
}

/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
			unsigned long size, pte_fn_t fn, void *data)
{
	return __apply_to_page_range(mm, addr, size, fn, data, true);
}
EXPORT_SYMBOL_GPL(apply_to_page_range);

/*
 * Scan a region of virtual memory, calling a provided function on
 * each leaf page table where it exists.
 *
 * Unlike apply_to_page_range, this does _not_ fill in page tables
 * where they are absent.
 */
int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
				 unsigned long size, pte_fn_t fn, void *data)
{
	return __apply_to_page_range(mm, addr, size, fn, data, false);
}
EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
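
/*
 * Example (illustrative sketch only): a pte_fn_t callback used with
 * apply_to_page_range() to mark every pte in a kernel range dirty.
 * The mark_pte_dirty name is hypothetical.
 *
 *	static int mark_pte_dirty(pte_t *pte, unsigned long addr, void *data)
 *	{
 *		set_pte(pte, pte_mkdirty(*pte));
 *		return 0;
 *	}
 *
 *	err = apply_to_page_range(&init_mm, start, size, mark_pte_dirty, NULL);
 */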

/*
 * handle_pte_fault chooses page fault handler according to an entry which was
 * read non-atomically.  Before making any commitment, on those architectures
 * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
 * parts, do_swap_page must check under lock before unmapping the pte and
 * proceeding (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page can safely check later on).
 */
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
				pte_t *page_table, pte_t orig_pte)
{
	int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
	if (sizeof(pte_t) > sizeof(unsigned long)) {
		spinlock_t *ptl = pte_lockptr(mm, pmd);
		spin_lock(ptl);
		same = pte_same(*page_table, orig_pte);
		spin_unlock(ptl);
	}
#endif
	pte_unmap(page_table);
	return same;
}

static inline bool cow_user_page(struct page *dst, struct page *src,
				 struct vm_fault *vmf)
{
	bool ret;
	void *kaddr;
	void __user *uaddr;
	bool locked = false;
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr = vmf->address;

	if (likely(src)) {
		copy_user_highpage(dst, src, addr, vma);
		return true;
	}

	/*
	 * If the source page was a PFN mapping, we don't have
	 * a "struct page" for it. We do a best-effort copy by
	 * just copying from the original user address. If that
	 * fails, we just zero-fill it. Live with it.
	 */
	kaddr = kmap_atomic(dst);
	uaddr = (void __user *)(addr & PAGE_MASK);

	/*
	 * On architectures with software "accessed" bits, we would
	 * take a double page fault, so mark it accessed here.
	 */
	if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
		pte_t entry;

		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
		locked = true;
		if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
			/*
			 * Another thread has already handled the fault;
			 * just update the local TLB.
			 */
			update_mmu_tlb(vma, addr, vmf->pte);
			ret = false;
			goto pte_unlock;
		}

		entry = pte_mkyoung(vmf->orig_pte);
		if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
			update_mmu_cache(vma, addr, vmf->pte);
	}

	/*
	 * This really shouldn't fail, because the page is there
	 * in the page tables. But it might just be unreadable,
	 * in which case we just give up and fill the result with
	 * zeroes.
	 */
	if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
		if (locked)
			goto warn;

		/* Re-validate under PTL if the page is still mapped */
		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
		locked = true;
		if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
			/* The PTE changed under us, update the local TLB */
			update_mmu_tlb(vma, addr, vmf->pte);
			ret = false;
			goto pte_unlock;
		}

		/*
		 * The same page may have been mapped back in since the last
		 * copy attempt.  Try to copy again under the PTL.
		 */
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
			/*
			 * Warn in case there is some obscure use-case
			 * that relies on this.
			 */
warn:
			WARN_ON_ONCE(1);
			clear_page(kaddr);
		}
	}

	ret = true;

pte_unlock:
	if (locked)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	kunmap_atomic(kaddr);
	flush_dcache_page(dst);

	return ret;
}

static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
	struct file *vm_file = vma->vm_file;

	if (vm_file)
		return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;

	/*
	 * Special mappings (e.g. VDSO) do not have any file so fake
	 * a default GFP_KERNEL for them.
	 */
	return GFP_KERNEL;
}

/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 */
static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
{
	vm_fault_t ret;
	struct page *page = vmf->page;
	unsigned int old_flags = vmf->flags;

	vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;

	if (vmf->vma->vm_file &&
	    IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
		return VM_FAULT_SIGBUS;

	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
	/* Restore original flags so that caller is not surprised */
	vmf->flags = old_flags;
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
		return ret;
	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
		lock_page(page);
		if (!page->mapping) {
			unlock_page(page);
			return 0; /* retry */
		}
		ret |= VM_FAULT_LOCKED;
	} else
		VM_BUG_ON_PAGE(!PageLocked(page), page);
	return ret;
}

/*
 * Handle dirtying of a page in a shared file mapping on a write fault.
 *
 * The function expects the page to be locked and unlocks it.
 */
static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct address_space *mapping;
	struct page *page = vmf->page;
	bool dirtied;
	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;

	dirtied = set_page_dirty(page);
	VM_BUG_ON_PAGE(PageAnon(page), page);
	/*
	 * Take a local copy of the address_space - page.mapping may be zeroed
	 * by truncate after unlock_page().  The address_space itself remains
	 * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
	 * release semantics to prevent the compiler from undoing this copying.
	 */
	mapping = page_rmapping(page);
	unlock_page(page);

	if (!page_mkwrite)
		file_update_time(vma->vm_file);

	/*
	 * Throttle page dirtying rate down to writeback speed.
	 *
	 * mapping may be NULL here because some device drivers do not
	 * set page.mapping but still dirty their pages.
	 *
	 * Drop the mmap_lock before waiting on IO, if we can. The file
	 * is pinning the mapping, as per above.
	 */
	if ((dirtied || page_mkwrite) && mapping) {
		struct file *fpin;

		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
		balance_dirty_pages_ratelimited(mapping);
		if (fpin) {
			fput(fpin);
			return VM_FAULT_RETRY;
		}
	}

	return 0;
}

/*
 * Handle write page faults for pages that can be reused in the current vma.
 *
 * This can happen either because the mapping was set up with the VM_SHARED
 * flag, or because we hold the last remaining reference to the page. In
 * either case, all we need to do here is mark the page as writable and
 * update any related book-keeping.
 */
static inline void wp_page_reuse(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = vmf->page;
	pte_t entry;
	/*
	 * Clear the page's cpupid information as the existing
	 * information potentially belongs to a now completely
	 * unrelated process.
	 */
	if (page)
		page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);

	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
	entry = pte_mkyoung(vmf->orig_pte);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
		update_mmu_cache(vma, vmf->address, vmf->pte);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	count_vm_event(PGREUSE);
}

/*
 * Handle the case of a page which we actually need to copy to a new page.
 *
 * Called with mmap_lock locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book-keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 */
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct page *old_page = vmf->page;
	struct page *new_page = NULL;
	pte_t entry;
	int page_copied = 0;
	struct mmu_notifier_range range;

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
		new_page = alloc_zeroed_user_highpage_movable(vma,
							      vmf->address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
				vmf->address);
		if (!new_page)
			goto oom;

		if (!cow_user_page(new_page, old_page, vmf)) {
			/*
			 * COW failed. If the fault was resolved by another
			 * thread, that's fine. If not, userspace will
			 * re-fault on the same address and we will handle
			 * the fault on the second attempt.
			 */
			put_page(new_page);
			if (old_page)
				put_page(old_page);
			return 0;
		}
	}

	if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
		goto oom_free_new;
	cgroup_throttle_swaprate(new_page, GFP_KERNEL);

	__SetPageUptodate(new_page);

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				vmf->address & PAGE_MASK,
				(vmf->address & PAGE_MASK) + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	/*
	 * Re-check the pte - we dropped the lock
	 */
	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm,
						mm_counter_file(old_page));
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else {
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		}
		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = pte_sw_mkyoung(entry);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/*
		 * Clear the pte entry and flush it first, before updating the
		 * pte with the new entry. This will avoid a race condition
		 * seen in the presence of one thread doing SMC and another
		 * thread doing COW.
		 */
		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
		page_add_new_anon_rmap(new_page, vma, vmf->address, false);
		if (vma->vm_flags & VM_PURGEABLE) {
			pr_info("set wp new page %lx purgeable\n", page_to_pfn(new_page));
			SetPagePurgeable(new_page);
			uxpte_set_present(vma, vmf->address);
		}
		lru_cache_add_inactive_or_unevictable(new_page, vma);
		/*
		 * We call the notify macro here because, when using secondary
		 * mmu page tables (such as kvm shadow page tables), we want the
		 * new page to be mapped directly into the secondary page table.
		 */
		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
		update_mmu_cache(vma, vmf->address, vmf->pte);
		xpm_integrity_update_hook(vma, vmf->flags, new_page);
		if (old_page) {
			/*
			 * Only after switching the pte to the new page may
			 * we remove the mapcount here. Otherwise another
			 * process may come and find the rmap count decremented
			 * before the pte is switched to the new page, and
			 * "reuse" the old page writing into it while our pte
			 * here still points into it and can be read by other
			 * threads.
			 *
			 * The critical issue is to order this
			 * page_remove_rmap with the ptep_clear_flush above.
			 * Those stores are ordered by (if nothing else,)
			 * the barrier present in the atomic_add_negative
			 * in page_remove_rmap.
			 *
			 * Then the TLB flush in ptep_clear_flush ensures that
			 * no process can access the old page before the
			 * decremented mapcount is visible. And the old page
			 * cannot be reused until after the decremented
			 * mapcount is visible. So transitively, TLBs to
			 * old page will be flushed before it can be reused.
			 */
			page_remove_rmap(old_page, false);
		}

		/* Free the old page. */
		new_page = old_page;
		page_copied = 1;
	} else {
		update_mmu_tlb(vma, vmf->address, vmf->pte);
	}

	if (new_page)
		put_page(new_page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	/*
	 * No need to double call mmu_notifier->invalidate_range() callback as
	 * the above ptep_clear_flush_notify() did already call it.
	 */
	mmu_notifier_invalidate_range_only_end(&range);
	if (old_page) {
		/*
		 * Don't let another task, with possibly unlocked vma,
		 * keep the mlocked page.
		 */
		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);	/* LRU manipulation */
			if (PageMlocked(old_page))
				munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		put_page(old_page);
	}
	return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
	put_page(new_page);
oom:
	if (old_page)
		put_page(old_page);
	return VM_FAULT_OOM;
}

/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *			  writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
 * It handles locking of PTE and modifying it.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 *
 * Return: %0 on success, %VM_FAULT_NOPAGE when the PTE changed before we
 * acquired the PTE lock, or %VM_FAULT_SIGSEGV if the XPM integrity check
 * fails.
 */
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);
	/*
	 * We might have raced with another page fault while we released the
	 * pte_offset_map_lock.
	 */
	if (!pte_same(*vmf->pte, vmf->orig_pte)) {
		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return VM_FAULT_NOPAGE;
	}

	if (unlikely(xpm_integrity_validate_hook(vmf->vma, vmf->flags,
		vmf->address, vmf->page))) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return VM_FAULT_SIGSEGV;
	}

	wp_page_reuse(vmf);
	return 0;
}
30628c2ecf20Sopenharmony_ci
30638c2ecf20Sopenharmony_ci/*
30648c2ecf20Sopenharmony_ci * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
30658c2ecf20Sopenharmony_ci * mapping
30668c2ecf20Sopenharmony_ci */
30678c2ecf20Sopenharmony_cistatic vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
30688c2ecf20Sopenharmony_ci{
30698c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
30708c2ecf20Sopenharmony_ci
30718c2ecf20Sopenharmony_ci	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
30728c2ecf20Sopenharmony_ci		vm_fault_t ret;
30738c2ecf20Sopenharmony_ci
30748c2ecf20Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
30758c2ecf20Sopenharmony_ci		vmf->flags |= FAULT_FLAG_MKWRITE;
30768c2ecf20Sopenharmony_ci		ret = vma->vm_ops->pfn_mkwrite(vmf);
30778c2ecf20Sopenharmony_ci		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
30788c2ecf20Sopenharmony_ci			return ret;
30798c2ecf20Sopenharmony_ci		return finish_mkwrite_fault(vmf);
30808c2ecf20Sopenharmony_ci	}
30818c2ecf20Sopenharmony_ci	wp_page_reuse(vmf);
30828c2ecf20Sopenharmony_ci	return VM_FAULT_WRITE;
30838c2ecf20Sopenharmony_ci}
30848c2ecf20Sopenharmony_ci
30858c2ecf20Sopenharmony_cistatic vm_fault_t wp_page_shared(struct vm_fault *vmf)
30868c2ecf20Sopenharmony_ci	__releases(vmf->ptl)
30878c2ecf20Sopenharmony_ci{
30888c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
30898c2ecf20Sopenharmony_ci	vm_fault_t ret = VM_FAULT_WRITE;
30908c2ecf20Sopenharmony_ci
30918c2ecf20Sopenharmony_ci	get_page(vmf->page);
30928c2ecf20Sopenharmony_ci
30938c2ecf20Sopenharmony_ci	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
30948c2ecf20Sopenharmony_ci		vm_fault_t tmp;
30958c2ecf20Sopenharmony_ci
30968c2ecf20Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
30978c2ecf20Sopenharmony_ci		tmp = do_page_mkwrite(vmf);
30988c2ecf20Sopenharmony_ci		if (unlikely(!tmp || (tmp &
30998c2ecf20Sopenharmony_ci				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
31008c2ecf20Sopenharmony_ci			put_page(vmf->page);
31018c2ecf20Sopenharmony_ci			return tmp;
31028c2ecf20Sopenharmony_ci		}
31038c2ecf20Sopenharmony_ci		tmp = finish_mkwrite_fault(vmf);
31048c2ecf20Sopenharmony_ci		if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
31058c2ecf20Sopenharmony_ci			unlock_page(vmf->page);
31068c2ecf20Sopenharmony_ci			put_page(vmf->page);
31078c2ecf20Sopenharmony_ci			return tmp;
31088c2ecf20Sopenharmony_ci		}
31098c2ecf20Sopenharmony_ci	} else {
31108c2ecf20Sopenharmony_ci		if (unlikely(xpm_integrity_validate_hook(vmf->vma, vmf->flags, vmf->address,
31118c2ecf20Sopenharmony_ci			vmf->page))){
31128c2ecf20Sopenharmony_ci			pte_unmap_unlock(vmf->pte, vmf->ptl);
31138c2ecf20Sopenharmony_ci			put_page(vmf->page);
31148c2ecf20Sopenharmony_ci			return VM_FAULT_SIGSEGV;
31158c2ecf20Sopenharmony_ci		}
31168c2ecf20Sopenharmony_ci
31178c2ecf20Sopenharmony_ci		wp_page_reuse(vmf);
31188c2ecf20Sopenharmony_ci		lock_page(vmf->page);
31198c2ecf20Sopenharmony_ci	}
31208c2ecf20Sopenharmony_ci	ret |= fault_dirty_shared_page(vmf);
31218c2ecf20Sopenharmony_ci	put_page(vmf->page);
31228c2ecf20Sopenharmony_ci
31238c2ecf20Sopenharmony_ci	return ret;
31248c2ecf20Sopenharmony_ci}
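
/*
 * Illustrative sketch, not part of this file: the usual shape of a
 * ->page_mkwrite handler that do_page_mkwrite() ends up calling from
 * wp_page_shared() above.  This follows the filemap_page_mkwrite()
 * pattern; VM_FAULT_LOCKED tells the core the page is returned locked.
 */
#if 0
static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret = VM_FAULT_LOCKED;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);
	lock_page(page);
	if (page->mapping != inode->i_mapping) {
		/* page was truncated while we waited for the lock */
		unlock_page(page);
		ret = VM_FAULT_NOPAGE;
	} else {
		set_page_dirty(page);
		wait_for_stable_page(page);
	}
	sb_end_pagefault(inode->i_sb);
	return ret;
}
#endif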
31258c2ecf20Sopenharmony_ci
31268c2ecf20Sopenharmony_ci/*
31278c2ecf20Sopenharmony_ci * This routine handles present pages, when users try to write
31288c2ecf20Sopenharmony_ci * to a shared page. It is done by copying the page to a new address
31298c2ecf20Sopenharmony_ci * and decrementing the shared-page counter for the old page.
31308c2ecf20Sopenharmony_ci *
31318c2ecf20Sopenharmony_ci * Note that this routine assumes that the protection checks have been
31328c2ecf20Sopenharmony_ci * done by the caller (the low-level page fault routine in most cases).
31338c2ecf20Sopenharmony_ci * Thus we can safely just mark it writable once we've done any necessary
31348c2ecf20Sopenharmony_ci * COW.
31358c2ecf20Sopenharmony_ci *
31368c2ecf20Sopenharmony_ci * We also mark the page dirty at this point even though the page will
31378c2ecf20Sopenharmony_ci * change only once the write actually happens. This avoids a few races,
31388c2ecf20Sopenharmony_ci * and potentially makes it more efficient.
31398c2ecf20Sopenharmony_ci *
31408c2ecf20Sopenharmony_ci * We enter with non-exclusive mmap_lock (to exclude vma changes,
31418c2ecf20Sopenharmony_ci * but allow concurrent faults), with pte both mapped and locked.
31428c2ecf20Sopenharmony_ci * We return with mmap_lock still held, but pte unmapped and unlocked.
31438c2ecf20Sopenharmony_ci */
31448c2ecf20Sopenharmony_cistatic vm_fault_t do_wp_page(struct vm_fault *vmf)
31458c2ecf20Sopenharmony_ci	__releases(vmf->ptl)
31468c2ecf20Sopenharmony_ci{
31478c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
31488c2ecf20Sopenharmony_ci
31498c2ecf20Sopenharmony_ci	if (userfaultfd_pte_wp(vma, *vmf->pte)) {
31508c2ecf20Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
31518c2ecf20Sopenharmony_ci		return handle_userfault(vmf, VM_UFFD_WP);
31528c2ecf20Sopenharmony_ci	}
31538c2ecf20Sopenharmony_ci
31548c2ecf20Sopenharmony_ci	/*
31558c2ecf20Sopenharmony_ci	 * Userfaultfd write-protect can defer flushes. Ensure the TLB
31568c2ecf20Sopenharmony_ci	 * is flushed in this case before copying.
31578c2ecf20Sopenharmony_ci	 */
31588c2ecf20Sopenharmony_ci	if (unlikely(userfaultfd_wp(vmf->vma) &&
31598c2ecf20Sopenharmony_ci		     mm_tlb_flush_pending(vmf->vma->vm_mm)))
31608c2ecf20Sopenharmony_ci		flush_tlb_page(vmf->vma, vmf->address);
31618c2ecf20Sopenharmony_ci
31628c2ecf20Sopenharmony_ci	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
31638c2ecf20Sopenharmony_ci	if (!vmf->page) {
31648c2ecf20Sopenharmony_ci		/*
31658c2ecf20Sopenharmony_ci		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
31668c2ecf20Sopenharmony_ci		 * VM_PFNMAP VMA.
31678c2ecf20Sopenharmony_ci		 *
31688c2ecf20Sopenharmony_ci		 * We should not cow pages in a shared writeable mapping.
31698c2ecf20Sopenharmony_ci		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
31708c2ecf20Sopenharmony_ci		 */
31718c2ecf20Sopenharmony_ci		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
31728c2ecf20Sopenharmony_ci				     (VM_WRITE|VM_SHARED))
31738c2ecf20Sopenharmony_ci			return wp_pfn_shared(vmf);
31748c2ecf20Sopenharmony_ci
31758c2ecf20Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
31768c2ecf20Sopenharmony_ci		return wp_page_copy(vmf);
31778c2ecf20Sopenharmony_ci	}
31788c2ecf20Sopenharmony_ci
31798c2ecf20Sopenharmony_ci	/*
	 * Take out anonymous pages first, since anonymous shared vmas
	 * are not dirty accountable.
31828c2ecf20Sopenharmony_ci	 */
31838c2ecf20Sopenharmony_ci	if (PageAnon(vmf->page)) {
31848c2ecf20Sopenharmony_ci		struct page *page = vmf->page;
31858c2ecf20Sopenharmony_ci
31868c2ecf20Sopenharmony_ci		/* PageKsm() doesn't necessarily raise the page refcount */
31878c2ecf20Sopenharmony_ci		if (PageKsm(page) || page_count(page) != 1)
31888c2ecf20Sopenharmony_ci			goto copy;
31898c2ecf20Sopenharmony_ci		if (!trylock_page(page))
31908c2ecf20Sopenharmony_ci			goto copy;
31918c2ecf20Sopenharmony_ci		if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
31928c2ecf20Sopenharmony_ci			unlock_page(page);
31938c2ecf20Sopenharmony_ci			goto copy;
31948c2ecf20Sopenharmony_ci		}
31958c2ecf20Sopenharmony_ci		/*
31968c2ecf20Sopenharmony_ci		 * Ok, we've got the only map reference, and the only
31978c2ecf20Sopenharmony_ci		 * page count reference, and the page is locked,
31988c2ecf20Sopenharmony_ci		 * it's dark out, and we're wearing sunglasses. Hit it.
31998c2ecf20Sopenharmony_ci		 */
32008c2ecf20Sopenharmony_ci		unlock_page(page);
32018c2ecf20Sopenharmony_ci
		if (unlikely(xpm_integrity_validate_hook(vmf->vma, vmf->flags,
			vmf->address, vmf->page))) {
32048c2ecf20Sopenharmony_ci			pte_unmap_unlock(vmf->pte, vmf->ptl);
32058c2ecf20Sopenharmony_ci			return VM_FAULT_SIGSEGV;
32068c2ecf20Sopenharmony_ci		}
32078c2ecf20Sopenharmony_ci
32088c2ecf20Sopenharmony_ci		wp_page_reuse(vmf);
32098c2ecf20Sopenharmony_ci		return VM_FAULT_WRITE;
32108c2ecf20Sopenharmony_ci	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
32118c2ecf20Sopenharmony_ci					(VM_WRITE|VM_SHARED))) {
32128c2ecf20Sopenharmony_ci		return wp_page_shared(vmf);
32138c2ecf20Sopenharmony_ci	}
32148c2ecf20Sopenharmony_cicopy:
32158c2ecf20Sopenharmony_ci	/*
32168c2ecf20Sopenharmony_ci	 * Ok, we need to copy. Oh, well..
32178c2ecf20Sopenharmony_ci	 */
32188c2ecf20Sopenharmony_ci	get_page(vmf->page);
32198c2ecf20Sopenharmony_ci
32208c2ecf20Sopenharmony_ci	pte_unmap_unlock(vmf->pte, vmf->ptl);
32218c2ecf20Sopenharmony_ci	return wp_page_copy(vmf);
32228c2ecf20Sopenharmony_ci}
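
/*
 * Summary of the do_wp_page() outcomes above (comment only, for
 * clarity): no struct page in a shared writable vma -> wp_pfn_shared();
 * no struct page otherwise -> wp_page_copy(); an anonymous page we
 * exclusively own -> wp_page_reuse(); a file page in a shared writable
 * vma -> wp_page_shared(); everything else, including KSM pages ->
 * wp_page_copy().
 */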
32238c2ecf20Sopenharmony_ci
32248c2ecf20Sopenharmony_cistatic void unmap_mapping_range_vma(struct vm_area_struct *vma,
32258c2ecf20Sopenharmony_ci		unsigned long start_addr, unsigned long end_addr,
32268c2ecf20Sopenharmony_ci		struct zap_details *details)
32278c2ecf20Sopenharmony_ci{
32288c2ecf20Sopenharmony_ci	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
32298c2ecf20Sopenharmony_ci}
32308c2ecf20Sopenharmony_ci
32318c2ecf20Sopenharmony_cistatic inline void unmap_mapping_range_tree(struct rb_root_cached *root,
32328c2ecf20Sopenharmony_ci					    struct zap_details *details)
32338c2ecf20Sopenharmony_ci{
32348c2ecf20Sopenharmony_ci	struct vm_area_struct *vma;
32358c2ecf20Sopenharmony_ci	pgoff_t vba, vea, zba, zea;
32368c2ecf20Sopenharmony_ci
32378c2ecf20Sopenharmony_ci	vma_interval_tree_foreach(vma, root,
32388c2ecf20Sopenharmony_ci			details->first_index, details->last_index) {
32398c2ecf20Sopenharmony_ci
32408c2ecf20Sopenharmony_ci		vba = vma->vm_pgoff;
32418c2ecf20Sopenharmony_ci		vea = vba + vma_pages(vma) - 1;
32428c2ecf20Sopenharmony_ci		zba = details->first_index;
32438c2ecf20Sopenharmony_ci		if (zba < vba)
32448c2ecf20Sopenharmony_ci			zba = vba;
32458c2ecf20Sopenharmony_ci		zea = details->last_index;
32468c2ecf20Sopenharmony_ci		if (zea > vea)
32478c2ecf20Sopenharmony_ci			zea = vea;
32488c2ecf20Sopenharmony_ci
32498c2ecf20Sopenharmony_ci		unmap_mapping_range_vma(vma,
32508c2ecf20Sopenharmony_ci			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
32518c2ecf20Sopenharmony_ci			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
32528c2ecf20Sopenharmony_ci				details);
32538c2ecf20Sopenharmony_ci	}
32548c2ecf20Sopenharmony_ci}
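
/*
 * Worked example for the clamping above, with illustrative numbers:
 * a vma with vm_pgoff = 10 and vma_pages() = 10 covers file pages
 * 10..19, so vba = 10 and vea = 19.  Zapping first_index = 15,
 * last_index = 100 gives zba = 15 and zea = 19, i.e. the unmapped
 * range runs from vm_start + (5 << PAGE_SHIFT) to
 * vm_start + (10 << PAGE_SHIFT).
 */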
32558c2ecf20Sopenharmony_ci
32568c2ecf20Sopenharmony_ci/**
32578c2ecf20Sopenharmony_ci * unmap_mapping_page() - Unmap single page from processes.
32588c2ecf20Sopenharmony_ci * @page: The locked page to be unmapped.
32598c2ecf20Sopenharmony_ci *
 * Unmap this page from any userspace process which still has it mmapped.
32618c2ecf20Sopenharmony_ci * Typically, for efficiency, the range of nearby pages has already been
32628c2ecf20Sopenharmony_ci * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
32638c2ecf20Sopenharmony_ci * truncation or invalidation holds the lock on a page, it may find that
32648c2ecf20Sopenharmony_ci * the page has been remapped again: and then uses unmap_mapping_page()
32658c2ecf20Sopenharmony_ci * to unmap it finally.
32668c2ecf20Sopenharmony_ci */
32678c2ecf20Sopenharmony_civoid unmap_mapping_page(struct page *page)
32688c2ecf20Sopenharmony_ci{
32698c2ecf20Sopenharmony_ci	struct address_space *mapping = page->mapping;
32708c2ecf20Sopenharmony_ci	struct zap_details details = { };
32718c2ecf20Sopenharmony_ci
32728c2ecf20Sopenharmony_ci	VM_BUG_ON(!PageLocked(page));
32738c2ecf20Sopenharmony_ci	VM_BUG_ON(PageTail(page));
32748c2ecf20Sopenharmony_ci
32758c2ecf20Sopenharmony_ci	details.check_mapping = mapping;
32768c2ecf20Sopenharmony_ci	details.first_index = page->index;
32778c2ecf20Sopenharmony_ci	details.last_index = page->index + thp_nr_pages(page) - 1;
32788c2ecf20Sopenharmony_ci	details.single_page = page;
32798c2ecf20Sopenharmony_ci
32808c2ecf20Sopenharmony_ci	i_mmap_lock_write(mapping);
32818c2ecf20Sopenharmony_ci	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
32828c2ecf20Sopenharmony_ci		unmap_mapping_range_tree(&mapping->i_mmap, &details);
32838c2ecf20Sopenharmony_ci	i_mmap_unlock_write(mapping);
32848c2ecf20Sopenharmony_ci}
32858c2ecf20Sopenharmony_ci
32868c2ecf20Sopenharmony_ci/**
32878c2ecf20Sopenharmony_ci * unmap_mapping_pages() - Unmap pages from processes.
32888c2ecf20Sopenharmony_ci * @mapping: The address space containing pages to be unmapped.
32898c2ecf20Sopenharmony_ci * @start: Index of first page to be unmapped.
32908c2ecf20Sopenharmony_ci * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
32918c2ecf20Sopenharmony_ci * @even_cows: Whether to unmap even private COWed pages.
32928c2ecf20Sopenharmony_ci *
32938c2ecf20Sopenharmony_ci * Unmap the pages in this address space from any userspace process which
 * has them mmapped.  Generally, you want to remove COWed pages as well when
32958c2ecf20Sopenharmony_ci * a file is being truncated, but not when invalidating pages from the page
32968c2ecf20Sopenharmony_ci * cache.
32978c2ecf20Sopenharmony_ci */
32988c2ecf20Sopenharmony_civoid unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
32998c2ecf20Sopenharmony_ci		pgoff_t nr, bool even_cows)
33008c2ecf20Sopenharmony_ci{
33018c2ecf20Sopenharmony_ci	struct zap_details details = { };
33028c2ecf20Sopenharmony_ci
33038c2ecf20Sopenharmony_ci	details.check_mapping = even_cows ? NULL : mapping;
33048c2ecf20Sopenharmony_ci	details.first_index = start;
33058c2ecf20Sopenharmony_ci	details.last_index = start + nr - 1;
33068c2ecf20Sopenharmony_ci	if (details.last_index < details.first_index)
33078c2ecf20Sopenharmony_ci		details.last_index = ULONG_MAX;
33088c2ecf20Sopenharmony_ci
33098c2ecf20Sopenharmony_ci	i_mmap_lock_write(mapping);
33108c2ecf20Sopenharmony_ci	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
33118c2ecf20Sopenharmony_ci		unmap_mapping_range_tree(&mapping->i_mmap, &details);
33128c2ecf20Sopenharmony_ci	i_mmap_unlock_write(mapping);
33138c2ecf20Sopenharmony_ci}
33148c2ecf20Sopenharmony_ci
33158c2ecf20Sopenharmony_ci/**
33168c2ecf20Sopenharmony_ci * unmap_mapping_range - unmap the portion of all mmaps in the specified
33178c2ecf20Sopenharmony_ci * address_space corresponding to the specified byte range in the underlying
33188c2ecf20Sopenharmony_ci * file.
33198c2ecf20Sopenharmony_ci *
33208c2ecf20Sopenharmony_ci * @mapping: the address space containing mmaps to be unmapped.
33218c2ecf20Sopenharmony_ci * @holebegin: byte in first page to unmap, relative to the start of
33228c2ecf20Sopenharmony_ci * the underlying file.  This will be rounded down to a PAGE_SIZE
33238c2ecf20Sopenharmony_ci * boundary.  Note that this is different from truncate_pagecache(), which
33248c2ecf20Sopenharmony_ci * must keep the partial page.  In contrast, we must get rid of
33258c2ecf20Sopenharmony_ci * partial pages.
33268c2ecf20Sopenharmony_ci * @holelen: size of prospective hole in bytes.  This will be rounded
33278c2ecf20Sopenharmony_ci * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
33288c2ecf20Sopenharmony_ci * end of the file.
33298c2ecf20Sopenharmony_ci * @even_cows: 1 when truncating a file, unmap even private COWed pages;
33308c2ecf20Sopenharmony_ci * but 0 when invalidating pagecache, don't throw away private data.
33318c2ecf20Sopenharmony_ci */
33328c2ecf20Sopenharmony_civoid unmap_mapping_range(struct address_space *mapping,
33338c2ecf20Sopenharmony_ci		loff_t const holebegin, loff_t const holelen, int even_cows)
33348c2ecf20Sopenharmony_ci{
33358c2ecf20Sopenharmony_ci	pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
33368c2ecf20Sopenharmony_ci	pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT;
33378c2ecf20Sopenharmony_ci
33388c2ecf20Sopenharmony_ci	/* Check for overflow. */
33398c2ecf20Sopenharmony_ci	if (sizeof(holelen) > sizeof(hlen)) {
33408c2ecf20Sopenharmony_ci		long long holeend =
33418c2ecf20Sopenharmony_ci			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
33428c2ecf20Sopenharmony_ci		if (holeend & ~(long long)ULONG_MAX)
33438c2ecf20Sopenharmony_ci			hlen = ULONG_MAX - hba + 1;
33448c2ecf20Sopenharmony_ci	}
33458c2ecf20Sopenharmony_ci
33468c2ecf20Sopenharmony_ci	unmap_mapping_pages(mapping, hba, hlen, even_cows);
33478c2ecf20Sopenharmony_ci}
33488c2ecf20Sopenharmony_ciEXPORT_SYMBOL(unmap_mapping_range);
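
/*
 * Worked example for the rounding above, with illustrative numbers and
 * 4K pages: holebegin = 5000 and holelen = 10000 give hba = 1 and
 * hlen = (10000 + 4095) >> PAGE_SHIFT = 3, so file pages 1..3 are
 * unmapped -- the start rounds down, the length rounds up.  A typical
 * caller is truncation: truncate_pagecache() passes holelen = 0
 * ("to end of file") and even_cows = 1.
 */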
33498c2ecf20Sopenharmony_ci
33508c2ecf20Sopenharmony_ci/*
33518c2ecf20Sopenharmony_ci * We enter with non-exclusive mmap_lock (to exclude vma changes,
33528c2ecf20Sopenharmony_ci * but allow concurrent faults), and pte mapped but not yet locked.
33538c2ecf20Sopenharmony_ci * We return with pte unmapped and unlocked.
33548c2ecf20Sopenharmony_ci *
33558c2ecf20Sopenharmony_ci * We return with the mmap_lock locked or unlocked in the same cases
33568c2ecf20Sopenharmony_ci * as does filemap_fault().
33578c2ecf20Sopenharmony_ci */
33588c2ecf20Sopenharmony_civm_fault_t do_swap_page(struct vm_fault *vmf)
33598c2ecf20Sopenharmony_ci{
33608c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
33618c2ecf20Sopenharmony_ci	struct page *page = NULL, *swapcache;
33628c2ecf20Sopenharmony_ci	swp_entry_t entry;
33638c2ecf20Sopenharmony_ci	pte_t pte;
33648c2ecf20Sopenharmony_ci	int locked;
33658c2ecf20Sopenharmony_ci	int exclusive = 0;
33668c2ecf20Sopenharmony_ci	vm_fault_t ret = 0;
33678c2ecf20Sopenharmony_ci	void *shadow = NULL;
33688c2ecf20Sopenharmony_ci
33698c2ecf20Sopenharmony_ci	if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
33708c2ecf20Sopenharmony_ci		goto out;
33718c2ecf20Sopenharmony_ci
33728c2ecf20Sopenharmony_ci	entry = pte_to_swp_entry(vmf->orig_pte);
33738c2ecf20Sopenharmony_ci	if (unlikely(non_swap_entry(entry))) {
33748c2ecf20Sopenharmony_ci		if (is_migration_entry(entry)) {
33758c2ecf20Sopenharmony_ci			migration_entry_wait(vma->vm_mm, vmf->pmd,
33768c2ecf20Sopenharmony_ci					     vmf->address);
33778c2ecf20Sopenharmony_ci		} else if (is_device_private_entry(entry)) {
33788c2ecf20Sopenharmony_ci			vmf->page = device_private_entry_to_page(entry);
33798c2ecf20Sopenharmony_ci			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
33808c2ecf20Sopenharmony_ci					vmf->address, &vmf->ptl);
33818c2ecf20Sopenharmony_ci			if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
33828c2ecf20Sopenharmony_ci				spin_unlock(vmf->ptl);
33838c2ecf20Sopenharmony_ci				goto out;
33848c2ecf20Sopenharmony_ci			}
33858c2ecf20Sopenharmony_ci
33868c2ecf20Sopenharmony_ci			/*
33878c2ecf20Sopenharmony_ci			 * Get a page reference while we know the page can't be
33888c2ecf20Sopenharmony_ci			 * freed.
33898c2ecf20Sopenharmony_ci			 */
33908c2ecf20Sopenharmony_ci			get_page(vmf->page);
33918c2ecf20Sopenharmony_ci			pte_unmap_unlock(vmf->pte, vmf->ptl);
33928c2ecf20Sopenharmony_ci			vmf->page->pgmap->ops->migrate_to_ram(vmf);
33938c2ecf20Sopenharmony_ci			put_page(vmf->page);
33948c2ecf20Sopenharmony_ci		} else if (is_hwpoison_entry(entry)) {
33958c2ecf20Sopenharmony_ci			ret = VM_FAULT_HWPOISON;
33968c2ecf20Sopenharmony_ci		} else {
33978c2ecf20Sopenharmony_ci			print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
33988c2ecf20Sopenharmony_ci			ret = VM_FAULT_SIGBUS;
33998c2ecf20Sopenharmony_ci		}
34008c2ecf20Sopenharmony_ci		goto out;
34018c2ecf20Sopenharmony_ci	}
34028c2ecf20Sopenharmony_ci
34048c2ecf20Sopenharmony_ci	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
34058c2ecf20Sopenharmony_ci	page = lookup_swap_cache(entry, vma, vmf->address);
34068c2ecf20Sopenharmony_ci	swapcache = page;
34078c2ecf20Sopenharmony_ci
34088c2ecf20Sopenharmony_ci	if (!page) {
34098c2ecf20Sopenharmony_ci		struct swap_info_struct *si = swp_swap_info(entry);
34108c2ecf20Sopenharmony_ci
34118c2ecf20Sopenharmony_ci		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
34128c2ecf20Sopenharmony_ci		    __swap_count(entry) == 1) {
34138c2ecf20Sopenharmony_ci			/* skip swapcache */
34148c2ecf20Sopenharmony_ci			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
34158c2ecf20Sopenharmony_ci							vmf->address);
34168c2ecf20Sopenharmony_ci			if (page) {
34178c2ecf20Sopenharmony_ci				int err;
34188c2ecf20Sopenharmony_ci
34198c2ecf20Sopenharmony_ci				__SetPageLocked(page);
34208c2ecf20Sopenharmony_ci				__SetPageSwapBacked(page);
34218c2ecf20Sopenharmony_ci				set_page_private(page, entry.val);
34228c2ecf20Sopenharmony_ci
34238c2ecf20Sopenharmony_ci				/* Tell memcg to use swap ownership records */
34248c2ecf20Sopenharmony_ci				SetPageSwapCache(page);
34258c2ecf20Sopenharmony_ci				err = mem_cgroup_charge(page, vma->vm_mm,
34268c2ecf20Sopenharmony_ci							GFP_KERNEL);
34278c2ecf20Sopenharmony_ci				ClearPageSwapCache(page);
34288c2ecf20Sopenharmony_ci				if (err) {
34298c2ecf20Sopenharmony_ci					ret = VM_FAULT_OOM;
34308c2ecf20Sopenharmony_ci					goto out_page;
34318c2ecf20Sopenharmony_ci				}
34328c2ecf20Sopenharmony_ci
34338c2ecf20Sopenharmony_ci				shadow = get_shadow_from_swap_cache(entry);
34348c2ecf20Sopenharmony_ci				if (shadow)
34358c2ecf20Sopenharmony_ci					workingset_refault(page, shadow);
34368c2ecf20Sopenharmony_ci
34378c2ecf20Sopenharmony_ci				lru_cache_add(page);
34388c2ecf20Sopenharmony_ci				swap_readpage(page, true);
34398c2ecf20Sopenharmony_ci			}
34408c2ecf20Sopenharmony_ci		} else {
34418c2ecf20Sopenharmony_ci			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
34428c2ecf20Sopenharmony_ci						vmf);
34438c2ecf20Sopenharmony_ci			swapcache = page;
34448c2ecf20Sopenharmony_ci		}
34458c2ecf20Sopenharmony_ci
34468c2ecf20Sopenharmony_ci		if (!page) {
34478c2ecf20Sopenharmony_ci			/*
34488c2ecf20Sopenharmony_ci			 * Back out if somebody else faulted in this pte
34498c2ecf20Sopenharmony_ci			 * while we released the pte lock.
34508c2ecf20Sopenharmony_ci			 */
34518c2ecf20Sopenharmony_ci			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
34528c2ecf20Sopenharmony_ci					vmf->address, &vmf->ptl);
34538c2ecf20Sopenharmony_ci			if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
34548c2ecf20Sopenharmony_ci				ret = VM_FAULT_OOM;
34558c2ecf20Sopenharmony_ci			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
34568c2ecf20Sopenharmony_ci			goto unlock;
34578c2ecf20Sopenharmony_ci		}
34588c2ecf20Sopenharmony_ci
34598c2ecf20Sopenharmony_ci		/* Had to read the page from swap area: Major fault */
34608c2ecf20Sopenharmony_ci		ret = VM_FAULT_MAJOR;
34618c2ecf20Sopenharmony_ci		count_vm_event(PGMAJFAULT);
34628c2ecf20Sopenharmony_ci		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
34638c2ecf20Sopenharmony_ci	} else if (PageHWPoison(page)) {
34648c2ecf20Sopenharmony_ci		/*
34658c2ecf20Sopenharmony_ci		 * hwpoisoned dirty swapcache pages are kept for killing
34668c2ecf20Sopenharmony_ci		 * owner processes (which may be unknown at hwpoison time)
34678c2ecf20Sopenharmony_ci		 */
34688c2ecf20Sopenharmony_ci		ret = VM_FAULT_HWPOISON;
34698c2ecf20Sopenharmony_ci		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
34708c2ecf20Sopenharmony_ci		goto out_release;
34718c2ecf20Sopenharmony_ci	}
34728c2ecf20Sopenharmony_ci
34738c2ecf20Sopenharmony_ci	locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
34748c2ecf20Sopenharmony_ci
34758c2ecf20Sopenharmony_ci	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
34768c2ecf20Sopenharmony_ci	if (!locked) {
34778c2ecf20Sopenharmony_ci		ret |= VM_FAULT_RETRY;
34788c2ecf20Sopenharmony_ci		goto out_release;
34798c2ecf20Sopenharmony_ci	}
34808c2ecf20Sopenharmony_ci
34818c2ecf20Sopenharmony_ci	/*
34828c2ecf20Sopenharmony_ci	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
34838c2ecf20Sopenharmony_ci	 * release the swapcache from under us.  The page pin, and pte_same
34848c2ecf20Sopenharmony_ci	 * test below, are not enough to exclude that.  Even if it is still
34858c2ecf20Sopenharmony_ci	 * swapcache, we need to check that the page's swap has not changed.
34868c2ecf20Sopenharmony_ci	 */
34878c2ecf20Sopenharmony_ci	if (unlikely((!PageSwapCache(page) ||
34888c2ecf20Sopenharmony_ci			page_private(page) != entry.val)) && swapcache)
34898c2ecf20Sopenharmony_ci		goto out_page;
34908c2ecf20Sopenharmony_ci
34918c2ecf20Sopenharmony_ci	page = ksm_might_need_to_copy(page, vma, vmf->address);
34928c2ecf20Sopenharmony_ci	if (unlikely(!page)) {
34938c2ecf20Sopenharmony_ci		ret = VM_FAULT_OOM;
34948c2ecf20Sopenharmony_ci		page = swapcache;
34958c2ecf20Sopenharmony_ci		goto out_page;
34968c2ecf20Sopenharmony_ci	}
34978c2ecf20Sopenharmony_ci
34988c2ecf20Sopenharmony_ci	cgroup_throttle_swaprate(page, GFP_KERNEL);
34998c2ecf20Sopenharmony_ci
35008c2ecf20Sopenharmony_ci	/*
35018c2ecf20Sopenharmony_ci	 * Back out if somebody else already faulted in this pte.
35028c2ecf20Sopenharmony_ci	 */
35038c2ecf20Sopenharmony_ci	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
35048c2ecf20Sopenharmony_ci			&vmf->ptl);
35058c2ecf20Sopenharmony_ci	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
35068c2ecf20Sopenharmony_ci		goto out_nomap;
35078c2ecf20Sopenharmony_ci
35088c2ecf20Sopenharmony_ci	if (unlikely(!PageUptodate(page))) {
35098c2ecf20Sopenharmony_ci		ret = VM_FAULT_SIGBUS;
35108c2ecf20Sopenharmony_ci		goto out_nomap;
35118c2ecf20Sopenharmony_ci	}
35128c2ecf20Sopenharmony_ci
35138c2ecf20Sopenharmony_ci	/*
35148c2ecf20Sopenharmony_ci	 * The page isn't present yet, go ahead with the fault.
35158c2ecf20Sopenharmony_ci	 *
35168c2ecf20Sopenharmony_ci	 * Be careful about the sequence of operations here.
35178c2ecf20Sopenharmony_ci	 * To get its accounting right, reuse_swap_page() must be called
35188c2ecf20Sopenharmony_ci	 * while the page is counted on swap but not yet in mapcount i.e.
35198c2ecf20Sopenharmony_ci	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
35208c2ecf20Sopenharmony_ci	 * must be called after the swap_free(), or it will never succeed.
35218c2ecf20Sopenharmony_ci	 */
35228c2ecf20Sopenharmony_ci	if (unlikely(xpm_integrity_validate_hook(vmf->vma, vmf->flags,
		vmf->address, page))) {
35248c2ecf20Sopenharmony_ci		ret = VM_FAULT_SIGSEGV;
35258c2ecf20Sopenharmony_ci		goto out_nomap;
35268c2ecf20Sopenharmony_ci	}
35278c2ecf20Sopenharmony_ci
35288c2ecf20Sopenharmony_ci	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
35298c2ecf20Sopenharmony_ci	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
35308c2ecf20Sopenharmony_ci	pte = mk_pte(page, vma->vm_page_prot);
35318c2ecf20Sopenharmony_ci	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
35328c2ecf20Sopenharmony_ci		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
35338c2ecf20Sopenharmony_ci		vmf->flags &= ~FAULT_FLAG_WRITE;
35348c2ecf20Sopenharmony_ci		ret |= VM_FAULT_WRITE;
35358c2ecf20Sopenharmony_ci		exclusive = RMAP_EXCLUSIVE;
35368c2ecf20Sopenharmony_ci	}
35378c2ecf20Sopenharmony_ci	flush_icache_page(vma, page);
35388c2ecf20Sopenharmony_ci	if (pte_swp_soft_dirty(vmf->orig_pte))
35398c2ecf20Sopenharmony_ci		pte = pte_mksoft_dirty(pte);
35408c2ecf20Sopenharmony_ci	if (pte_swp_uffd_wp(vmf->orig_pte)) {
35418c2ecf20Sopenharmony_ci		pte = pte_mkuffd_wp(pte);
35428c2ecf20Sopenharmony_ci		pte = pte_wrprotect(pte);
35438c2ecf20Sopenharmony_ci	}
35448c2ecf20Sopenharmony_ci	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
35458c2ecf20Sopenharmony_ci	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
35468c2ecf20Sopenharmony_ci	vmf->orig_pte = pte;
35478c2ecf20Sopenharmony_ci
35488c2ecf20Sopenharmony_ci	/* ksm created a completely new copy */
35498c2ecf20Sopenharmony_ci	if (unlikely(page != swapcache && swapcache)) {
35508c2ecf20Sopenharmony_ci		page_add_new_anon_rmap(page, vma, vmf->address, false);
35518c2ecf20Sopenharmony_ci		lru_cache_add_inactive_or_unevictable(page, vma);
35528c2ecf20Sopenharmony_ci	} else {
35538c2ecf20Sopenharmony_ci		do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
35548c2ecf20Sopenharmony_ci	}
35558c2ecf20Sopenharmony_ci
35568c2ecf20Sopenharmony_ci	swap_free(entry);
35578c2ecf20Sopenharmony_ci	if (mem_cgroup_swap_full(page) ||
35588c2ecf20Sopenharmony_ci	    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
35598c2ecf20Sopenharmony_ci		try_to_free_swap(page);
35608c2ecf20Sopenharmony_ci	unlock_page(page);
35618c2ecf20Sopenharmony_ci	if (page != swapcache && swapcache) {
35628c2ecf20Sopenharmony_ci		/*
		 * Hold the lock to prevent the swap entry from being reused
35648c2ecf20Sopenharmony_ci		 * until we take the PT lock for the pte_same() check
35658c2ecf20Sopenharmony_ci		 * (to avoid false positives from pte_same). For
35668c2ecf20Sopenharmony_ci		 * further safety release the lock after the swap_free
35678c2ecf20Sopenharmony_ci		 * so that the swap count won't change under a
35688c2ecf20Sopenharmony_ci		 * parallel locked swapcache.
35698c2ecf20Sopenharmony_ci		 */
35708c2ecf20Sopenharmony_ci		unlock_page(swapcache);
35718c2ecf20Sopenharmony_ci		put_page(swapcache);
35728c2ecf20Sopenharmony_ci	}
35738c2ecf20Sopenharmony_ci
35748c2ecf20Sopenharmony_ci	if (vmf->flags & FAULT_FLAG_WRITE) {
35758c2ecf20Sopenharmony_ci		ret |= do_wp_page(vmf);
35768c2ecf20Sopenharmony_ci		if (ret & VM_FAULT_ERROR)
35778c2ecf20Sopenharmony_ci			ret &= VM_FAULT_ERROR;
35788c2ecf20Sopenharmony_ci		goto out;
35798c2ecf20Sopenharmony_ci	}
35808c2ecf20Sopenharmony_ci
35818c2ecf20Sopenharmony_ci	/* No need to invalidate - it was non-present before */
35828c2ecf20Sopenharmony_ci	update_mmu_cache(vma, vmf->address, vmf->pte);
35838c2ecf20Sopenharmony_ciunlock:
35848c2ecf20Sopenharmony_ci	pte_unmap_unlock(vmf->pte, vmf->ptl);
35858c2ecf20Sopenharmony_ciout:
35868c2ecf20Sopenharmony_ci	return ret;
35878c2ecf20Sopenharmony_ciout_nomap:
35888c2ecf20Sopenharmony_ci	pte_unmap_unlock(vmf->pte, vmf->ptl);
35898c2ecf20Sopenharmony_ciout_page:
35908c2ecf20Sopenharmony_ci	unlock_page(page);
35918c2ecf20Sopenharmony_ciout_release:
35928c2ecf20Sopenharmony_ci	put_page(page);
35938c2ecf20Sopenharmony_ci	if (page != swapcache && swapcache) {
35948c2ecf20Sopenharmony_ci		unlock_page(swapcache);
35958c2ecf20Sopenharmony_ci		put_page(swapcache);
35968c2ecf20Sopenharmony_ci	}
35978c2ecf20Sopenharmony_ci	return ret;
35988c2ecf20Sopenharmony_ci}
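
/*
 * Note on the two swap-in paths above (comment only, for clarity): for
 * SWP_SYNCHRONOUS_IO devices with a swap count of 1, the page is read
 * synchronously and bypasses the swap cache entirely (swapcache stays
 * NULL), while all other cases go through swapin_readahead() and the
 * swap cache -- which is why several exit paths must unlock and release
 * swapcache separately when ksm_might_need_to_copy() substituted a copy.
 */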
35998c2ecf20Sopenharmony_ci
36008c2ecf20Sopenharmony_ci/*
36018c2ecf20Sopenharmony_ci * We enter with non-exclusive mmap_lock (to exclude vma changes,
36028c2ecf20Sopenharmony_ci * but allow concurrent faults), and pte mapped but not yet locked.
36038c2ecf20Sopenharmony_ci * We return with mmap_lock still held, but pte unmapped and unlocked.
36048c2ecf20Sopenharmony_ci */
36058c2ecf20Sopenharmony_cistatic vm_fault_t do_anonymous_page(struct vm_fault *vmf)
36068c2ecf20Sopenharmony_ci{
36078c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
36088c2ecf20Sopenharmony_ci	struct page *page;
36098c2ecf20Sopenharmony_ci	vm_fault_t ret = 0;
36108c2ecf20Sopenharmony_ci	pte_t entry;
36118c2ecf20Sopenharmony_ci
36128c2ecf20Sopenharmony_ci	/* File mapping without ->vm_ops ? */
36138c2ecf20Sopenharmony_ci	if (vma->vm_flags & VM_SHARED)
36148c2ecf20Sopenharmony_ci		return VM_FAULT_SIGBUS;
36158c2ecf20Sopenharmony_ci
36168c2ecf20Sopenharmony_ci	/*
36178c2ecf20Sopenharmony_ci	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
36188c2ecf20Sopenharmony_ci	 * pte_offset_map() on pmds where a huge pmd might be created
36198c2ecf20Sopenharmony_ci	 * from a different thread.
36208c2ecf20Sopenharmony_ci	 *
36218c2ecf20Sopenharmony_ci	 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
36228c2ecf20Sopenharmony_ci	 * parallel threads are excluded by other means.
36238c2ecf20Sopenharmony_ci	 *
36248c2ecf20Sopenharmony_ci	 * Here we only have mmap_read_lock(mm).
36258c2ecf20Sopenharmony_ci	 */
36268c2ecf20Sopenharmony_ci	if (pte_alloc(vma->vm_mm, vmf->pmd))
36278c2ecf20Sopenharmony_ci		return VM_FAULT_OOM;
36288c2ecf20Sopenharmony_ci
36298c2ecf20Sopenharmony_ci	/* See the comment in pte_alloc_one_map() */
36308c2ecf20Sopenharmony_ci	if (unlikely(pmd_trans_unstable(vmf->pmd)))
36318c2ecf20Sopenharmony_ci		return 0;
36328c2ecf20Sopenharmony_ci
	/* use the extra page table (uxpte) for VM_USEREXPTE vmas */
36348c2ecf20Sopenharmony_ci	if (vma->vm_flags & VM_USEREXPTE) {
36358c2ecf20Sopenharmony_ci		if (do_uxpte_page_fault(vmf, &entry))
36368c2ecf20Sopenharmony_ci			goto oom;
36378c2ecf20Sopenharmony_ci
		if (xpm_integrity_check_hook(vma, vmf->flags, vmf->address,
36398c2ecf20Sopenharmony_ci			pte_page(entry)))
36408c2ecf20Sopenharmony_ci			return VM_FAULT_SIGSEGV;
36418c2ecf20Sopenharmony_ci		else
36428c2ecf20Sopenharmony_ci			goto got_page;
36438c2ecf20Sopenharmony_ci	}
36448c2ecf20Sopenharmony_ci
36458c2ecf20Sopenharmony_ci	/* Use the zero-page for reads */
36468c2ecf20Sopenharmony_ci	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
36478c2ecf20Sopenharmony_ci			!mm_forbids_zeropage(vma->vm_mm)) {
36488c2ecf20Sopenharmony_ci		entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
36498c2ecf20Sopenharmony_ci						vma->vm_page_prot));
36508c2ecf20Sopenharmony_cigot_page:
36518c2ecf20Sopenharmony_ci		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
36528c2ecf20Sopenharmony_ci				vmf->address, &vmf->ptl);
36538c2ecf20Sopenharmony_ci		if (!pte_none(*vmf->pte)) {
36548c2ecf20Sopenharmony_ci			update_mmu_tlb(vma, vmf->address, vmf->pte);
36558c2ecf20Sopenharmony_ci			goto unlock;
36568c2ecf20Sopenharmony_ci		}
36578c2ecf20Sopenharmony_ci		ret = check_stable_address_space(vma->vm_mm);
36588c2ecf20Sopenharmony_ci		if (ret)
36598c2ecf20Sopenharmony_ci			goto unlock;
36608c2ecf20Sopenharmony_ci		/* Deliver the page fault to userland, check inside PT lock */
36618c2ecf20Sopenharmony_ci		if (userfaultfd_missing(vma)) {
36628c2ecf20Sopenharmony_ci			pte_unmap_unlock(vmf->pte, vmf->ptl);
36638c2ecf20Sopenharmony_ci			return handle_userfault(vmf, VM_UFFD_MISSING);
36648c2ecf20Sopenharmony_ci		}
36658c2ecf20Sopenharmony_ci		goto setpte;
36668c2ecf20Sopenharmony_ci	}
36678c2ecf20Sopenharmony_ci
36688c2ecf20Sopenharmony_ci	/* Allocate our own private page. */
36698c2ecf20Sopenharmony_ci	if (unlikely(anon_vma_prepare(vma)))
36708c2ecf20Sopenharmony_ci		goto oom;
36718c2ecf20Sopenharmony_ci	page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
36728c2ecf20Sopenharmony_ci	if (!page)
36738c2ecf20Sopenharmony_ci		goto oom;
36748c2ecf20Sopenharmony_ci
36758c2ecf20Sopenharmony_ci	if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
36768c2ecf20Sopenharmony_ci		goto oom_free_page;
36778c2ecf20Sopenharmony_ci	cgroup_throttle_swaprate(page, GFP_KERNEL);
36788c2ecf20Sopenharmony_ci
36798c2ecf20Sopenharmony_ci	/*
36808c2ecf20Sopenharmony_ci	 * The memory barrier inside __SetPageUptodate makes sure that
36818c2ecf20Sopenharmony_ci	 * preceding stores to the page contents become visible before
36828c2ecf20Sopenharmony_ci	 * the set_pte_at() write.
36838c2ecf20Sopenharmony_ci	 */
36848c2ecf20Sopenharmony_ci	__SetPageUptodate(page);
36858c2ecf20Sopenharmony_ci
36868c2ecf20Sopenharmony_ci	entry = mk_pte(page, vma->vm_page_prot);
36878c2ecf20Sopenharmony_ci	entry = pte_sw_mkyoung(entry);
36888c2ecf20Sopenharmony_ci	if (vma->vm_flags & VM_WRITE)
36898c2ecf20Sopenharmony_ci		entry = pte_mkwrite(pte_mkdirty(entry));
36908c2ecf20Sopenharmony_ci
36918c2ecf20Sopenharmony_ci	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
36928c2ecf20Sopenharmony_ci			&vmf->ptl);
36938c2ecf20Sopenharmony_ci	if (!pte_none(*vmf->pte)) {
36948c2ecf20Sopenharmony_ci		update_mmu_cache(vma, vmf->address, vmf->pte);
36958c2ecf20Sopenharmony_ci		goto release;
36968c2ecf20Sopenharmony_ci	}
36978c2ecf20Sopenharmony_ci
36988c2ecf20Sopenharmony_ci	ret = check_stable_address_space(vma->vm_mm);
36998c2ecf20Sopenharmony_ci	if (ret)
37008c2ecf20Sopenharmony_ci		goto release;
37018c2ecf20Sopenharmony_ci
37028c2ecf20Sopenharmony_ci	/* Deliver the page fault to userland, check inside PT lock */
37038c2ecf20Sopenharmony_ci	if (userfaultfd_missing(vma)) {
37048c2ecf20Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
37058c2ecf20Sopenharmony_ci		put_page(page);
37068c2ecf20Sopenharmony_ci		return handle_userfault(vmf, VM_UFFD_MISSING);
37078c2ecf20Sopenharmony_ci	}
37088c2ecf20Sopenharmony_ci
37098c2ecf20Sopenharmony_ci	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
37108c2ecf20Sopenharmony_ci	page_add_new_anon_rmap(page, vma, vmf->address, false);
37118c2ecf20Sopenharmony_ci	if (vma->vm_flags & VM_PURGEABLE)
37128c2ecf20Sopenharmony_ci		SetPagePurgeable(page);
37138c2ecf20Sopenharmony_ci
37148c2ecf20Sopenharmony_ci	lru_cache_add_inactive_or_unevictable(page, vma);
37158c2ecf20Sopenharmony_cisetpte:
37168c2ecf20Sopenharmony_ci	if (vma->vm_flags & VM_PURGEABLE)
37178c2ecf20Sopenharmony_ci		uxpte_set_present(vma, vmf->address);
37188c2ecf20Sopenharmony_ci
	if (!pte_special(entry))
		xpm_integrity_update_hook(vma, vmf->flags, page);
37228c2ecf20Sopenharmony_ci
37238c2ecf20Sopenharmony_ci	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
37248c2ecf20Sopenharmony_ci
37258c2ecf20Sopenharmony_ci	/* No need to invalidate - it was non-present before */
37268c2ecf20Sopenharmony_ci	update_mmu_cache(vma, vmf->address, vmf->pte);
37278c2ecf20Sopenharmony_ciunlock:
37288c2ecf20Sopenharmony_ci	pte_unmap_unlock(vmf->pte, vmf->ptl);
37298c2ecf20Sopenharmony_ci	return ret;
37308c2ecf20Sopenharmony_cirelease:
37318c2ecf20Sopenharmony_ci	put_page(page);
37328c2ecf20Sopenharmony_ci	goto unlock;
37338c2ecf20Sopenharmony_cioom_free_page:
37348c2ecf20Sopenharmony_ci	put_page(page);
37358c2ecf20Sopenharmony_cioom:
37368c2ecf20Sopenharmony_ci	return VM_FAULT_OOM;
37378c2ecf20Sopenharmony_ci}
37388c2ecf20Sopenharmony_ci
37398c2ecf20Sopenharmony_ci/*
37408c2ecf20Sopenharmony_ci * The mmap_lock must have been held on entry, and may have been
37418c2ecf20Sopenharmony_ci * released depending on flags and vma->vm_ops->fault() return value.
37428c2ecf20Sopenharmony_ci * See filemap_fault() and __lock_page_retry().
37438c2ecf20Sopenharmony_ci */
37448c2ecf20Sopenharmony_cistatic vm_fault_t __do_fault(struct vm_fault *vmf)
37458c2ecf20Sopenharmony_ci{
37468c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
37478c2ecf20Sopenharmony_ci	vm_fault_t ret;
37488c2ecf20Sopenharmony_ci
37498c2ecf20Sopenharmony_ci	/*
37508c2ecf20Sopenharmony_ci	 * Preallocate pte before we take page_lock because this might lead to
37518c2ecf20Sopenharmony_ci	 * deadlocks for memcg reclaim which waits for pages under writeback:
37528c2ecf20Sopenharmony_ci	 *				lock_page(A)
37538c2ecf20Sopenharmony_ci	 *				SetPageWriteback(A)
37548c2ecf20Sopenharmony_ci	 *				unlock_page(A)
37558c2ecf20Sopenharmony_ci	 * lock_page(B)
37568c2ecf20Sopenharmony_ci	 *				lock_page(B)
37578c2ecf20Sopenharmony_ci	 * pte_alloc_one
37588c2ecf20Sopenharmony_ci	 *   shrink_page_list
37598c2ecf20Sopenharmony_ci	 *     wait_on_page_writeback(A)
37608c2ecf20Sopenharmony_ci	 *				SetPageWriteback(B)
37618c2ecf20Sopenharmony_ci	 *				unlock_page(B)
37628c2ecf20Sopenharmony_ci	 *				# flush A, B to clear the writeback
37638c2ecf20Sopenharmony_ci	 */
37648c2ecf20Sopenharmony_ci	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
37658c2ecf20Sopenharmony_ci		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
37668c2ecf20Sopenharmony_ci		if (!vmf->prealloc_pte)
37678c2ecf20Sopenharmony_ci			return VM_FAULT_OOM;
37688c2ecf20Sopenharmony_ci		smp_wmb(); /* See comment in __pte_alloc() */
37698c2ecf20Sopenharmony_ci	}
37708c2ecf20Sopenharmony_ci
37718c2ecf20Sopenharmony_ci	ret = vma->vm_ops->fault(vmf);
37728c2ecf20Sopenharmony_ci	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
37738c2ecf20Sopenharmony_ci			    VM_FAULT_DONE_COW)))
37748c2ecf20Sopenharmony_ci		return ret;
37758c2ecf20Sopenharmony_ci
37768c2ecf20Sopenharmony_ci	if (unlikely(PageHWPoison(vmf->page))) {
37778c2ecf20Sopenharmony_ci		struct page *page = vmf->page;
37788c2ecf20Sopenharmony_ci		vm_fault_t poisonret = VM_FAULT_HWPOISON;
37798c2ecf20Sopenharmony_ci		if (ret & VM_FAULT_LOCKED) {
37808c2ecf20Sopenharmony_ci			if (page_mapped(page))
37818c2ecf20Sopenharmony_ci				unmap_mapping_pages(page_mapping(page),
37828c2ecf20Sopenharmony_ci						    page->index, 1, false);
37838c2ecf20Sopenharmony_ci			/* Retry if a clean page was removed from the cache. */
37848c2ecf20Sopenharmony_ci			if (invalidate_inode_page(page))
37858c2ecf20Sopenharmony_ci				poisonret = VM_FAULT_NOPAGE;
37868c2ecf20Sopenharmony_ci			unlock_page(page);
37878c2ecf20Sopenharmony_ci		}
37888c2ecf20Sopenharmony_ci		put_page(page);
37898c2ecf20Sopenharmony_ci		vmf->page = NULL;
37908c2ecf20Sopenharmony_ci		return poisonret;
37918c2ecf20Sopenharmony_ci	}
37928c2ecf20Sopenharmony_ci
37938c2ecf20Sopenharmony_ci	if (unlikely(!(ret & VM_FAULT_LOCKED)))
37948c2ecf20Sopenharmony_ci		lock_page(vmf->page);
37958c2ecf20Sopenharmony_ci	else
37968c2ecf20Sopenharmony_ci		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
37978c2ecf20Sopenharmony_ci
37988c2ecf20Sopenharmony_ci	return ret;
37998c2ecf20Sopenharmony_ci}
38008c2ecf20Sopenharmony_ci
38018c2ecf20Sopenharmony_ci/*
38028c2ecf20Sopenharmony_ci * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
38038c2ecf20Sopenharmony_ci * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
38048c2ecf20Sopenharmony_ci * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
38058c2ecf20Sopenharmony_ci * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
38068c2ecf20Sopenharmony_ci */
38078c2ecf20Sopenharmony_cistatic int pmd_devmap_trans_unstable(pmd_t *pmd)
38088c2ecf20Sopenharmony_ci{
38098c2ecf20Sopenharmony_ci	return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
38108c2ecf20Sopenharmony_ci}
38118c2ecf20Sopenharmony_ci
38128c2ecf20Sopenharmony_cistatic vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
38138c2ecf20Sopenharmony_ci{
38148c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
38158c2ecf20Sopenharmony_ci
38168c2ecf20Sopenharmony_ci	if (!pmd_none(*vmf->pmd))
38178c2ecf20Sopenharmony_ci		goto map_pte;
38188c2ecf20Sopenharmony_ci	if (vmf->prealloc_pte) {
38198c2ecf20Sopenharmony_ci		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
38208c2ecf20Sopenharmony_ci		if (unlikely(!pmd_none(*vmf->pmd))) {
38218c2ecf20Sopenharmony_ci			spin_unlock(vmf->ptl);
38228c2ecf20Sopenharmony_ci			goto map_pte;
38238c2ecf20Sopenharmony_ci		}
38248c2ecf20Sopenharmony_ci
38258c2ecf20Sopenharmony_ci		mm_inc_nr_ptes(vma->vm_mm);
38268c2ecf20Sopenharmony_ci		pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
38278c2ecf20Sopenharmony_ci		spin_unlock(vmf->ptl);
38288c2ecf20Sopenharmony_ci		vmf->prealloc_pte = NULL;
38298c2ecf20Sopenharmony_ci	} else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
38308c2ecf20Sopenharmony_ci		return VM_FAULT_OOM;
38318c2ecf20Sopenharmony_ci	}
38328c2ecf20Sopenharmony_cimap_pte:
38338c2ecf20Sopenharmony_ci	/*
38348c2ecf20Sopenharmony_ci	 * If a huge pmd materialized under us just retry later.  Use
38358c2ecf20Sopenharmony_ci	 * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
38368c2ecf20Sopenharmony_ci	 * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
38378c2ecf20Sopenharmony_ci	 * under us and then back to pmd_none, as a result of MADV_DONTNEED
38388c2ecf20Sopenharmony_ci	 * running immediately after a huge pmd fault in a different thread of
38398c2ecf20Sopenharmony_ci	 * this mm, in turn leading to a misleading pmd_trans_huge() retval.
38408c2ecf20Sopenharmony_ci	 * All we have to ensure is that it is a regular pmd that we can walk
38418c2ecf20Sopenharmony_ci	 * with pte_offset_map() and we can do that through an atomic read in
38428c2ecf20Sopenharmony_ci	 * C, which is what pmd_trans_unstable() provides.
38438c2ecf20Sopenharmony_ci	 */
38448c2ecf20Sopenharmony_ci	if (pmd_devmap_trans_unstable(vmf->pmd))
38458c2ecf20Sopenharmony_ci		return VM_FAULT_NOPAGE;
38468c2ecf20Sopenharmony_ci
38478c2ecf20Sopenharmony_ci	/*
38488c2ecf20Sopenharmony_ci	 * At this point we know that our vmf->pmd points to a page of ptes
38498c2ecf20Sopenharmony_ci	 * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
38508c2ecf20Sopenharmony_ci	 * for the duration of the fault.  If a racing MADV_DONTNEED runs and
38518c2ecf20Sopenharmony_ci	 * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
38528c2ecf20Sopenharmony_ci	 * be valid and we will re-check to make sure the vmf->pte isn't
38538c2ecf20Sopenharmony_ci	 * pte_none() under vmf->ptl protection when we return to
38548c2ecf20Sopenharmony_ci	 * alloc_set_pte().
38558c2ecf20Sopenharmony_ci	 */
38568c2ecf20Sopenharmony_ci	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
38578c2ecf20Sopenharmony_ci			&vmf->ptl);
38588c2ecf20Sopenharmony_ci	return 0;
38598c2ecf20Sopenharmony_ci}
38608c2ecf20Sopenharmony_ci
38618c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
38628c2ecf20Sopenharmony_cistatic void deposit_prealloc_pte(struct vm_fault *vmf)
38638c2ecf20Sopenharmony_ci{
38648c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
38658c2ecf20Sopenharmony_ci
38668c2ecf20Sopenharmony_ci	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
38678c2ecf20Sopenharmony_ci	/*
38688c2ecf20Sopenharmony_ci	 * We are going to consume the prealloc table,
38698c2ecf20Sopenharmony_ci	 * count that as nr_ptes.
38708c2ecf20Sopenharmony_ci	 */
38718c2ecf20Sopenharmony_ci	mm_inc_nr_ptes(vma->vm_mm);
38728c2ecf20Sopenharmony_ci	vmf->prealloc_pte = NULL;
38738c2ecf20Sopenharmony_ci}
38748c2ecf20Sopenharmony_ci
38758c2ecf20Sopenharmony_cistatic vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
38768c2ecf20Sopenharmony_ci{
38778c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
38788c2ecf20Sopenharmony_ci	bool write = vmf->flags & FAULT_FLAG_WRITE;
38798c2ecf20Sopenharmony_ci	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
38808c2ecf20Sopenharmony_ci	pmd_t entry;
38818c2ecf20Sopenharmony_ci	int i;
38828c2ecf20Sopenharmony_ci	vm_fault_t ret = VM_FAULT_FALLBACK;
38838c2ecf20Sopenharmony_ci
38848c2ecf20Sopenharmony_ci	if (!transhuge_vma_suitable(vma, haddr))
38858c2ecf20Sopenharmony_ci		return ret;
38868c2ecf20Sopenharmony_ci
38878c2ecf20Sopenharmony_ci	page = compound_head(page);
38888c2ecf20Sopenharmony_ci	if (compound_order(page) != HPAGE_PMD_ORDER)
38898c2ecf20Sopenharmony_ci		return ret;
38908c2ecf20Sopenharmony_ci
38918c2ecf20Sopenharmony_ci	/*
	 * Archs like ppc64 need additional space to store information
38938c2ecf20Sopenharmony_ci	 * related to pte entry. Use the preallocated table for that.
38948c2ecf20Sopenharmony_ci	 */
38958c2ecf20Sopenharmony_ci	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
38968c2ecf20Sopenharmony_ci		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
38978c2ecf20Sopenharmony_ci		if (!vmf->prealloc_pte)
38988c2ecf20Sopenharmony_ci			return VM_FAULT_OOM;
38998c2ecf20Sopenharmony_ci		smp_wmb(); /* See comment in __pte_alloc() */
39008c2ecf20Sopenharmony_ci	}
39018c2ecf20Sopenharmony_ci
39028c2ecf20Sopenharmony_ci	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
39038c2ecf20Sopenharmony_ci	if (unlikely(!pmd_none(*vmf->pmd)))
39048c2ecf20Sopenharmony_ci		goto out;
39058c2ecf20Sopenharmony_ci
39068c2ecf20Sopenharmony_ci	for (i = 0; i < HPAGE_PMD_NR; i++)
39078c2ecf20Sopenharmony_ci		flush_icache_page(vma, page + i);
39088c2ecf20Sopenharmony_ci
39098c2ecf20Sopenharmony_ci	entry = mk_huge_pmd(page, vma->vm_page_prot);
39108c2ecf20Sopenharmony_ci	if (write)
39118c2ecf20Sopenharmony_ci		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
39128c2ecf20Sopenharmony_ci
39138c2ecf20Sopenharmony_ci	add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
39148c2ecf20Sopenharmony_ci	page_add_file_rmap(page, true);
39158c2ecf20Sopenharmony_ci	/*
39168c2ecf20Sopenharmony_ci	 * deposit and withdraw with pmd lock held
39178c2ecf20Sopenharmony_ci	 */
39188c2ecf20Sopenharmony_ci	if (arch_needs_pgtable_deposit())
39198c2ecf20Sopenharmony_ci		deposit_prealloc_pte(vmf);
39208c2ecf20Sopenharmony_ci
39218c2ecf20Sopenharmony_ci	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
39228c2ecf20Sopenharmony_ci
39238c2ecf20Sopenharmony_ci	update_mmu_cache_pmd(vma, haddr, vmf->pmd);
39248c2ecf20Sopenharmony_ci
39258c2ecf20Sopenharmony_ci	/* fault is handled */
39268c2ecf20Sopenharmony_ci	ret = 0;
39278c2ecf20Sopenharmony_ci	count_vm_event(THP_FILE_MAPPED);
39288c2ecf20Sopenharmony_ciout:
39298c2ecf20Sopenharmony_ci	spin_unlock(vmf->ptl);
39308c2ecf20Sopenharmony_ci	return ret;
39318c2ecf20Sopenharmony_ci}
39328c2ecf20Sopenharmony_ci#else
39338c2ecf20Sopenharmony_cistatic vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
39348c2ecf20Sopenharmony_ci{
39358c2ecf20Sopenharmony_ci	BUILD_BUG();
39368c2ecf20Sopenharmony_ci	return 0;
39378c2ecf20Sopenharmony_ci}
39388c2ecf20Sopenharmony_ci#endif
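
/*
 * Worked example for do_set_pmd(), with illustrative numbers (x86-64,
 * 4K base pages, 2M huge pages): vmf->address = 0x12345678 gives
 * haddr = 0x12200000 and the mapping covers HPAGE_PMD_NR = 512 base
 * pages.  transhuge_vma_suitable() rejects the attempt when the 2M
 * window around haddr does not fit inside the vma with suitably
 * aligned file offsets.
 */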
39398c2ecf20Sopenharmony_ci
39408c2ecf20Sopenharmony_ci/**
 * alloc_set_pte - set up a new PTE entry for the given page and add the
 * reverse page mapping. If needed, the function allocates a page table or
 * uses the pre-allocated one.
39438c2ecf20Sopenharmony_ci *
39448c2ecf20Sopenharmony_ci * @vmf: fault environment
39458c2ecf20Sopenharmony_ci * @page: page to map
39468c2ecf20Sopenharmony_ci *
39478c2ecf20Sopenharmony_ci * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
39488c2ecf20Sopenharmony_ci * return.
39498c2ecf20Sopenharmony_ci *
 * Target users are the page fault handler itself and implementations of
 * vm_ops->map_pages.
39528c2ecf20Sopenharmony_ci *
39538c2ecf20Sopenharmony_ci * Return: %0 on success, %VM_FAULT_ code in case of error.
39548c2ecf20Sopenharmony_ci */
39558c2ecf20Sopenharmony_civm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
39568c2ecf20Sopenharmony_ci{
39578c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = vmf->vma;
39588c2ecf20Sopenharmony_ci	bool write = vmf->flags & FAULT_FLAG_WRITE;
39598c2ecf20Sopenharmony_ci	pte_t entry;
39608c2ecf20Sopenharmony_ci	vm_fault_t ret;
39618c2ecf20Sopenharmony_ci
39628c2ecf20Sopenharmony_ci	if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
39638c2ecf20Sopenharmony_ci		ret = do_set_pmd(vmf, page);
39648c2ecf20Sopenharmony_ci		if (ret != VM_FAULT_FALLBACK)
39658c2ecf20Sopenharmony_ci			return ret;
39668c2ecf20Sopenharmony_ci	}
39678c2ecf20Sopenharmony_ci
39688c2ecf20Sopenharmony_ci	if (!vmf->pte) {
39698c2ecf20Sopenharmony_ci		ret = pte_alloc_one_map(vmf);
39708c2ecf20Sopenharmony_ci		if (ret)
39718c2ecf20Sopenharmony_ci			return ret;
39728c2ecf20Sopenharmony_ci	}
39738c2ecf20Sopenharmony_ci
39748c2ecf20Sopenharmony_ci	/* Re-check under ptl */
39758c2ecf20Sopenharmony_ci	if (unlikely(!pte_none(*vmf->pte))) {
39768c2ecf20Sopenharmony_ci		update_mmu_tlb(vma, vmf->address, vmf->pte);
39778c2ecf20Sopenharmony_ci		return VM_FAULT_NOPAGE;
39788c2ecf20Sopenharmony_ci	}
39798c2ecf20Sopenharmony_ci
	/* check for conflicting xpm integrity flags */
39818c2ecf20Sopenharmony_ci	if (unlikely(xpm_integrity_validate_hook(vmf->vma, vmf->flags,
39828c2ecf20Sopenharmony_ci		vmf->address, page)))
39838c2ecf20Sopenharmony_ci		return VM_FAULT_SIGSEGV;
39848c2ecf20Sopenharmony_ci
39858c2ecf20Sopenharmony_ci	flush_icache_page(vma, page);
39868c2ecf20Sopenharmony_ci	entry = mk_pte(page, vma->vm_page_prot);
39878c2ecf20Sopenharmony_ci	entry = pte_sw_mkyoung(entry);
39888c2ecf20Sopenharmony_ci	if (write)
39898c2ecf20Sopenharmony_ci		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
39908c2ecf20Sopenharmony_ci	/* copy-on-write page */
39918c2ecf20Sopenharmony_ci	if (write && !(vma->vm_flags & VM_SHARED)) {
39928c2ecf20Sopenharmony_ci		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
39938c2ecf20Sopenharmony_ci		page_add_new_anon_rmap(page, vma, vmf->address, false);
39948c2ecf20Sopenharmony_ci		lru_cache_add_inactive_or_unevictable(page, vma);
39958c2ecf20Sopenharmony_ci	} else {
39968c2ecf20Sopenharmony_ci		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
39978c2ecf20Sopenharmony_ci		page_add_file_rmap(page, false);
39988c2ecf20Sopenharmony_ci	}
39998c2ecf20Sopenharmony_ci	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
40008c2ecf20Sopenharmony_ci
40018c2ecf20Sopenharmony_ci	/* no need to invalidate: a not-present page won't be cached */
40028c2ecf20Sopenharmony_ci	update_mmu_cache(vma, vmf->address, vmf->pte);
40038c2ecf20Sopenharmony_ci
40048c2ecf20Sopenharmony_ci	return 0;
40058c2ecf20Sopenharmony_ci}
40068c2ecf20Sopenharmony_ci
40088c2ecf20Sopenharmony_ci/**
40098c2ecf20Sopenharmony_ci * finish_fault - finish page fault once we have prepared the page to fault
40108c2ecf20Sopenharmony_ci *
40118c2ecf20Sopenharmony_ci * @vmf: structure describing the fault
40128c2ecf20Sopenharmony_ci *
40138c2ecf20Sopenharmony_ci * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared. It handles locking of PTEs, inserts a PTE
 * for the given page, adds the reverse page mapping, and handles memcg
 * charges and LRU addition.
40178c2ecf20Sopenharmony_ci *
40188c2ecf20Sopenharmony_ci * The function expects the page to be locked and on success it consumes a
 * reference to the page being mapped (for the PTE which maps it).
40208c2ecf20Sopenharmony_ci *
40218c2ecf20Sopenharmony_ci * Return: %0 on success, %VM_FAULT_ code in case of error.
40228c2ecf20Sopenharmony_ci */
40238c2ecf20Sopenharmony_civm_fault_t finish_fault(struct vm_fault *vmf)
40248c2ecf20Sopenharmony_ci{
40258c2ecf20Sopenharmony_ci	struct page *page;
40268c2ecf20Sopenharmony_ci	vm_fault_t ret = 0;
40278c2ecf20Sopenharmony_ci
40288c2ecf20Sopenharmony_ci	/* Did we COW the page? */
40298c2ecf20Sopenharmony_ci	if ((vmf->flags & FAULT_FLAG_WRITE) &&
40308c2ecf20Sopenharmony_ci	    !(vmf->vma->vm_flags & VM_SHARED))
40318c2ecf20Sopenharmony_ci		page = vmf->cow_page;
40328c2ecf20Sopenharmony_ci	else
40338c2ecf20Sopenharmony_ci		page = vmf->page;
40348c2ecf20Sopenharmony_ci
40358c2ecf20Sopenharmony_ci	/*
40368c2ecf20Sopenharmony_ci	 * check even for read faults because we might have lost our CoWed
40378c2ecf20Sopenharmony_ci	 * page
40388c2ecf20Sopenharmony_ci	 */
40398c2ecf20Sopenharmony_ci	if (!(vmf->vma->vm_flags & VM_SHARED))
40408c2ecf20Sopenharmony_ci		ret = check_stable_address_space(vmf->vma->vm_mm);
40418c2ecf20Sopenharmony_ci	if (!ret)
40428c2ecf20Sopenharmony_ci		ret = alloc_set_pte(vmf, page);
40438c2ecf20Sopenharmony_ci	if (vmf->pte)
40448c2ecf20Sopenharmony_ci		pte_unmap_unlock(vmf->pte, vmf->ptl);
40458c2ecf20Sopenharmony_ci	return ret;
40468c2ecf20Sopenharmony_ci}
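
/*
 * Illustrative sketch, not part of this file: a minimal ->fault handler
 * of the kind whose result finish_fault() consumes.  The handler hands
 * back a referenced page in vmf->page; VM_FAULT_LOCKED tells
 * __do_fault() the page is already locked.  mydrv_lookup_page() is
 * hypothetical.
 */
#if 0
static vm_fault_t mydrv_fault(struct vm_fault *vmf)
{
	struct page *page;

	page = mydrv_lookup_page(vmf->vma->vm_file, vmf->pgoff);
	if (!page)
		return VM_FAULT_SIGBUS;

	get_page(page);
	lock_page(page);
	vmf->page = page;
	return VM_FAULT_LOCKED;
}
#endif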
40478c2ecf20Sopenharmony_ci
40488c2ecf20Sopenharmony_cistatic unsigned long fault_around_bytes __read_mostly =
40498c2ecf20Sopenharmony_ci	rounddown_pow_of_two(65536);
40508c2ecf20Sopenharmony_ci
40518c2ecf20Sopenharmony_ci#ifdef CONFIG_DEBUG_FS
40528c2ecf20Sopenharmony_cistatic int fault_around_bytes_get(void *data, u64 *val)
40538c2ecf20Sopenharmony_ci{
40548c2ecf20Sopenharmony_ci	*val = fault_around_bytes;
40558c2ecf20Sopenharmony_ci	return 0;
40568c2ecf20Sopenharmony_ci}
40578c2ecf20Sopenharmony_ci
40588c2ecf20Sopenharmony_ci/*
 * fault_around_bytes must be rounded down to a power-of-two number of pages,
 * as that is what do_fault_around() expects to see.
40618c2ecf20Sopenharmony_ci */
40628c2ecf20Sopenharmony_cistatic int fault_around_bytes_set(void *data, u64 val)
40638c2ecf20Sopenharmony_ci{
40648c2ecf20Sopenharmony_ci	if (val / PAGE_SIZE > PTRS_PER_PTE)
40658c2ecf20Sopenharmony_ci		return -EINVAL;
40668c2ecf20Sopenharmony_ci	if (val > PAGE_SIZE)
40678c2ecf20Sopenharmony_ci		fault_around_bytes = rounddown_pow_of_two(val);
40688c2ecf20Sopenharmony_ci	else
40698c2ecf20Sopenharmony_ci		fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
40708c2ecf20Sopenharmony_ci	return 0;
40718c2ecf20Sopenharmony_ci}
40728c2ecf20Sopenharmony_ciDEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
40738c2ecf20Sopenharmony_ci		fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
40748c2ecf20Sopenharmony_ci
40758c2ecf20Sopenharmony_cistatic int __init fault_around_debugfs(void)
40768c2ecf20Sopenharmony_ci{
40778c2ecf20Sopenharmony_ci	debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
40788c2ecf20Sopenharmony_ci				   &fault_around_bytes_fops);
40798c2ecf20Sopenharmony_ci	return 0;
40808c2ecf20Sopenharmony_ci}
40818c2ecf20Sopenharmony_cilate_initcall(fault_around_debugfs);
40828c2ecf20Sopenharmony_ci#endif
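
/*
 * Usage note, assuming debugfs is mounted at /sys/kernel/debug: the
 * knob created above can be tuned at runtime, e.g.
 *	echo 16384 > /sys/kernel/debug/fault_around_bytes
 * Values are rounded down to a power of two; anything <= PAGE_SIZE maps
 * a single page, which effectively disables fault-around (see the
 * fault_around_bytes >> PAGE_SHIFT > 1 check in do_read_fault()).
 */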
40838c2ecf20Sopenharmony_ci
40848c2ecf20Sopenharmony_ci/*
 * do_fault_around() tries to map a few pages around the fault address. The hope
40868c2ecf20Sopenharmony_ci * is that the pages will be needed soon and this will lower the number of
40878c2ecf20Sopenharmony_ci * faults to handle.
40888c2ecf20Sopenharmony_ci *
40898c2ecf20Sopenharmony_ci * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
40908c2ecf20Sopenharmony_ci * not ready to be mapped: not up-to-date, locked, etc.
40918c2ecf20Sopenharmony_ci *
40928c2ecf20Sopenharmony_ci * This function is called with the page table lock taken. In the split ptlock
 * case the page table lock protects only those entries which belong to
40948c2ecf20Sopenharmony_ci * the page table corresponding to the fault address.
40958c2ecf20Sopenharmony_ci *
40968c2ecf20Sopenharmony_ci * This function doesn't cross the VMA boundaries, in order to call map_pages()
40978c2ecf20Sopenharmony_ci * only once.
40988c2ecf20Sopenharmony_ci *
40998c2ecf20Sopenharmony_ci * fault_around_bytes defines how many bytes we'll try to map.
41008c2ecf20Sopenharmony_ci * do_fault_around() expects it to be set to a power of two less than or equal
41018c2ecf20Sopenharmony_ci * to PTRS_PER_PTE.
41028c2ecf20Sopenharmony_ci *
41038c2ecf20Sopenharmony_ci * The virtual address of the area that we map is naturally aligned to
41048c2ecf20Sopenharmony_ci * fault_around_bytes rounded down to the machine page size
41058c2ecf20Sopenharmony_ci * (and therefore to page order).  This way it's easier to guarantee
41068c2ecf20Sopenharmony_ci * that we don't cross page table boundaries.
 */
static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
	unsigned long address = vmf->address, nr_pages, mask;
	pgoff_t start_pgoff = vmf->pgoff;
	pgoff_t end_pgoff;
	int off;
	vm_fault_t ret = 0;

	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

	vmf->address = max(address & mask, vmf->vma->vm_start);
	off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	start_pgoff -= off;

	/*
	 * end_pgoff is either the end of the page table, the end of
	 * the vma or nr_pages from start_pgoff, depending on what is nearest.
	 */
	end_pgoff = start_pgoff -
		((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
		PTRS_PER_PTE - 1;
	end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
			start_pgoff + nr_pages - 1);

	if (pmd_none(*vmf->pmd)) {
		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
		if (!vmf->prealloc_pte)
			goto out;
		smp_wmb(); /* See comment in __pte_alloc() */
	}

	vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);

	/* Huge page is mapped? Page fault is solved */
	if (pmd_trans_huge(*vmf->pmd)) {
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/* ->map_pages() hasn't done anything useful. Cold page cache? */
	if (!vmf->pte)
		goto out;

	/* check if the page fault is solved */
	vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
	if (!pte_none(*vmf->pte))
		ret = VM_FAULT_NOPAGE;
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
	vmf->address = address;
	vmf->pte = NULL;
	return ret;
}

static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret = 0;

	/*
	 * Let's call ->map_pages() first and use ->fault() as fallback
	 * if the page at that offset is not ready to be mapped (cold page
	 * cache or something).
	 */
	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
		ret = do_fault_around(vmf);
		if (ret)
			return ret;
	}

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	ret |= finish_fault(vmf);
	unlock_page(vmf->page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		put_page(vmf->page);
	return ret;
}

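/*
 * Handle a write fault on a private file mapping: allocate an anonymous
 * page and charge it to the memcg, read the backing page in via ->fault(),
 * then copy its contents into the new page, which is mapped in place of
 * the file page.
 */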
static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret;

	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;

	vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
	if (!vmf->cow_page)
		return VM_FAULT_OOM;

	if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
		put_page(vmf->cow_page);
		return VM_FAULT_OOM;
	}
	cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;
	if (ret & VM_FAULT_DONE_COW)
		return ret;

	copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
	__SetPageUptodate(vmf->cow_page);

	ret |= finish_fault(vmf);
	unlock_page(vmf->page);
	put_page(vmf->page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;
	return ret;
uncharge_out:
	put_page(vmf->cow_page);
	return ret;
}

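/*
 * Handle a write fault on a shared file mapping: read the page in via
 * ->fault(), give the filesystem a chance to prepare for the write via
 * ->page_mkwrite(), map the page, and finally mark it dirty.
 */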
static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret, tmp;

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	/*
	 * Check if the backing address space wants to know that the page is
	 * about to become writable
	 */
	if (vma->vm_ops->page_mkwrite) {
		unlock_page(vmf->page);
		tmp = do_page_mkwrite(vmf);
		if (unlikely(!tmp ||
				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			put_page(vmf->page);
			return tmp;
		}
	}

	ret |= finish_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
					VM_FAULT_RETRY))) {
		unlock_page(vmf->page);
		put_page(vmf->page);
		return ret;
	}

	ret |= fault_dirty_shared_page(vmf);
	return ret;
}

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 * If mmap_lock is released, vma may become invalid (for example
 * by another thread calling munmap()).
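 *
 * Depending on the fault type, this dispatches to do_read_fault(),
 * do_cow_fault() (a write to a private mapping) or do_shared_fault()
 * (a write to a shared mapping).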
 */
static vm_fault_t do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *vm_mm = vma->vm_mm;
	vm_fault_t ret;

	/*
	 * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
	 */
	if (!vma->vm_ops->fault) {
		/*
		 * If we find a migration pmd entry or a none pmd entry, which
		 * should never happen, return SIGBUS
		 */
		if (unlikely(!pmd_present(*vmf->pmd)))
			ret = VM_FAULT_SIGBUS;
		else {
			vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
						       vmf->pmd,
						       vmf->address,
						       &vmf->ptl);
			/*
			 * Make sure this is not a temporary clearing of pte
			 * by holding ptl and checking again. A R/M/W update
			 * of the pte involves taking the ptl, clearing the
			 * pte (so that we don't race with a concurrent
			 * hardware modification) and then writing the update.
			 */
			if (unlikely(pte_none(*vmf->pte)))
				ret = VM_FAULT_SIGBUS;
			else
				ret = VM_FAULT_NOPAGE;

			pte_unmap_unlock(vmf->pte, vmf->ptl);
		}
	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
		ret = do_read_fault(vmf);
	else if (!(vma->vm_flags & VM_SHARED))
		ret = do_cow_fault(vmf);
	else
		ret = do_shared_fault(vmf);

	/* preallocated pagetable is unused: free it */
	if (vmf->prealloc_pte) {
		pte_free(vm_mm, vmf->prealloc_pte);
		vmf->prealloc_pte = NULL;
	}
	return ret;
}

static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
				unsigned long addr, int page_nid,
				int *flags)
{
	get_page(page);

	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (page_nid == numa_node_id()) {
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
		*flags |= TNF_FAULT_LOCAL;
	}

	return mpol_misplaced(page, vma, addr);
}

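/*
 * Handle a NUMA hinting fault: the pte was made PROT_NONE by the NUMA
 * balancing scanner. Restore access to the page, then decide whether it
 * should be migrated to the node that is now touching it, and account
 * the fault for the NUMA placement heuristics.
 */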
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = NULL;
	int page_nid = NUMA_NO_NODE;
	int last_cpupid;
	int target_nid;
	bool migrated = false;
	pte_t pte, old_pte;
	bool was_writable = pte_savedwrite(vmf->orig_pte);
	int flags = 0;

	/*
	 * The "pte" at this point cannot be used safely without
	 * validation through pte_unmap_same(). It's of NUMA type but
	 * the pfn may be bogus if the read is not atomic.
	 */
	vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
	spin_lock(vmf->ptl);
	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		goto out;
	}

	/*
	 * Make it present again. Depending on how the arch implements
	 * non-accessible ptes, some can allow access from kernel mode.
	 */
	old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
	pte = pte_modify(old_pte, vma->vm_page_prot);
	pte = pte_mkyoung(pte);
	if (was_writable)
		pte = pte_mkwrite(pte);
	ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
	update_mmu_cache(vma, vmf->address, vmf->pte);

	page = vm_normal_page(vma, vmf->address, pte);
	if (!page) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return 0;
	}

	/* TODO: handle PTE-mapped THP */
	if (PageCompound(page)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return 0;
	}

	/*
	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
	 * much anyway since they can be in shared cache state. This misses
	 * the case where a mapping is writable but the process never writes
	 * to it: pte_write gets cleared during protection updates, and
	 * pte_dirty has unpredictable behaviour between PTE scan updates,
	 * background writeback, dirty balancing and application behaviour.
	 */
	if (!pte_write(pte))
		flags |= TNF_NO_GROUP;

	/*
	 * Flag if the page is shared between multiple address spaces. This
	 * is later used when determining whether to group tasks together
	 */
	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
		flags |= TNF_SHARED;

	last_cpupid = page_cpupid_last(page);
	page_nid = page_to_nid(page);
	target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
			&flags);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	if (target_nid == NUMA_NO_NODE) {
		put_page(page);
		goto out;
	}

	/* Migrate to the requested node */
	migrated = migrate_misplaced_page(page, vma, target_nid);
	if (migrated) {
		page_nid = target_nid;
		flags |= TNF_MIGRATED;
	} else
		flags |= TNF_MIGRATE_FAIL;

out:
	if (page_nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, page_nid, 1, flags);
	return 0;
}

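/*
 * Huge fault helpers. Anonymous VMAs are handled by the THP code; file
 * mappings go through vm_ops->huge_fault(). A VM_FAULT_FALLBACK return
 * makes the caller retry the fault at the pte level, splitting the huge
 * entry where necessary.
 */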
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
	if (vma_is_anonymous(vmf->vma))
		return do_huge_pmd_anonymous_page(vmf);
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
	return VM_FAULT_FALLBACK;
}

/* `inline' is required to avoid gcc 4.1.2 build error */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
	if (vma_is_anonymous(vmf->vma)) {
		if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
			return handle_userfault(vmf, VM_UFFD_WP);
		return do_huge_pmd_wp_page(vmf, orig_pmd);
	}
	if (vmf->vma->vm_ops->huge_fault) {
		vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);

		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	}

	/* COW or write-notify handled on pte level: split pmd. */
	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);

	return VM_FAULT_FALLBACK;
}

static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&			\
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
	/* No support for anonymous transparent PUD pages yet */
	if (vma_is_anonymous(vmf->vma))
		return VM_FAULT_FALLBACK;
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
	return VM_FAULT_FALLBACK;
}

static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&			\
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
	/* No support for anonymous transparent PUD pages yet */
	if (vma_is_anonymous(vmf->vma))
		goto split;
	if (vmf->vma->vm_ops->huge_fault) {
		vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);

		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	}
split:
	/* COW or write-notify not handled on PUD level: split pud. */
	__split_huge_pud(vmf->vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
	return VM_FAULT_FALLBACK;
}

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (i.e. the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_lock may have been released depending on flags and our return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
	pte_t entry;

	if (unlikely(pmd_none(*vmf->pmd))) {
		/*
		 * Leave __pte_alloc() until later: because vm_ops->fault may
		 * want to allocate huge page, and if we expose page table
		 * for an instant, it will be difficult to retract from
		 * concurrent faults and from rmap lookups.
		 */
		vmf->pte = NULL;
	} else {
		/* See comment in pte_alloc_one_map() */
		if (pmd_devmap_trans_unstable(vmf->pmd))
			return 0;
		/*
		 * A regular pmd is established and it can't morph into a huge
		 * pmd from under us anymore at this point because we hold the
		 * mmap_lock read mode and khugepaged takes it in write mode.
		 * So now it's safe to run pte_offset_map().
		 */
		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
		vmf->orig_pte = *vmf->pte;

		/*
		 * Some architectures can have ptes larger than the word size,
		 * e.g. ppc44x-defconfig has CONFIG_PTE_64BIT=y and
		 * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
		 * accesses.  The code below just needs a consistent view
		 * for the ifs and we later double check anyway with the
		 * ptl lock held. So here a barrier will do.
		 */
		barrier();
		if (pte_none(vmf->orig_pte)) {
			pte_unmap(vmf->pte);
			vmf->pte = NULL;
		}
	}

	if (!vmf->pte) {
		if (vma_is_anonymous(vmf->vma))
			return do_anonymous_page(vmf);
		else
			return do_fault(vmf);
	}

	if (!pte_present(vmf->orig_pte))
		return do_swap_page(vmf);

	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
		return do_numa_page(vmf);

	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	spin_lock(vmf->ptl);
	entry = vmf->orig_pte;
	if (unlikely(!pte_same(*vmf->pte, entry))) {
		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
		goto unlock;
	}
	if (vmf->flags & FAULT_FLAG_WRITE) {
		if (!pte_write(entry))
			return do_wp_page(vmf);
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
				vmf->flags & FAULT_FLAG_WRITE)) {
		update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
	} else {
		/* Skip spurious TLB flush for retried page fault */
		if (vmf->flags & FAULT_FLAG_TRIED)
			goto unlock;
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
		if (vmf->flags & FAULT_FLAG_WRITE)
			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
	}
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;
}

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags)
{
	struct vm_fault vmf = {
		.vma = vma,
		.address = address & PAGE_MASK,
		.flags = flags,
		.pgoff = linear_page_index(vma, address),
		.gfp_mask = __get_fault_gfp_mask(vma),
	};
	unsigned int dirty = flags & FAULT_FLAG_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	p4d_t *p4d;
	vm_fault_t ret;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return VM_FAULT_OOM;

	vmf.pud = pud_alloc(mm, p4d, address);
	if (!vmf.pud)
		return VM_FAULT_OOM;
retry_pud:
	if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
		ret = create_huge_pud(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		pud_t orig_pud = *vmf.pud;

		barrier();
		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {

			/* NUMA case for anonymous PUDs would go here */

			if (dirty && !pud_write(orig_pud)) {
				ret = wp_huge_pud(&vmf, orig_pud);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
				huge_pud_set_accessed(&vmf, orig_pud);
				return 0;
			}
		}
	}

	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
	if (!vmf.pmd)
		return VM_FAULT_OOM;

	/* Huge pud page fault raced with pmd_alloc? */
	if (pud_trans_unstable(vmf.pud))
		goto retry_pud;

	if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
		ret = create_huge_pmd(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		pmd_t orig_pmd = *vmf.pmd;

		barrier();
		if (unlikely(is_swap_pmd(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					  !is_pmd_migration_entry(orig_pmd));
			if (is_pmd_migration_entry(orig_pmd))
				pmd_migration_entry_wait(mm, vmf.pmd);
			return 0;
		}
		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
				return do_huge_pmd_numa_page(&vmf, orig_pmd);

			if (dirty && !pmd_write(orig_pmd)) {
				ret = wp_huge_pmd(&vmf, orig_pmd);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
				huge_pmd_set_accessed(&vmf, orig_pmd);
				return 0;
			}
		}
	}

	return handle_pte_fault(&vmf);
}

/**
 * mm_account_fault - Do page fault accounting
 *
 * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
 *        of perf event counters, but we'll still do the per-task accounting to
 *        the task who triggered this page fault.
 * @address: the faulted address.
 * @flags: the fault flags.
 * @ret: the fault retcode.
 *
 * This will take care of most of the page fault accounting.  Meanwhile, it
 * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
 * updates.  However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
 * still be in per-arch page fault handlers at the entry of page fault.
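 *
 * For example, a fault that first returned VM_FAULT_RETRY and then completed
 * on a second attempt (with FAULT_FLAG_TRIED set) is accounted exactly once,
 * as a major fault.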
 */
static inline void mm_account_fault(struct pt_regs *regs,
				    unsigned long address, unsigned int flags,
				    vm_fault_t ret)
{
	bool major;

	/*
	 * We don't do accounting for some specific faults:
	 *
	 * - Unsuccessful faults (e.g. when the address wasn't valid).  That
	 *   includes arch_vma_access_permitted() failing before reaching here.
	 *   So this is not a "this many hardware page faults" counter.  We
	 *   should use the hw profiling for that.
	 *
	 * - Incomplete faults (VM_FAULT_RETRY).  They will only be counted
	 *   once they're completed.
	 */
	if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
		return;

	/*
	 * We define the fault as a major fault when the final successful fault
	 * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
	 * handle it immediately previously).
	 */
	major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);

	if (major)
		current->maj_flt++;
	else
		current->min_flt++;

	/*
	 * If the fault is done for GUP, regs will be NULL.  We only do the
	 * accounting for the per-thread fault counters of the task that
	 * triggered the fault, and we skip the perf event updates.
	 */
	if (!regs)
		return;

	if (major)
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
	else
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
			   unsigned int flags, struct pt_regs *regs)
{
	vm_fault_t ret;

	__set_current_state(TASK_RUNNING);

	count_vm_event(PGFAULT);
	count_memcg_event_mm(vma->vm_mm, PGFAULT);

	/* do counter updates before entering really critical section. */
	check_sync_rss_stat(current);

	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
					    flags & FAULT_FLAG_INSTRUCTION,
					    flags & FAULT_FLAG_REMOTE))
		return VM_FAULT_SIGSEGV;

	/*
	 * Enable the memcg OOM handling for faults triggered in user
	 * space.  Kernel faults are handled more gracefully.
	 */
	if (flags & FAULT_FLAG_USER)
		mem_cgroup_enter_user_fault();

	if (unlikely(is_vm_hugetlb_page(vma)))
		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
	else
		ret = __handle_mm_fault(vma, address, flags);

	if (flags & FAULT_FLAG_USER) {
		mem_cgroup_exit_user_fault();
		/*
		 * The task may have entered a memcg OOM situation but
		 * if the allocation error was handled gracefully (no
		 * VM_FAULT_OOM), there is no need to kill anything.
		 * Just clean up the OOM state peacefully.
		 */
		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
			mem_cgroup_oom_synchronize(false);
	}

	mm_account_fault(regs, address, flags, ret);

	return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);

#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Allocate p4d page table.
 * We've already handled the fast-path in-line.
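 *
 * The pattern below is shared by the pud and pmd allocators as well:
 * allocate the new table without holding any lock, then recheck under
 * page_table_lock whether another thread populated the entry in the
 * meantime, and free the new table if we lost the race.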
 */
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	p4d_t *new = p4d_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&mm->page_table_lock);
	if (pgd_present(*pgd))		/* Another has populated it */
		p4d_free(mm, new);
	else
		pgd_populate(mm, pgd, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#endif /* __PAGETABLE_P4D_FOLDED */

#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
{
	pud_t *new = pud_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&mm->page_table_lock);
	if (!p4d_present(*p4d)) {
		mm_inc_nr_puds(mm);
		p4d_populate(mm, p4d, new);
	} else	/* Another has populated it */
		pud_free(mm, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */

#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	spinlock_t *ptl;
	pmd_t *new = pmd_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	ptl = pud_lock(mm, pud);
	if (!pud_present(*pud)) {
		mm_inc_nr_pmds(mm);
		pud_populate(mm, pud, new);
	} else	/* Another has populated it */
		pmd_free(mm, new);
	spin_unlock(ptl);
	return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */

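/*
 * Walk the page tables for @address and, on success, return 0 with either
 * the pte mapped and locked (*ptepp) or, for a huge pmd when @pmdpp is
 * given, the pmd locked (*pmdpp).  If @range is non-NULL, an mmu notifier
 * invalidation is started and the caller must end it.  This is the helper
 * behind follow_pte().
 */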
int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
			  struct mmu_notifier_range *range, pte_t **ptepp,
			  pmd_t **pmdpp, spinlock_t **ptlp)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;

	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
		goto out;

	pud = pud_offset(p4d, address);
	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
		goto out;

	pmd = pmd_offset(pud, address);
	VM_BUG_ON(pmd_trans_huge(*pmd));

	if (pmd_huge(*pmd)) {
		if (!pmdpp)
			goto out;

		if (range) {
			mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
						NULL, mm, address & PMD_MASK,
						(address & PMD_MASK) + PMD_SIZE);
			mmu_notifier_invalidate_range_start(range);
		}
		*ptlp = pmd_lock(mm, pmd);
		if (pmd_huge(*pmd)) {
			*pmdpp = pmd;
			return 0;
		}
		spin_unlock(*ptlp);
		if (range)
			mmu_notifier_invalidate_range_end(range);
	}

	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		goto out;

	if (range) {
		mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
					address & PAGE_MASK,
					(address & PAGE_MASK) + PAGE_SIZE);
		mmu_notifier_invalidate_range_start(range);
	}
	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
	if (!pte_present(*ptep))
		goto unlock;
	*ptepp = ptep;
	return 0;
unlock:
	pte_unmap_unlock(ptep, *ptlp);
	if (range)
		mmu_notifier_invalidate_range_end(range);
out:
	return -EINVAL;
}

/**
 * follow_pte - look up PTE at a user virtual address
 * @mm: the mm_struct of the target address space
 * @address: user virtual address
 * @ptepp: location to store found PTE
 * @ptlp: location to store the lock for the PTE
 *
 * On a successful return, the pointer to the PTE is stored in @ptepp;
 * the corresponding lock is taken and its location is stored in @ptlp.
 * The contents of the PTE are only stable until @ptlp is released;
 * any further use, if any, must be protected against invalidation
 * with MMU notifiers.
 *
 * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
 * should be taken for read.
 *
 * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
 * it is not a good general-purpose API.
 *
 * Return: zero on success, -ve otherwise.
 */
int follow_pte(struct mm_struct *mm, unsigned long address,
	       pte_t **ptepp, spinlock_t **ptlp)
{
	return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
}
EXPORT_SYMBOL_GPL(follow_pte);

/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * This function does not allow the caller to read the permissions
 * of the PTE.  Do not use it.
 *
 * Return: zero and the pfn at @pfn on success, -ve otherwise.
 */
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
	unsigned long *pfn)
{
	int ret = -EINVAL;
	spinlock_t *ptl;
	pte_t *ptep;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		return ret;

	ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
	if (ret)
		return ret;
	*pfn = pte_pfn(*ptep);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(follow_pfn);

#ifdef CONFIG_HAVE_IOREMAP_PROT
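/*
 * Resolve @address in a VM_IO/VM_PFNMAP vma to its physical address and
 * page protection.  Fails if no pte is present, or if FOLL_WRITE is
 * requested but the pte is not writable.
 */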
int follow_phys(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags,
		unsigned long *prot, resource_size_t *phys)
{
	int ret = -EINVAL;
	pte_t *ptep, pte;
	spinlock_t *ptl;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		goto out;

	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
		goto out;
	pte = *ptep;

	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;

	*prot = pgprot_val(pte_pgprot(pte));
	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;

	ret = 0;
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return ret;
}

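/*
 * Generic ->access() implementation for VM_IO/VM_PFNMAP mappings: resolve
 * the address to a physical address with follow_phys(), map it temporarily
 * with ioremap_prot() and copy the data through the temporary mapping.
 */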
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
			void *buf, int len, int write)
{
	resource_size_t phys_addr;
	unsigned long prot = 0;
	void __iomem *maddr;
	int offset = addr & (PAGE_SIZE-1);

	if (follow_phys(vma, addr, write, &prot, &phys_addr))
		return -EINVAL;

	maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
	if (!maddr)
		return -ENOMEM;

	if (write)
		memcpy_toio(maddr + offset, buf, len);
	else
		memcpy_fromio(buf, maddr + offset, len);
	iounmap(maddr);

	return len;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif

/*
 * Access another process' address space as given in mm.  If non-NULL, use the
 * given task for page fault accounting.
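 *
 * The transfer is done page by page: each iteration pins one page with
 * get_user_pages_remote(), copies up to a page worth of data through a
 * temporary kmap(), and falls back to vma->vm_ops->access() for VM_IO or
 * VM_PFNMAP mappings that have no struct page behind them.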
50588c2ecf20Sopenharmony_ci */
50598c2ecf20Sopenharmony_ciint __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
50608c2ecf20Sopenharmony_ci		unsigned long addr, void *buf, int len, unsigned int gup_flags)
50618c2ecf20Sopenharmony_ci{
50628c2ecf20Sopenharmony_ci	struct vm_area_struct *vma;
50638c2ecf20Sopenharmony_ci	void *old_buf = buf;
50648c2ecf20Sopenharmony_ci	int write = gup_flags & FOLL_WRITE;
50658c2ecf20Sopenharmony_ci
50668c2ecf20Sopenharmony_ci	if (mmap_read_lock_killable(mm))
50678c2ecf20Sopenharmony_ci		return 0;
50688c2ecf20Sopenharmony_ci
50698c2ecf20Sopenharmony_ci	/* ignore errors, just check how much was successfully transferred */
50708c2ecf20Sopenharmony_ci	while (len) {
50718c2ecf20Sopenharmony_ci		int bytes, ret, offset;
50728c2ecf20Sopenharmony_ci		void *maddr;
50738c2ecf20Sopenharmony_ci		struct page *page = NULL;
50748c2ecf20Sopenharmony_ci
50758c2ecf20Sopenharmony_ci		ret = get_user_pages_remote(mm, addr, 1,
50768c2ecf20Sopenharmony_ci				gup_flags, &page, &vma, NULL);
50778c2ecf20Sopenharmony_ci		if (ret <= 0) {
50788c2ecf20Sopenharmony_ci#ifndef CONFIG_HAVE_IOREMAP_PROT
50798c2ecf20Sopenharmony_ci			break;
50808c2ecf20Sopenharmony_ci#else
50818c2ecf20Sopenharmony_ci			/*
50828c2ecf20Sopenharmony_ci			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
50838c2ecf20Sopenharmony_ci			 * we can access using slightly different code.
50848c2ecf20Sopenharmony_ci			 */
50858c2ecf20Sopenharmony_ci			vma = find_vma(mm, addr);
50868c2ecf20Sopenharmony_ci			if (!vma || vma->vm_start > addr)
50878c2ecf20Sopenharmony_ci				break;
50888c2ecf20Sopenharmony_ci			if (vma->vm_ops && vma->vm_ops->access)
50898c2ecf20Sopenharmony_ci				ret = vma->vm_ops->access(vma, addr, buf,
50908c2ecf20Sopenharmony_ci							  len, write);
50918c2ecf20Sopenharmony_ci			if (ret <= 0)
50928c2ecf20Sopenharmony_ci				break;
50938c2ecf20Sopenharmony_ci			bytes = ret;
50948c2ecf20Sopenharmony_ci#endif
50958c2ecf20Sopenharmony_ci		} else {
50968c2ecf20Sopenharmony_ci			bytes = len;
50978c2ecf20Sopenharmony_ci			offset = addr & (PAGE_SIZE-1);
50988c2ecf20Sopenharmony_ci			if (bytes > PAGE_SIZE-offset)
50998c2ecf20Sopenharmony_ci				bytes = PAGE_SIZE-offset;
51008c2ecf20Sopenharmony_ci
51018c2ecf20Sopenharmony_ci			maddr = kmap(page);
51028c2ecf20Sopenharmony_ci			if (write) {
51038c2ecf20Sopenharmony_ci				copy_to_user_page(vma, page, addr,
51048c2ecf20Sopenharmony_ci						  maddr + offset, buf, bytes);
51058c2ecf20Sopenharmony_ci				set_page_dirty_lock(page);
51068c2ecf20Sopenharmony_ci			} else {
51078c2ecf20Sopenharmony_ci				copy_from_user_page(vma, page, addr,
51088c2ecf20Sopenharmony_ci						    buf, maddr + offset, bytes);
51098c2ecf20Sopenharmony_ci			}
51108c2ecf20Sopenharmony_ci			kunmap(page);
51118c2ecf20Sopenharmony_ci			put_page(page);
51128c2ecf20Sopenharmony_ci		}
51138c2ecf20Sopenharmony_ci		len -= bytes;
51148c2ecf20Sopenharmony_ci		buf += bytes;
51158c2ecf20Sopenharmony_ci		addr += bytes;
51168c2ecf20Sopenharmony_ci	}
51178c2ecf20Sopenharmony_ci	mmap_read_unlock(mm);
51188c2ecf20Sopenharmony_ci
51198c2ecf20Sopenharmony_ci	return buf - old_buf;
51208c2ecf20Sopenharmony_ci}
51218c2ecf20Sopenharmony_ci
51228c2ecf20Sopenharmony_ci/**
51238c2ecf20Sopenharmony_ci * access_remote_vm - access another process' address space
51248c2ecf20Sopenharmony_ci * @mm:		the mm_struct of the target address space
51258c2ecf20Sopenharmony_ci * @addr:	start address to access
51268c2ecf20Sopenharmony_ci * @buf:	source or destination buffer
51278c2ecf20Sopenharmony_ci * @len:	number of bytes to transfer
51288c2ecf20Sopenharmony_ci * @gup_flags:	flags modifying lookup behaviour
51298c2ecf20Sopenharmony_ci *
51308c2ecf20Sopenharmony_ci * The caller must hold a reference on @mm.
51318c2ecf20Sopenharmony_ci *
51328c2ecf20Sopenharmony_ci * Return: number of bytes copied from source to destination.
51338c2ecf20Sopenharmony_ci */
51348c2ecf20Sopenharmony_ciint access_remote_vm(struct mm_struct *mm, unsigned long addr,
51358c2ecf20Sopenharmony_ci		void *buf, int len, unsigned int gup_flags)
51368c2ecf20Sopenharmony_ci{
51378c2ecf20Sopenharmony_ci	return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
51388c2ecf20Sopenharmony_ci}
51398c2ecf20Sopenharmony_ci
51408c2ecf20Sopenharmony_ci/*
51418c2ecf20Sopenharmony_ci * Access another process' address space.
51428c2ecf20Sopenharmony_ci * Source/target buffer must be kernel space,
51438c2ecf20Sopenharmony_ci * Do not walk the page table directly, use get_user_pages
51448c2ecf20Sopenharmony_ci */
51458c2ecf20Sopenharmony_ciint access_process_vm(struct task_struct *tsk, unsigned long addr,
51468c2ecf20Sopenharmony_ci		void *buf, int len, unsigned int gup_flags)
51478c2ecf20Sopenharmony_ci{
51488c2ecf20Sopenharmony_ci	struct mm_struct *mm;
51498c2ecf20Sopenharmony_ci	int ret;
51508c2ecf20Sopenharmony_ci
51518c2ecf20Sopenharmony_ci	mm = get_task_mm(tsk);
51528c2ecf20Sopenharmony_ci	if (!mm)
51538c2ecf20Sopenharmony_ci		return 0;
51548c2ecf20Sopenharmony_ci
51558c2ecf20Sopenharmony_ci	ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
51568c2ecf20Sopenharmony_ci
51578c2ecf20Sopenharmony_ci	mmput(mm);
51588c2ecf20Sopenharmony_ci
51598c2ecf20Sopenharmony_ci	return ret;
51608c2ecf20Sopenharmony_ci}
51618c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(access_process_vm);
51628c2ecf20Sopenharmony_ci
51638c2ecf20Sopenharmony_ci/*
51648c2ecf20Sopenharmony_ci * Print the name of a VMA.
51658c2ecf20Sopenharmony_ci */
void print_vma_addr(char *prefix, unsigned long ip)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	/*
	 * We might be running from an atomic context, so we cannot sleep.
	 */
	if (!mmap_read_trylock(mm))
		return;

	vma = find_vma(mm, ip);
	if (vma && vma->vm_file) {
		struct file *f = vma->vm_file;
		char *buf = (char *)__get_free_page(GFP_NOWAIT);

		if (buf) {
			char *p;

			p = file_path(f, buf, PAGE_SIZE);
			if (IS_ERR(p))
				p = "?";
			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
					vma->vm_start,
					vma->vm_end - vma->vm_start);
			free_page((unsigned long)buf);
		}
	}
	mmap_read_unlock(mm);
}
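
/*
 * Illustrative sketch (not part of this file's build): architectures
 * typically call print_vma_addr() from their fault/signal reporting
 * paths to show which mapping an instruction pointer fell in.  The
 * helper name and message layout below are assumptions for the
 * example; the KERN_CONT continuation style matches real callers.
 */
#if 0
static void example_report_segv(const char *comm, unsigned long ip)
{
	pr_info("%s: segfault at ip %lx", comm, ip);
	print_vma_addr(KERN_CONT " in ", ip);	/* e.g. " in libc.so.6[7f..+..]" */
	pr_cont("\n");
}
#endif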

#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
	/*
	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
	 * holding the mmap_lock; this is safe because kernel memory doesn't
	 * get paged out, so we'll never actually fault, and the
	 * annotations below would only generate false positives.
	 */
	if (uaccess_kernel())
		return;
	if (pagefault_disabled())
		return;
	__might_sleep(file, line, 0);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
	if (current->mm)
		might_lock_read(&current->mm->mmap_lock);
#endif
}
EXPORT_SYMBOL(__might_fault);
#endif
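
/*
 * Illustrative sketch (not part of this file's build): uaccess
 * primitives call might_fault() (which resolves to __might_fault()
 * under the configs above) before touching user memory, so that
 * sleeping-in-atomic and mmap_lock recursion bugs are flagged even
 * when the access happens to be resident.  The helper below is an
 * assumption for the example; get_user() itself already performs this
 * annotation internally.
 */
#if 0
static int example_get_user_flag(int __user *uptr, int *flag)
{
	might_fault();			/* may sleep to service a fault */
	return get_user(*flag, uptr);
}
#endif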

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation.  The target subpage will be processed last to keep its
 * cache lines hot.
 */
static inline void process_huge_page(
	unsigned long addr_hint, unsigned int pages_per_huge_page,
	void (*process_subpage)(unsigned long addr, int idx, void *arg),
	void *arg)
{
	int i, n, base, l;
	unsigned long addr = addr_hint &
		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);

	/* Process target subpage last to keep its cache lines hot */
	might_sleep();
	n = (addr_hint - addr) / PAGE_SIZE;
	if (2 * n <= pages_per_huge_page) {
		/* Target subpage is in the first half of the huge page */
		base = 0;
		l = n;
		/* Process subpages at the end of the huge page */
		for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
			cond_resched();
			process_subpage(addr + i * PAGE_SIZE, i, arg);
		}
	} else {
		/* Target subpage is in the second half of the huge page */
		base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
		l = pages_per_huge_page - n;
		/* Process subpages at the beginning of the huge page */
		for (i = 0; i < base; i++) {
			cond_resched();
			process_subpage(addr + i * PAGE_SIZE, i, arg);
		}
	}
	/*
	 * Process the remaining subpages in a left-right-left-right pattern
	 * converging towards the target subpage.
	 */
	for (i = 0; i < l; i++) {
		int left_idx = base + i;
		int right_idx = base + 2 * l - 1 - i;

		cond_resched();
		process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
		cond_resched();
		process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
	}
}
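
/*
 * Illustrative sketch (not part of this file's build): tracing the
 * visit order of process_huge_page().  For pages_per_huge_page == 8
 * and a target in subpage 2 (first half, so base = 0 and l = 2), the
 * tail loop visits 7, 6, 5, 4 and the converging loop visits
 * 0, 3, 1, 2 -- the target index 2 comes last, keeping its cache
 * lines hot.  The function names here are assumptions for the example.
 */
#if 0
static void example_trace_subpage(unsigned long addr, int idx, void *arg)
{
	pr_info("visiting subpage %d at %lx\n", idx, addr);
}

static void example_trace_order(unsigned long addr_hint)
{
	/* With an 8-subpage huge page, logs: 7 6 5 4 0 3 1 2 */
	process_huge_page(addr_hint, 8, example_trace_subpage, NULL);
}
#endif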

static void clear_gigantic_page(struct page *page,
				unsigned long addr,
				unsigned int pages_per_huge_page)
{
	int i;
	struct page *p = page;

	might_sleep();
	for (i = 0; i < pages_per_huge_page;
	     i++, p = mem_map_next(p, page, i)) {
		cond_resched();
		clear_user_highpage(p, addr + i * PAGE_SIZE);
	}
}

static void clear_subpage(unsigned long addr, int idx, void *arg)
{
	struct page *page = arg;

	clear_user_highpage(page + idx, addr);
}

void clear_huge_page(struct page *page,
		     unsigned long addr_hint, unsigned int pages_per_huge_page)
{
	unsigned long addr = addr_hint &
		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);

	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
		clear_gigantic_page(page, addr, pages_per_huge_page);
		return;
	}

	process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
}
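
/*
 * Illustrative sketch (not part of this file's build): a fault path
 * clears a freshly allocated huge page, passing the faulting address
 * as the hint so the subpage the task is about to touch is cleared
 * last (copy_user_huge_page() follows the same hint convention).  The
 * helper name is an assumption; the THP anonymous-fault path calls
 * clear_huge_page() with HPAGE_PMD_NR in this manner.
 */
#if 0
static void example_clear_new_huge_page(struct page *page,
					unsigned long fault_addr)
{
	clear_huge_page(page, fault_addr, HPAGE_PMD_NR);
}
#endif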

static void copy_user_gigantic_page(struct page *dst, struct page *src,
				    unsigned long addr,
				    struct vm_area_struct *vma,
				    unsigned int pages_per_huge_page)
{
	int i;
	struct page *dst_base = dst;
	struct page *src_base = src;

	for (i = 0; i < pages_per_huge_page; ) {
		cond_resched();
		copy_user_highpage(dst, src, addr + i * PAGE_SIZE, vma);

		i++;
		dst = mem_map_next(dst, dst_base, i);
		src = mem_map_next(src, src_base, i);
	}
}

struct copy_subpage_arg {
	struct page *dst;
	struct page *src;
	struct vm_area_struct *vma;
};

static void copy_subpage(unsigned long addr, int idx, void *arg)
{
	struct copy_subpage_arg *copy_arg = arg;

	copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
			   addr, copy_arg->vma);
}

void copy_user_huge_page(struct page *dst, struct page *src,
			 unsigned long addr_hint, struct vm_area_struct *vma,
			 unsigned int pages_per_huge_page)
{
	unsigned long addr = addr_hint &
		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
	struct copy_subpage_arg arg = {
		.dst = dst,
		.src = src,
		.vma = vma,
	};

	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
		copy_user_gigantic_page(dst, src, addr, vma,
					pages_per_huge_page);
		return;
	}

	process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
}

long copy_huge_page_from_user(struct page *dst_page,
				const void __user *usr_src,
				unsigned int pages_per_huge_page,
				bool allow_pagefault)
{
	void *src = (void *)usr_src;
	void *page_kaddr;
	unsigned long i, rc = 0;
	unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
	struct page *subpage = dst_page;

	for (i = 0; i < pages_per_huge_page;
	     i++, subpage = mem_map_next(subpage, dst_page, i)) {
		if (allow_pagefault)
			page_kaddr = kmap(subpage);
		else
			page_kaddr = kmap_atomic(subpage);
		rc = copy_from_user(page_kaddr,
				(const void __user *)(src + i * PAGE_SIZE),
				PAGE_SIZE);
		if (allow_pagefault)
			kunmap(subpage);
		else
			kunmap_atomic(page_kaddr);

		ret_val -= (PAGE_SIZE - rc);
		if (rc)
			break;

		flush_dcache_page(subpage);

		cond_resched();
	}
	return ret_val;
}
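
/*
 * Illustrative sketch (not part of this file's build): like
 * copy_from_user(), copy_huge_page_from_user() returns the number of
 * bytes that could NOT be copied, so a non-zero return means a
 * partial copy.  Treating that as -EFAULT below is this example's
 * choice; real callers (e.g. the userfaultfd copy path) may instead
 * retry with allow_pagefault set.
 */
#if 0
static int example_fill_huge_page(struct page *dst,
				  const void __user *src,
				  unsigned int nr_subpages)
{
	long left;

	left = copy_huge_page_from_user(dst, src, nr_subpages, true);
	return left ? -EFAULT : 0;
}
#endif
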
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS

static struct kmem_cache *page_ptl_cachep;

void __init ptlock_cache_init(void)
{
	page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
			SLAB_PANIC, NULL);
}

bool ptlock_alloc(struct page *page)
{
	spinlock_t *ptl;

	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
	if (!ptl)
		return false;
	page->ptl = ptl;
	return true;
}

void ptlock_free(struct page *page)
{
	kmem_cache_free(page_ptl_cachep, page->ptl);
}
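
/*
 * Illustrative sketch (not part of this file's build): when split PTE
 * locks are enabled but a spinlock_t is too large to embed directly in
 * struct page, ptlock_init() falls back on the helpers above -- the
 * pairing is roughly as below.  The ctor/dtor names are assumptions
 * for the example; "pte_page" stands in for a newly allocated page
 * table page.
 */
#if 0
static bool example_pte_page_ctor(struct page *pte_page)
{
	if (!ptlock_alloc(pte_page))	/* out-of-line spinlock for this page table */
		return false;
	spin_lock_init(ptlock_ptr(pte_page));
	return true;
}

static void example_pte_page_dtor(struct page *pte_page)
{
	ptlock_free(pte_page);		/* release the out-of-line spinlock */
}
#endif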
#endif