// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2009  Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
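 *
 * Runtime control (assuming a standard sysfs mount) is via, e.g.:
 *   echo always|madvise|never > /sys/kernel/mm/transparent_hugepage/enabled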
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker deferred_split_shrinker;

static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;

bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
			bool smaps, bool in_pf, bool enforce_sysfs)
{
	if (!vma->vm_mm)		/* vdso */
		return false;

	/*
	 * THP may be explicitly disabled through madvise or prctl, and some
	 * architectures (e.g. s390 KVM) disable it for certain mappings.
	 */
	if ((vm_flags & VM_NOHUGEPAGE) ||
	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
		return false;
	/*
	 * Skip if the hardware/firmware has marked hugepage support disabled.
	 */
	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
		return false;

	/* khugepaged doesn't collapse DAX vmas, but page faults on them are fine. */
	if (vma_is_dax(vma))
		return in_pf;

	/*
	 * Special VMA and hugetlb VMA.
	 * Must be checked after dax since some dax mappings may have
	 * VM_MIXEDMAP set.
	 */
	if (vm_flags & VM_NO_KHUGEPAGED)
		return false;

	/*
	 * Check alignment for file vmas and size for both file and anon vmas.
	 *
	 * Skip the check for page faults: huge_fault handlers do their own
	 * checks, and this check is not suitable for huge PUD faults.
	 */
	if (!in_pf &&
	    !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
		return false;

	/*
	 * Enabled via shmem mount options or sysfs settings.
	 * Must be done before hugepage flags check since shmem has its
	 * own flags.
	 */
	if (!in_pf && shmem_file(vma->vm_file))
		return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
				     !enforce_sysfs, vma->vm_mm, vm_flags);

	/* Enforce sysfs THP requirements as necessary */
	if (enforce_sysfs &&
	    (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
					   !hugepage_flags_always())))
		return false;

	/* Only regular files are valid */
	if (!in_pf && file_thp_enabled(vma))
		return true;

	if (!vma_is_anonymous(vma))
		return false;

	if (vma_is_temporary_stack(vma))
		return false;

	/*
	 * The THPeligible bit of smaps should show 1 for proper VMAs even
	 * though anon_vma is not initialized yet.
	 *
	 * Allow page faults, since anon_vma may not be initialized until
	 * the first page fault.
	 */
	if (!vma->anon_vma)
		return (smaps || in_pf);

	return true;
}

static bool get_huge_zero_page(void)
{
	struct page *zero_page;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return true;

	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_page) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return false;
	}
	preempt_disable();
	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
		preempt_enable();
		__free_pages(zero_page, compound_order(zero_page));
		goto retry;
	}
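	/*
	 * Publish the pfn only after huge_zero_page itself is set up;
	 * lockless readers are expected to pair this with
	 * READ_ONCE(huge_zero_pfn).
	 */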
	WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));

	/* Take an additional reference here; the shrinker will put it back. */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	return true;
}

static void put_huge_zero_page(void)
{
	/*
	 * The counter should never reach zero here; only the shrinker
	 * can put the last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

struct page *mm_get_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_page);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_page);
}

void mm_put_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* We can free the zero page only if the last reference remains. */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct page *zero_page = xchg(&huge_zero_page, NULL);
		BUG_ON(zero_page == NULL);
		WRITE_ONCE(huge_zero_pfn, ~0UL);
		__free_pages(zero_page, compound_order(zero_page));
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker huge_zero_page_shrinker = {
	.count_objects = shrink_huge_zero_page_count,
	.scan_objects = shrink_huge_zero_page_scan,
	.seeks = DEFAULT_SEEKS,
};

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		output = "[always] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always [madvise] never";
	else
		output = "always madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);

ssize_t single_hugepage_flag_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf,
				  enum transparent_hugepage_flag flag)
{
	return sysfs_emit(buf, "%d\n",
			  !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
		     &transparent_hugepage_flags))
		output = "[always] defer defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
			  &transparent_hugepage_flags))
		output = "always [defer] defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer [defer+madvise] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer defer+madvise [madvise] never";
	else
		output = "always defer defer+madvise madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer+madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);

static ssize_t use_zero_page_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
	&hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
	&shmem_enabled_attr.attr,
#endif
	NULL,
};

static const struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register khugepaged group\n");
		goto remove_hp_group;
	}

	return 0;

remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
		return -EINVAL;
	}

	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_ORDER);
	/*
	 * We use page->mapping and page->index in the second tail page
	 * as a list_head: this assumes THP order >= 2.
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
	if (err)
		goto err_hzp_shrinker;
	err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split");
	if (err)
		goto err_split_shrinker;

	/*
	 * By default, disable transparent hugepages on smaller systems,
	 * where the extra memory used is likely to hurt more than the
	 * reduced TLB overhead helps.  The admin can still enable it
	 * through /sys.
	 */
	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	unregister_shrinker(&deferred_split_shrinker);
err_split_shrinker:
	unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

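/*
 * Boot-time counterpart of the sysfs 'enabled' knob, e.g. (kernel
 * command line): transparent_hugepage=madvise
 */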
static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd, vma);
	return pmd;
}

#ifdef CONFIG_MEMCG
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	if (memcg)
		return &memcg->deferred_split_queue;
	else
		return &pgdat->deferred_split_queue;
}
#else
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	return &pgdat->deferred_split_queue;
}
#endif

void folio_prep_large_rmappable(struct folio *folio)
{
	VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
	INIT_LIST_HEAD(&folio->_deferred_list);
	folio_set_large_rmappable(folio);
}

static inline bool is_transparent_hugepage(struct folio *folio)
{
	if (!folio_test_large(folio))
		return false;

	return is_huge_zero_page(&folio->page) ||
		folio_test_large_rmappable(folio);
}

static unsigned long __thp_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size)
{
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad, ret;

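	/*
	 * Bail out (falling back to an unaligned search) unless the file
	 * range covers at least one fully size-aligned chunk.
	 */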
	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

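	/* Guard the padded length and end offset against overflow. */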
	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	ret = current->mm->get_unmapped_area(filp, addr, len_pad,
					      off >> PAGE_SHIFT, flags);

	/*
	 * The failure might be due to length padding. The caller will retry
	 * without the padding.
	 */
	if (IS_ERR_VALUE(ret))
		return 0;

	/*
	 * Do not try to align to THP boundary if allocation at the address
	 * hint succeeds.
	 */
	if (ret == addr)
		return addr;

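	/*
	 * Shift the returned address within the padded range so that it is
	 * congruent to 'off' modulo 'size', which is what permits mapping
	 * the range with huge pages.
	 */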
	ret += (off - ret) & (size - 1);
	return ret;
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	unsigned long ret;
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
	if (ret)
		return ret;

	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
			struct page *page, gfp_t gfp)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio = page_folio(page);
	pgtable_t pgtable;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret = 0;

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
		folio_put(folio);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		return VM_FAULT_FALLBACK;
	}
	folio_throttle_swaprate(folio, gfp);

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__folio_mark_uptodate(folio);

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		goto unlock_release;
	} else {
		pmd_t entry;

		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			spin_unlock(vmf->ptl);
			folio_put(folio);
			pte_free(vma->vm_mm, pgtable);
			ret = handle_userfault(vmf, VM_UFFD_MISSING);
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}

		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		folio_add_new_anon_rmap(folio, vma, haddr);
		folio_add_lru_vma(folio, vma);
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(vma->vm_mm);
		spin_unlock(vmf->ptl);
		count_vm_event(THP_FAULT_ALLOC);
		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
	}

	return 0;
unlock_release:
	spin_unlock(vmf->ptl);
release:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	folio_put(folio);
	return ret;
}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
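 *
 * These modes map onto the sysfs knob, e.g. (assuming a standard sysfs
 * mount): echo defer+madvise > /sys/kernel/mm/transparent_hugepage/defrag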
 */
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);

	/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
		       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

	return GFP_TRANSHUGE_LIGHT;
}

/* Caller must hold page table lock. */
static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct page *zero_page)
{
	pmd_t entry;
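	/* Another thread may have populated the pmd before the lock was taken. */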
	if (!pmd_none(*pmd))
		return;
	entry = mk_pmd(zero_page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	mm_inc_nr_ptes(mm);
}

vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp;
	struct folio *folio;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;

	if (!transhuge_vma_suitable(vma, haddr))
		return VM_FAULT_FALLBACK;
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;
	khugepaged_enter_vma(vma, vma->vm_flags);

	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm) &&
			transparent_hugepage_use_zero_page()) {
		pgtable_t pgtable;
		struct page *zero_page;
		vm_fault_t ret;
		pgtable = pte_alloc_one(vma->vm_mm);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_page = mm_get_huge_zero_page(vma->vm_mm);
		if (unlikely(!zero_page)) {
			pte_free(vma->vm_mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		ret = 0;
		if (pmd_none(*vmf->pmd)) {
			ret = check_stable_address_space(vma->vm_mm);
			if (ret) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
			} else if (userfaultfd_missing(vma)) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
				ret = handle_userfault(vmf, VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_page(pgtable, vma->vm_mm, vma,
						   haddr, vmf->pmd, zero_page);
				update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
				spin_unlock(vmf->ptl);
			}
		} else {
			spin_unlock(vmf->ptl);
			pte_free(vma->vm_mm, pgtable);
		}
		return ret;
	}
	gfp = vma_thp_gfp_mask(vma);
	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
}

static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
		pgtable_t pgtable)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t entry;
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	if (!pmd_none(*pmd)) {
		if (write) {
			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
				goto out_unlock;
			}
			entry = pmd_mkyoung(*pmd);
			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
				update_mmu_cache_pmd(vma, addr, pmd);
		}

		goto out_unlock;
	}

	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);
	if (write) {
		entry = pmd_mkyoung(pmd_mkdirty(entry));
		entry = maybe_pmd_mkwrite(entry, vma);
	}

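	/*
	 * Some architectures (see arch_needs_pgtable_deposit()) require a
	 * deposited page table even for pfn mappings; consume it here and
	 * clear the local pointer so the unlock path below doesn't free it.
	 */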
	if (pgtable) {
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
		mm_inc_nr_ptes(mm);
		pgtable = NULL;
	}

	set_pmd_at(mm, addr, pmd, entry);
	update_mmu_cache_pmd(vma, addr, pmd);

out_unlock:
	spin_unlock(ptl);
	if (pgtable)
		pte_free(mm, pgtable);
}

/**
 * vmf_insert_pfn_pmd - insert a pmd size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
	unsigned long addr = vmf->address & PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgprot_t pgprot = vma->vm_page_prot;
	pgtable_t pgtable = NULL;

	/*
	 * If we had pmd_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (arch_needs_pgtable_deposit()) {
		pgtable = pte_alloc_one(vma->vm_mm);
		if (!pgtable)
			return VM_FAULT_OOM;
	}

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pud = pud_mkwrite(pud);
	return pud;
}

static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, pfn_t pfn, bool write)
{
	struct mm_struct *mm = vma->vm_mm;
	pgprot_t prot = vma->vm_page_prot;
	pud_t entry;
	spinlock_t *ptl;

	ptl = pud_lock(mm, pud);
	if (!pud_none(*pud)) {
		if (write) {
			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
				goto out_unlock;
			}
			entry = pud_mkyoung(*pud);
			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
				update_mmu_cache_pud(vma, addr, pud);
		}
		goto out_unlock;
	}

	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pud_mkdevmap(entry);
	if (write) {
		entry = pud_mkyoung(pud_mkdirty(entry));
		entry = maybe_pud_mkwrite(entry, vma);
	}
	set_pud_at(mm, addr, pud, entry);
	update_mmu_cache_pud(vma, addr, pud);

out_unlock:
	spin_unlock(ptl);
}

/**
 * vmf_insert_pfn_pud - insert a pud size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
	unsigned long addr = vmf->address & PUD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgprot_t pgprot = vma->vm_page_prot;

	/*
	 * If we had pud_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

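/*
 * Mark the pmd accessed, and dirty for a write, on a FOLL_TOUCH lookup,
 * mirroring what hardware would do on a real access through the mapping.
 */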
100962306a36Sopenharmony_cistatic void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
101062306a36Sopenharmony_ci		      pmd_t *pmd, bool write)
101162306a36Sopenharmony_ci{
101262306a36Sopenharmony_ci	pmd_t _pmd;
101362306a36Sopenharmony_ci
101462306a36Sopenharmony_ci	_pmd = pmd_mkyoung(*pmd);
101562306a36Sopenharmony_ci	if (write)
101662306a36Sopenharmony_ci		_pmd = pmd_mkdirty(_pmd);
101762306a36Sopenharmony_ci	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
101862306a36Sopenharmony_ci				  pmd, _pmd, write))
101962306a36Sopenharmony_ci		update_mmu_cache_pmd(vma, addr, pmd);
102062306a36Sopenharmony_ci}
102162306a36Sopenharmony_ci
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
	unsigned long pfn = pmd_pfn(*pmd);
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	int ret;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		return NULL;

	if (pmd_present(*pmd) && pmd_devmap(*pmd))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 */
	if (!(flags & (FOLL_GET | FOLL_PIN)))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
	*pgmap = get_dev_pagemap(pfn, *pgmap);
	if (!*pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	ret = try_grab_page(page, flags);
	if (ret)
		page = ERR_PTR(ret);

	return page;
}

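/*
 * Descriptive note (not from the original source): copy one huge pmd from
 * @src_vma to @dst_vma at fork time.  Anonymous mappings only; file-backed
 * pmds are skipped and re-filled on fault.  From the code below: returns 0
 * on success, -ENOMEM if the preallocated page table cannot be allocated,
 * and -EAGAIN if the source pmd changed underneath us or the page was
 * pinned and had to be split, in which case the caller should fall back to
 * copying at the pte level.
 */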
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	struct page *src_page;
	pmd_t pmd;
	pgtable_t pgtable = NULL;
	int ret = -ENOMEM;

	/* Skip if it can be re-filled on fault */
	if (!vma_is_anonymous(dst_vma))
		return 0;

	pgtable = pte_alloc_one(dst_mm);
	if (unlikely(!pgtable))
		goto out;

	dst_ptl = pmd_lock(dst_mm, dst_pmd);
	src_ptl = pmd_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
	if (unlikely(is_swap_pmd(pmd))) {
		swp_entry_t entry = pmd_to_swp_entry(pmd);

		VM_BUG_ON(!is_pmd_migration_entry(pmd));
		if (!is_readable_migration_entry(entry)) {
			entry = make_readable_migration_entry(
							swp_offset(entry));
			pmd = swp_entry_to_pmd(entry);
			if (pmd_swp_soft_dirty(*src_pmd))
				pmd = pmd_swp_mksoft_dirty(pmd);
			if (pmd_swp_uffd_wp(*src_pmd))
				pmd = pmd_swp_mkuffd_wp(pmd);
			set_pmd_at(src_mm, addr, src_pmd, pmd);
		}
		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(dst_mm);
		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
		if (!userfaultfd_wp(dst_vma))
			pmd = pmd_swp_clear_uffd_wp(pmd);
		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
		ret = 0;
		goto out_unlock;
	}
#endif

	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * While the page table lock is held, the huge zero pmd should not be
	 * under splitting, since we don't split the page itself, only the pmd
	 * into a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		/*
		 * get_huge_zero_page() will never allocate a new page here,
		 * since we already have a zero page to copy. It just takes a
		 * reference.
		 */
		mm_get_huge_zero_page(dst_mm);
		goto out_zero_page;
	}

	src_page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);

	get_page(src_page);
	if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) {
		/* Page may be pinned: split and retry the fault on PTEs. */
		put_page(src_page);
		pte_free(dst_mm, pgtable);
		spin_unlock(src_ptl);
		spin_unlock(dst_ptl);
		__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
		return -EAGAIN;
	}
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
out_zero_page:
	mm_inc_nr_ptes(dst_mm);
	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	if (!userfaultfd_wp(dst_vma))
		pmd = pmd_clear_uffd_wp(pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
out:
	return ret;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
		      pud_t *pud, bool write)
{
	pud_t _pud;

	_pud = pud_mkyoung(*pud);
	if (write)
		_pud = pud_mkdirty(_pud);
	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
				  pud, _pud, write))
		update_mmu_cache_pud(vma, addr, pud);
}

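/*
 * Descriptive note (not from the original source): PUD flavour of
 * follow_devmap_pmd() above — same locking rules and return-value
 * conventions, operating on a PUD-sized device mapping.
 */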
struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, int flags, struct dev_pagemap **pgmap)
{
	unsigned long pfn = pud_pfn(*pud);
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	int ret;

	assert_spin_locked(pud_lockptr(mm, pud));

	if (flags & FOLL_WRITE && !pud_write(*pud))
		return NULL;

	if (pud_present(*pud) && pud_devmap(*pud))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pud(vma, addr, pud, flags & FOLL_WRITE);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 *
	 * At least one of FOLL_GET | FOLL_PIN must be set, so check that here:
	 */
	if (!(flags & (FOLL_GET | FOLL_PIN)))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
	*pgmap = get_dev_pagemap(pfn, *pgmap);
	if (!*pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);

	ret = try_grab_page(page, flags);
	if (ret)
		page = ERR_PTR(ret);

	return page;
}

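/*
 * Descriptive note (not from the original source): copy one huge pud at
 * fork time.  Only devmap/special puds exist so far (no anonymous PUD THPs
 * yet), so this is just a wrprotect+copy of the entry.  From the code
 * below: returns 0 on success, or -EAGAIN if the source pud is no longer
 * huge, telling the caller to retry at a lower level.
 */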
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
		  struct vm_area_struct *vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	pud_t pud;
	int ret;

	dst_ptl = pud_lock(dst_mm, dst_pud);
	src_ptl = pud_lockptr(src_mm, src_pud);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pud = *src_pud;
	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
		goto out_unlock;

	/*
	 * While the page table lock is held, the huge zero pud should not be
	 * under splitting, since we don't split the page itself, only the pud
	 * into a page table.
	 */
	if (is_huge_zero_pud(pud)) {
		/* No huge zero pud yet */
	}

	/*
	 * TODO: once we support anonymous pages, use page_try_dup_anon_rmap()
	 * and split if duplicating fails.
	 */
	pudp_set_wrprotect(src_mm, addr, src_pud);
	pud = pud_mkold(pud_wrprotect(pud));
	set_pud_at(dst_mm, addr, dst_pud, pud);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
	return ret;
}

void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
		goto unlock;

	touch_pud(vmf->vma, vmf->address, vmf->pud, write);
unlock:
	spin_unlock(vmf->ptl);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

void huge_pmd_set_accessed(struct vm_fault *vmf)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
		goto unlock;

	touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);

unlock:
	spin_unlock(vmf->ptl);
}

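/*
 * Descriptive note (not from the original source): write-protection fault
 * handler for an anonymous huge pmd (COW or FAULT_FLAG_UNSHARE).  The code
 * below reuses the page in place when it is exclusively ours; otherwise it
 * splits the pmd and returns VM_FAULT_FALLBACK so the fault is retried at
 * the pte level, where the ordinary COW path copies individual pages.
 */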
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	pmd_t orig_pmd = vmf->orig_pmd;

	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);

	if (is_huge_zero_pmd(orig_pmd))
		goto fallback;

	spin_lock(vmf->ptl);

	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
		spin_unlock(vmf->ptl);
		return 0;
	}

	page = pmd_page(orig_pmd);
	folio = page_folio(page);
	VM_BUG_ON_PAGE(!PageHead(page), page);

	/* Early check when only holding the PT lock. */
	if (PageAnonExclusive(page))
		goto reuse;

	if (!folio_trylock(folio)) {
		folio_get(folio);
		spin_unlock(vmf->ptl);
		folio_lock(folio);
		spin_lock(vmf->ptl);
		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
			spin_unlock(vmf->ptl);
			folio_unlock(folio);
			folio_put(folio);
			return 0;
		}
		folio_put(folio);
	}

	/* Recheck after temporarily dropping the PT lock. */
	if (PageAnonExclusive(page)) {
		folio_unlock(folio);
		goto reuse;
	}

	/*
	 * See do_wp_page(): we can only reuse the folio exclusively if
	 * there are no additional references. Note that we always drain
	 * the LRU cache immediately after adding a THP.
	 */
	if (folio_ref_count(folio) >
			1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
		goto unlock_fallback;
	if (folio_test_swapcache(folio))
		folio_free_swap(folio);
	if (folio_ref_count(folio) == 1) {
		pmd_t entry;

		page_move_anon_rmap(page, vma);
		folio_unlock(folio);
reuse:
		if (unlikely(unshare)) {
			spin_unlock(vmf->ptl);
			return 0;
		}
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		spin_unlock(vmf->ptl);
		return 0;
	}

unlock_fallback:
	folio_unlock(folio);
	spin_unlock(vmf->ptl);
fallback:
	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
	return VM_FAULT_FALLBACK;
}

static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
					   unsigned long addr, pmd_t pmd)
{
	struct page *page;

	if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
		return false;

	/* Don't touch entries that are not even readable (NUMA hinting). */
	if (pmd_protnone(pmd))
		return false;

	/* Do we need write faults for softdirty tracking? */
	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
		return false;

	/* Do we need write faults for uffd-wp tracking? */
	if (userfaultfd_huge_pmd_wp(vma, pmd))
		return false;

	if (!(vma->vm_flags & VM_SHARED)) {
		/* See can_change_pte_writable(). */
		page = vm_normal_page_pmd(vma, addr, pmd);
		return page && PageAnon(page) && PageAnonExclusive(page);
	}

	/* See can_change_pte_writable(). */
	return pmd_dirty(pmd);
}

/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
					struct vm_area_struct *vma,
					unsigned int flags)
{
	/* If the pmd is writable, we can write to the page. */
	if (pmd_write(pmd))
		return true;

	/* Maybe FOLL_FORCE is set to override it? */
	if (!(flags & FOLL_FORCE))
		return false;

	/* But FOLL_FORCE has no effect on shared mappings */
	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
		return false;

	/* ... or read-only private ones */
	if (!(vma->vm_flags & VM_MAYWRITE))
		return false;

	/* ... or already writable ones that just need to take a write fault */
	if (vma->vm_flags & VM_WRITE)
		return false;

	/*
	 * See can_change_pte_writable(): we broke COW and could map the page
	 * writable if we have an exclusive anonymous page ...
	 */
	if (!page || !PageAnon(page) || !PageAnonExclusive(page))
		return false;

	/* ... and a write-fault isn't required for other reasons. */
	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
		return false;
	return !userfaultfd_huge_pmd_wp(vma, pmd);
}

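/*
 * Descriptive note (not from the original source): GUP lookup of a normal
 * (non-devmap) transparent huge pmd.  Called with the pmd lock held.  From
 * the code below: returns the subpage for @addr with a reference taken
 * according to @flags, NULL to make the caller fall back, or an ERR_PTR()
 * such as -EMLINK when the page must first be unshared via
 * FAULT_FLAG_UNSHARE, or -EFAULT when FOLL_DUMP hits the huge zero page.
 */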
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
				   unsigned long addr,
				   pmd_t *pmd,
				   unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	int ret;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	page = pmd_page(*pmd);
	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);

	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pmd(*pmd, page, vma, flags))
		return NULL;

	/* Avoid dumping huge zero page */
	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
		return ERR_PTR(-EFAULT);

	if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
		return NULL;

	if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page))
		return ERR_PTR(-EMLINK);

	VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
			!PageAnonExclusive(page), page);

	ret = try_grab_page(page, flags);
	if (ret)
		return ERR_PTR(ret);

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);

	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);

	return page;
}

/* NUMA hinting page fault entry point for trans huge pmds */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	pmd_t oldpmd = vmf->orig_pmd;
	pmd_t pmd;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	int page_nid = NUMA_NO_NODE;
	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
	bool migrated = false, writable = false;
	int flags = 0;

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
		spin_unlock(vmf->ptl);
		goto out;
	}

	pmd = pmd_modify(oldpmd, vma->vm_page_prot);

	/*
	 * Detect now whether the PMD could be writable; this information
	 * is only valid while holding the PT lock.
	 */
	writable = pmd_write(pmd);
	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
	    can_change_pmd_writable(vma, vmf->address, pmd))
		writable = true;

	page = vm_normal_page_pmd(vma, haddr, pmd);
	if (!page)
		goto out_map;

	/* See similar comment in do_numa_page for explanation */
	if (!writable)
		flags |= TNF_NO_GROUP;

	page_nid = page_to_nid(page);
	/*
	 * In memory tiering mode, the cpupid of a slow memory page is used
	 * to record the page's access time, so use the default value.
	 */
	if (node_is_toptier(page_nid))
		last_cpupid = page_cpupid_last(page);
	target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
				       &flags);

	if (target_nid == NUMA_NO_NODE) {
		put_page(page);
		goto out_map;
	}

	spin_unlock(vmf->ptl);
	writable = false;

	migrated = migrate_misplaced_page(page, vma, target_nid);
	if (migrated) {
		flags |= TNF_MIGRATED;
		page_nid = target_nid;
	} else {
		flags |= TNF_MIGRATE_FAIL;
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
			spin_unlock(vmf->ptl);
			goto out;
		}
		goto out_map;
	}

out:
	if (page_nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
				flags);

	return 0;

out_map:
	/* Restore the PMD */
	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
	pmd = pmd_mkyoung(pmd);
	if (writable)
		pmd = pmd_mkwrite(pmd, vma);
	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
	spin_unlock(vmf->ptl);
	goto out;
}

/*
 * Return true if we do MADV_FREE successfully on the entire pmd page.
 * Otherwise, return false.
 */
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long addr, unsigned long next)
{
	spinlock_t *ptl;
	pmd_t orig_pmd;
	struct folio *folio;
	struct mm_struct *mm = tlb->mm;
	bool ret = false;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		goto out_unlocked;

	orig_pmd = *pmd;
	if (is_huge_zero_pmd(orig_pmd))
		goto out;

	if (unlikely(!pmd_present(orig_pmd))) {
		VM_BUG_ON(thp_migration_supported() &&
				  !is_pmd_migration_entry(orig_pmd));
		goto out;
	}

	folio = pfn_folio(pmd_pfn(orig_pmd));
	/*
	 * If other processes are mapping this folio, we can't discard
	 * it unless they all do MADV_FREE, so let's skip the folio.
	 */
	if (folio_estimated_sharers(folio) != 1)
		goto out;

	if (!folio_trylock(folio))
		goto out;

	/*
	 * If the user wants to discard only part of the THP, split it so
	 * MADV_FREE will deactivate only those pages.
	 */
	if (next - addr != HPAGE_PMD_SIZE) {
		folio_get(folio);
		spin_unlock(ptl);
		split_folio(folio);
		folio_unlock(folio);
		folio_put(folio);
		goto out_unlocked;
	}

	if (folio_test_dirty(folio))
		folio_clear_dirty(folio);
	folio_unlock(folio);

	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
		pmdp_invalidate(vma, addr, pmd);
		orig_pmd = pmd_mkold(orig_pmd);
		orig_pmd = pmd_mkclean(orig_pmd);

		set_pmd_at(mm, addr, pmd, orig_pmd);
		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	}

	folio_mark_lazyfree(folio);
	ret = true;
out:
	spin_unlock(ptl);
out_unlocked:
	return ret;
}

static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
{
	pgtable_t pgtable;

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pte_free(mm, pgtable);
	mm_dec_nr_ptes(mm);
}

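/*
 * Descriptive note (not from the original source): zap one huge pmd during
 * munmap/exit.  From the code below: returns 1 if a huge pmd was cleared
 * (with page accounting and TLB handling done here), or 0 if the pmd
 * wasn't huge and the caller must zap it at the pte level.
 */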
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pmd_t *pmd, unsigned long addr)
{
	pmd_t orig_pmd;
	spinlock_t *ptl;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;
	/*
	 * For architectures like ppc64 we look at the deposited pgtable
	 * when calling pmdp_huge_get_and_clear, so do the
	 * pgtable_trans_huge_withdraw only after finishing the pmdp-related
	 * operations.
	 */
	orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
						tlb->fullmm);
	arch_check_zapped_pmd(vma, orig_pmd);
	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	if (vma_is_special_huge(vma)) {
		if (arch_needs_pgtable_deposit())
			zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
	} else if (is_huge_zero_pmd(orig_pmd)) {
		zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
	} else {
		struct page *page = NULL;
		int flush_needed = 1;

		if (pmd_present(orig_pmd)) {
			page = pmd_page(orig_pmd);
			page_remove_rmap(page, vma, true);
			VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
			VM_BUG_ON_PAGE(!PageHead(page), page);
		} else if (thp_migration_supported()) {
			swp_entry_t entry;

			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
			entry = pmd_to_swp_entry(orig_pmd);
			page = pfn_swap_entry_to_page(entry);
			flush_needed = 0;
		} else
			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");

		if (PageAnon(page)) {
			zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
		} else {
			if (arch_needs_pgtable_deposit())
				zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
		}

		spin_unlock(ptl);
		if (flush_needed)
			tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
	}
	return 1;
}

#ifndef pmd_move_must_withdraw
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
					 spinlock_t *old_pmd_ptl,
					 struct vm_area_struct *vma)
{
	/*
	 * With the split pmd lock we also need to move the preallocated
	 * PTE page table if new_pmd is on a different PMD page table.
	 *
	 * We also don't deposit and withdraw tables for file pages.
	 */
	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
}
#endif

static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
	if (unlikely(is_pmd_migration_entry(pmd)))
		pmd = pmd_swp_mksoft_dirty(pmd);
	else if (pmd_present(pmd))
		pmd = pmd_mksoft_dirty(pmd);
#endif
	return pmd;
}

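/*
 * Descriptive note (not from the original source): move a huge pmd from
 * @old_addr to @new_addr for mremap().  Returns true if the pmd was moved
 * in one shot, false if the caller must fall back to moving individual
 * ptes (e.g. the destination pmd was already populated by a racing
 * collapse, or the source pmd was not huge).
 */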
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	pmd_t pmd;
	struct mm_struct *mm = vma->vm_mm;
	bool force_flush = false;

	/*
	 * The destination pmd shouldn't be established; free_pgtables()
	 * should have released it. But move_page_tables() might have already
	 * inserted a page table, if racing against shmem/file collapse.
	 */
	if (!pmd_none(*new_pmd)) {
		VM_BUG_ON(pmd_trans_huge(*new_pmd));
		return false;
	}

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because the exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
	if (old_ptl) {
		new_ptl = pmd_lockptr(mm, new_pmd);
		if (new_ptl != old_ptl)
			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
		if (pmd_present(pmd))
			force_flush = true;
		VM_BUG_ON(!pmd_none(*new_pmd));

		if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
			pgtable_t pgtable;
			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
		}
		pmd = move_soft_dirty_pmd(pmd);
		set_pmd_at(mm, new_addr, new_pmd, pmd);
		if (force_flush)
			flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
		if (new_ptl != old_ptl)
			spin_unlock(new_ptl);
		spin_unlock(old_ptl);
		return true;
	}
	return false;
}

/*
 * Returns
 *  - 0 if the PMD could not be locked
 *  - 1 if the PMD was locked but protections are unchanged and no TLB flush
 *      is necessary, or if prot_numa but THP migration is not supported
 *  - HPAGE_PMD_NR if protections changed and a TLB flush is necessary
 */
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
		    unsigned long cp_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *ptl;
	pmd_t oldpmd, entry;
	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
	int ret = 1;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	if (prot_numa && !thp_migration_supported())
		return 1;

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
	if (is_swap_pmd(*pmd)) {
		swp_entry_t entry = pmd_to_swp_entry(*pmd);
		struct page *page = pfn_swap_entry_to_page(entry);
		pmd_t newpmd;

		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
		if (is_writable_migration_entry(entry)) {
			/*
			 * A protection check is difficult, so
			 * just be safe and disable write.
			 */
			if (PageAnon(page))
				entry = make_readable_exclusive_migration_entry(swp_offset(entry));
			else
				entry = make_readable_migration_entry(swp_offset(entry));
			newpmd = swp_entry_to_pmd(entry);
			if (pmd_swp_soft_dirty(*pmd))
				newpmd = pmd_swp_mksoft_dirty(newpmd);
		} else {
			newpmd = *pmd;
		}

		if (uffd_wp)
			newpmd = pmd_swp_mkuffd_wp(newpmd);
		else if (uffd_wp_resolve)
			newpmd = pmd_swp_clear_uffd_wp(newpmd);
		if (!pmd_same(*pmd, newpmd))
			set_pmd_at(mm, addr, pmd, newpmd);
		goto unlock;
	}
#endif

	if (prot_numa) {
		struct page *page;
		bool toptier;
		/*
		 * Avoid trapping faults against the zero page. The read-only
		 * data is likely to be read-cached on the local CPU and
		 * local/remote hits to the zero page are not interesting.
		 */
		if (is_huge_zero_pmd(*pmd))
			goto unlock;

		if (pmd_protnone(*pmd))
			goto unlock;

		page = pmd_page(*pmd);
		toptier = node_is_toptier(page_to_nid(page));
		/*
		 * Skip scanning the top-tier node if normal NUMA
		 * balancing is disabled.
		 */
		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
		    toptier)
			goto unlock;

		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
		    !toptier)
			xchg_page_access_time(page, jiffies_to_msecs(jiffies));
	}
	/*
	 * In the prot_numa case, we are under mmap_read_lock(mm). It's
	 * critical not to clear the pmd intermittently, to avoid a race with
	 * MADV_DONTNEED, which is also under mmap_read_lock(mm):
	 *
	 *	CPU0:				CPU1:
	 *				change_huge_pmd(prot_numa=1)
	 *				 pmdp_huge_get_and_clear_notify()
	 * madvise_dontneed()
	 *  zap_pmd_range()
	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
	 *   // skip the pmd
	 *				 set_pmd_at();
	 *				 // pmd is re-established
	 *
	 * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
	 * which may break userspace.
	 *
	 * pmdp_invalidate_ad() is required to make sure we don't miss
	 * dirty/young flags set by hardware.
	 */
	oldpmd = pmdp_invalidate_ad(vma, addr, pmd);

	entry = pmd_modify(oldpmd, newprot);
	if (uffd_wp)
		entry = pmd_mkuffd_wp(entry);
	else if (uffd_wp_resolve)
		/*
		 * Leave the write bit to be handled by the page fault
		 * handler, so that things like COW can be handled
		 * properly.
		 */
		entry = pmd_clear_uffd_wp(entry);

	/* See change_pte_range(). */
	if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
	    can_change_pmd_writable(vma, addr, entry))
		entry = pmd_mkwrite(entry, vma);

	ret = HPAGE_PMD_NR;
	set_pmd_at(mm, addr, pmd, entry);

	if (huge_pmd_needs_flush(oldpmd, entry))
		tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
unlock:
	spin_unlock(ptl);
	return ret;
}

/*
 * Returns the page table lock pointer if the given pmd maps a thp, NULL
 * otherwise.
 *
 * Note that if it returns a page table lock pointer, this routine returns
 * without unlocking the page table lock, so the caller must unlock it.
 */
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
	spinlock_t *ptl;
	ptl = pmd_lock(vma->vm_mm, pmd);
	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
			pmd_devmap(*pmd)))
		return ptl;
	spin_unlock(ptl);
	return NULL;
}

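/*
 * A minimal caller sketch for __pmd_trans_huge_lock() (this is the pattern
 * zap_huge_pmd() and change_huge_pmd() above follow):
 *
 *	ptl = __pmd_trans_huge_lock(pmd, vma);
 *	if (!ptl)
 *		return 0;	// not a thp, handle at the pte level
 *	// ... operate on *pmd while holding ptl ...
 *	spin_unlock(ptl);
 */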
/*
 * Returns the page table lock pointer if the given pud maps a thp, NULL
 * otherwise.
 *
 * Note that if it returns a page table lock pointer, this routine returns
 * without unlocking the page table lock, so the caller must unlock it.
 */
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
	spinlock_t *ptl;

	ptl = pud_lock(vma->vm_mm, pud);
	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
		return ptl;
	spin_unlock(ptl);
	return NULL;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pud_t *pud, unsigned long addr)
{
	spinlock_t *ptl;

	ptl = __pud_trans_huge_lock(pud, vma);
	if (!ptl)
		return 0;

	pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
	tlb_remove_pud_tlb_entry(tlb, pud, addr);
	if (vma_is_special_huge(vma)) {
		spin_unlock(ptl);
		/* No zero page support yet */
	} else {
		/* No support for anonymous PUD pages yet */
		BUG();
	}
	return 1;
}

static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
		unsigned long haddr)
{
	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));

	count_vm_event(THP_SPLIT_PUD);

	pudp_huge_clear_flush(vma, haddr, pud);
}

void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
		unsigned long address)
{
	spinlock_t *ptl;
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				address & HPAGE_PUD_MASK,
				(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	ptl = pud_lock(vma->vm_mm, pud);
	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
		goto out;
	__split_huge_pud_locked(vma, pud, range.start);

out:
	spin_unlock(ptl);
	mmu_notifier_invalidate_range_end(&range);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

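/*
 * Descriptive note (not from the original source): split a huge zero-page
 * pmd into a page table full of zero-page ptes.  Each pte maps the
 * per-address zero pfn read-only and special, so userspace observes no
 * change in contents or protections.
 */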
202562306a36Sopenharmony_cistatic void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
202662306a36Sopenharmony_ci		unsigned long haddr, pmd_t *pmd)
202762306a36Sopenharmony_ci{
202862306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
202962306a36Sopenharmony_ci	pgtable_t pgtable;
203062306a36Sopenharmony_ci	pmd_t _pmd, old_pmd;
203162306a36Sopenharmony_ci	unsigned long addr;
203262306a36Sopenharmony_ci	pte_t *pte;
203362306a36Sopenharmony_ci	int i;
203462306a36Sopenharmony_ci
203562306a36Sopenharmony_ci	/*
203662306a36Sopenharmony_ci	 * Leave pmd empty until pte is filled note that it is fine to delay
203762306a36Sopenharmony_ci	 * notification until mmu_notifier_invalidate_range_end() as we are
203862306a36Sopenharmony_ci	 * replacing a zero pmd write protected page with a zero pte write
203962306a36Sopenharmony_ci	 * protected page.
204062306a36Sopenharmony_ci	 *
204162306a36Sopenharmony_ci	 * See Documentation/mm/mmu_notifier.rst
204262306a36Sopenharmony_ci	 */
204362306a36Sopenharmony_ci	old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
204462306a36Sopenharmony_ci
204562306a36Sopenharmony_ci	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
204662306a36Sopenharmony_ci	pmd_populate(mm, &_pmd, pgtable);
204762306a36Sopenharmony_ci
204862306a36Sopenharmony_ci	pte = pte_offset_map(&_pmd, haddr);
204962306a36Sopenharmony_ci	VM_BUG_ON(!pte);
205062306a36Sopenharmony_ci	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
205162306a36Sopenharmony_ci		pte_t entry;
205262306a36Sopenharmony_ci
205362306a36Sopenharmony_ci		entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
205462306a36Sopenharmony_ci		entry = pte_mkspecial(entry);
205562306a36Sopenharmony_ci		if (pmd_uffd_wp(old_pmd))
205662306a36Sopenharmony_ci			entry = pte_mkuffd_wp(entry);
205762306a36Sopenharmony_ci		VM_BUG_ON(!pte_none(ptep_get(pte)));
205862306a36Sopenharmony_ci		set_pte_at(mm, addr, pte, entry);
205962306a36Sopenharmony_ci		pte++;
206062306a36Sopenharmony_ci	}
206162306a36Sopenharmony_ci	pte_unmap(pte - 1);
206262306a36Sopenharmony_ci	smp_wmb(); /* make pte visible before pmd */
206362306a36Sopenharmony_ci	pmd_populate(mm, pmd, pgtable);
206462306a36Sopenharmony_ci}
206562306a36Sopenharmony_ci
206662306a36Sopenharmony_cistatic void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
206762306a36Sopenharmony_ci		unsigned long haddr, bool freeze)
206862306a36Sopenharmony_ci{
206962306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
207062306a36Sopenharmony_ci	struct page *page;
207162306a36Sopenharmony_ci	pgtable_t pgtable;
207262306a36Sopenharmony_ci	pmd_t old_pmd, _pmd;
207362306a36Sopenharmony_ci	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
207462306a36Sopenharmony_ci	bool anon_exclusive = false, dirty = false;
207562306a36Sopenharmony_ci	unsigned long addr;
207662306a36Sopenharmony_ci	pte_t *pte;
207762306a36Sopenharmony_ci	int i;
207862306a36Sopenharmony_ci
207962306a36Sopenharmony_ci	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
208062306a36Sopenharmony_ci	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
208162306a36Sopenharmony_ci	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
208262306a36Sopenharmony_ci	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
208362306a36Sopenharmony_ci				&& !pmd_devmap(*pmd));
208462306a36Sopenharmony_ci
208562306a36Sopenharmony_ci	count_vm_event(THP_SPLIT_PMD);
208662306a36Sopenharmony_ci
208762306a36Sopenharmony_ci	if (!vma_is_anonymous(vma)) {
208862306a36Sopenharmony_ci		old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
208962306a36Sopenharmony_ci		/*
209062306a36Sopenharmony_ci		 * We are going to unmap this huge page. So
209162306a36Sopenharmony_ci		 * just go ahead and zap it
209262306a36Sopenharmony_ci		 */
209362306a36Sopenharmony_ci		if (arch_needs_pgtable_deposit())
209462306a36Sopenharmony_ci			zap_deposited_table(mm, pmd);
209562306a36Sopenharmony_ci		if (vma_is_special_huge(vma))
209662306a36Sopenharmony_ci			return;
209762306a36Sopenharmony_ci		if (unlikely(is_pmd_migration_entry(old_pmd))) {
209862306a36Sopenharmony_ci			swp_entry_t entry;
209962306a36Sopenharmony_ci
210062306a36Sopenharmony_ci			entry = pmd_to_swp_entry(old_pmd);
210162306a36Sopenharmony_ci			page = pfn_swap_entry_to_page(entry);
210262306a36Sopenharmony_ci		} else {
210362306a36Sopenharmony_ci			page = pmd_page(old_pmd);
210462306a36Sopenharmony_ci			if (!PageDirty(page) && pmd_dirty(old_pmd))
210562306a36Sopenharmony_ci				set_page_dirty(page);
210662306a36Sopenharmony_ci			if (!PageReferenced(page) && pmd_young(old_pmd))
210762306a36Sopenharmony_ci				SetPageReferenced(page);
210862306a36Sopenharmony_ci			page_remove_rmap(page, vma, true);
210962306a36Sopenharmony_ci			put_page(page);
211062306a36Sopenharmony_ci		}
211162306a36Sopenharmony_ci		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
211262306a36Sopenharmony_ci		return;
211362306a36Sopenharmony_ci	}
211462306a36Sopenharmony_ci
	if (is_huge_zero_pmd(*pmd)) {
		/*
		 * FIXME: Do we want to invalidate the secondary mmu by
		 * calling mmu_notifier_arch_invalidate_secondary_tlbs()?
		 * See the comments below inside __split_huge_pmd().
		 *
		 * We are going from a write-protected huge zero page to
		 * write-protected small zero pages, so it does not seem
		 * useful to invalidate the secondary mmu at this time.
		 */
		return __split_huge_zero_page_pmd(vma, haddr, pmd);
	}

	/*
	 * Up to this point the pmd is present and huge and userland has full
	 * access to the hugepage during the split (which happens in place).
	 * If we overwrite the pmd with the not-huge version pointing to the
	 * pte here (which of course we could if all CPUs were bug free),
	 * userland could trigger a small page size TLB miss on the small
	 * sized TLB while the hugepage TLB entry is still established in
	 * the huge TLB. Some CPUs don't like that.
	 * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
	 * 383 on page 105. Intel should be safe, but it also warns that it's
	 * only safe if the permission and cache attributes of the two entries
	 * loaded in the two TLBs are identical (which should be the case
	 * here). But it is generally safer to never allow small and huge TLB
	 * entries for the same virtual address to be loaded simultaneously.
	 * So instead of doing "pmd_populate(); flush_pmd_tlb_range();" we
	 * first mark the current pmd notpresent (atomically because here the
	 * pmd_trans_huge must remain set at all times on the pmd until the
	 * split is complete for this pmd), then we flush the SMP TLB and
	 * finally we write the non-huge version of the pmd entry with
	 * pmd_populate.
	 */
	old_pmd = pmdp_invalidate(vma, haddr, pmd);

	pmd_migration = is_pmd_migration_entry(old_pmd);
	if (unlikely(pmd_migration)) {
		swp_entry_t entry;

		entry = pmd_to_swp_entry(old_pmd);
		page = pfn_swap_entry_to_page(entry);
		write = is_writable_migration_entry(entry);
		if (PageAnon(page))
			anon_exclusive = is_readable_exclusive_migration_entry(entry);
		young = is_migration_entry_young(entry);
		dirty = is_migration_entry_dirty(entry);
		soft_dirty = pmd_swp_soft_dirty(old_pmd);
		uffd_wp = pmd_swp_uffd_wp(old_pmd);
	} else {
		page = pmd_page(old_pmd);
		if (pmd_dirty(old_pmd)) {
			dirty = true;
			SetPageDirty(page);
		}
		write = pmd_write(old_pmd);
		young = pmd_young(old_pmd);
		soft_dirty = pmd_soft_dirty(old_pmd);
		uffd_wp = pmd_uffd_wp(old_pmd);

		VM_BUG_ON_PAGE(!page_count(page), page);

		/*
		 * Without "freeze", we'll simply split the PMD, propagating the
		 * PageAnonExclusive() flag for each PTE by setting it for
		 * each subpage -- no need to (temporarily) clear.
		 *
		 * With "freeze" we want to replace mapped pages by
		 * migration entries right away. This is only possible if we
		 * managed to clear PageAnonExclusive() -- see
		 * set_pmd_migration_entry().
		 *
		 * In case we cannot clear PageAnonExclusive(), split the PMD
		 * only and let try_to_migrate_one() fail later.
		 *
		 * See page_try_share_anon_rmap(): invalidate PMD first.
		 */
		anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
		if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
			freeze = false;
		if (!freeze)
			page_ref_add(page, HPAGE_PMD_NR - 1);
	}

	/*
	 * Withdraw the table only after we mark the pmd entry invalid.
	 * This is critical for some architectures (Power).
	 */
	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pmd_populate(mm, &_pmd, pgtable);

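	/*
	 * Fill the just-withdrawn page table through the temporary _pmd:
	 * the real pmd still holds the invalidated huge entry, so no other
	 * CPU can observe a half-built page table.
	 */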
	pte = pte_offset_map(&_pmd, haddr);
	VM_BUG_ON(!pte);
	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
		pte_t entry;
		/*
		 * Note that NUMA hinting access restrictions are not
		 * transferred to avoid any possibility of altering
		 * permissions across VMAs.
		 */
		if (freeze || pmd_migration) {
			swp_entry_t swp_entry;
			if (write)
				swp_entry = make_writable_migration_entry(
							page_to_pfn(page + i));
			else if (anon_exclusive)
				swp_entry = make_readable_exclusive_migration_entry(
							page_to_pfn(page + i));
			else
				swp_entry = make_readable_migration_entry(
							page_to_pfn(page + i));
			if (young)
				swp_entry = make_migration_entry_young(swp_entry);
			if (dirty)
				swp_entry = make_migration_entry_dirty(swp_entry);
			entry = swp_entry_to_pte(swp_entry);
			if (soft_dirty)
				entry = pte_swp_mksoft_dirty(entry);
			if (uffd_wp)
				entry = pte_swp_mkuffd_wp(entry);
		} else {
			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
			if (write)
				entry = pte_mkwrite(entry, vma);
			if (anon_exclusive)
				SetPageAnonExclusive(page + i);
			if (!young)
				entry = pte_mkold(entry);
			/* NOTE: this may set soft-dirty too on some archs */
			if (dirty)
				entry = pte_mkdirty(entry);
			if (soft_dirty)
				entry = pte_mksoft_dirty(entry);
			if (uffd_wp)
				entry = pte_mkuffd_wp(entry);
			page_add_anon_rmap(page + i, vma, addr, RMAP_NONE);
		}
		VM_BUG_ON(!pte_none(ptep_get(pte)));
		set_pte_at(mm, addr, pte, entry);
		pte++;
	}
	pte_unmap(pte - 1);

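	/*
	 * The PMD-level rmap is superseded by the per-PTE rmaps (or
	 * migration entries) installed above; with "freeze", also drop the
	 * page reference that the PMD mapping contributed.
	 */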
	if (!pmd_migration)
		page_remove_rmap(page, vma, true);
	if (freeze)
		put_page(page);

	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
}

void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long address, bool freeze, struct folio *folio)
{
	spinlock_t *ptl;
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				address & HPAGE_PMD_MASK,
				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	ptl = pmd_lock(vma->vm_mm, pmd);

	/*
	 * If the caller asks to set up a migration entry, we need a folio to
	 * check the pmd against. Otherwise we can end up replacing the
	 * wrong folio.
	 */
	VM_BUG_ON(freeze && !folio);
	VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));

	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
	    is_pmd_migration_entry(*pmd)) {
		/*
		 * It's safe to call pmd_page() when a folio is set because
		 * it's guaranteed that the pmd is present.
		 */
		if (folio && folio != page_folio(pmd_page(*pmd)))
			goto out;
		__split_huge_pmd_locked(vma, pmd, range.start, freeze);
	}

out:
	spin_unlock(ptl);
	mmu_notifier_invalidate_range_end(&range);
}

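/*
 * Split the huge PMD mapping @address, if one exists. With "freeze", the
 * caller-supplied @folio is used to make sure we only split a PMD that
 * actually maps it.
 */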
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
		bool freeze, struct folio *folio)
{
	pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);

	if (!pmd)
		return;

	__split_huge_pmd(vma, pmd, address, freeze, folio);
}

static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
{
	/*
	 * If the new address isn't hpage aligned and it could previously
	 * contain a hugepage: check if we need to split a huge pmd.
	 */
	if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
	    range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
			 ALIGN(address, HPAGE_PMD_SIZE)))
		split_huge_pmd_address(vma, address, false, NULL);
}

void vma_adjust_trans_huge(struct vm_area_struct *vma,
			     unsigned long start,
			     unsigned long end,
			     long adjust_next)
{
	/* Check if we need to split start first. */
	split_huge_pmd_if_needed(vma, start);

	/* Check if we need to split end next. */
	split_huge_pmd_if_needed(vma, end);

	/*
	 * If we're also updating the next vma vm_start,
	 * check if we need to split it.
	 */
	if (adjust_next > 0) {
		struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
		unsigned long nstart = next->vm_start;
		nstart += adjust_next;
		split_huge_pmd_if_needed(next, nstart);
	}
}

static void unmap_folio(struct folio *folio)
{
	enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
		TTU_SYNC;

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

	/*
	 * Anon pages need migration entries to preserve them, but file
	 * pages can simply be left unmapped, then faulted back on demand.
	 * If that is ever changed (perhaps for mlock), update remap_page().
	 */
	if (folio_test_anon(folio))
		try_to_migrate(folio, ttu_flags);
	else
		try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
}

static void remap_page(struct folio *folio, unsigned long nr)
{
	int i = 0;

	/* If unmap_folio() uses try_to_migrate() on file, remove this check */
	if (!folio_test_anon(folio))
		return;
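	/*
	 * @nr may span several folios once the page has been split; walk
	 * each one and remove its migration PTEs.
	 */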
	for (;;) {
		remove_migration_ptes(folio, folio, true);
		i += folio_nr_pages(folio);
		if (i >= nr)
			break;
		folio = folio_next(folio);
	}
}

static void lru_add_page_tail(struct page *head, struct page *tail,
		struct lruvec *lruvec, struct list_head *list)
{
	VM_BUG_ON_PAGE(!PageHead(head), head);
	VM_BUG_ON_PAGE(PageCompound(tail), head);
	VM_BUG_ON_PAGE(PageLRU(tail), head);
	lockdep_assert_held(&lruvec->lru_lock);

	if (list) {
		/* page reclaim is reclaiming a huge page */
		VM_WARN_ON(PageLRU(head));
		get_page(tail);
		list_add_tail(&tail->lru, list);
	} else {
		/* head is still on lru (and we have it frozen) */
		VM_WARN_ON(!PageLRU(head));
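		/*
		 * An unevictable tail stays off the LRU lists: its
		 * ->mlock_count shares storage with ->lru, so only
		 * initialize the count here.
		 */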
		if (PageUnevictable(tail))
			tail->mlock_count = 0;
		else
			list_add_tail(&tail->lru, &head->lru);
		SetPageLRU(tail);
	}
}

static void __split_huge_page_tail(struct folio *folio, int tail,
		struct lruvec *lruvec, struct list_head *list)
{
	struct page *head = &folio->page;
	struct page *page_tail = head + tail;
	/*
	 * Careful: new_folio is not a "real" folio until we clear PageTail.
	 * Don't pass it around before clear_compound_head().
	 */
	struct folio *new_folio = (struct folio *)page_tail;

	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);

	/*
	 * Clone page flags before unfreezing refcount.
	 *
	 * After a successful get_page_unless_zero() a flag change may
	 * follow, for example lock_page() setting PG_waiters.
	 *
	 * Note that for mapped sub-pages of an anonymous THP,
	 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
	 * the migration entry instead from where remap_page() will restore it.
	 * We can still have PG_anon_exclusive set on effectively unmapped and
	 * unreferenced sub-pages of an anonymous THP: we can simply drop
	 * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
	 */
	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
	page_tail->flags |= (head->flags &
			((1L << PG_referenced) |
			 (1L << PG_swapbacked) |
			 (1L << PG_swapcache) |
			 (1L << PG_mlocked) |
			 (1L << PG_uptodate) |
			 (1L << PG_active) |
			 (1L << PG_workingset) |
			 (1L << PG_locked) |
			 (1L << PG_unevictable) |
#ifdef CONFIG_ARCH_USES_PG_ARCH_X
			 (1L << PG_arch_2) |
			 (1L << PG_arch_3) |
#endif
			 (1L << PG_dirty) |
			 LRU_GEN_MASK | LRU_REFS_MASK));

	/* ->mapping in first and second tail page is replaced by other uses */
	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
			page_tail);
	page_tail->mapping = head->mapping;
	page_tail->index = head->index + tail;

	/*
	 * page->private should not be set in tail pages. Fix up and warn once
	 * if private is unexpectedly set.
	 */
	if (unlikely(page_tail->private)) {
		VM_WARN_ON_ONCE_PAGE(true, page_tail);
		page_tail->private = 0;
	}
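	/*
	 * Tail pages of a folio in the swap cache occupy consecutive swap
	 * slots, so each new folio's swap entry is the head's plus its
	 * offset.
	 */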
	if (folio_test_swapcache(folio))
		new_folio->swap.val = folio->swap.val + tail;

	/* Page flags must be visible before we make the page non-compound. */
	smp_wmb();

	/*
	 * Clear PageTail before unfreezing the page refcount.
	 *
	 * After a successful get_page_unless_zero() a put_page() may follow,
	 * and that needs a correct compound_head().
	 */
	clear_compound_head(page_tail);

	/* Finally unfreeze refcount. Additional reference from page cache. */
	page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
					  PageSwapCache(head)));

	if (page_is_young(head))
		set_page_young(page_tail);
	if (page_is_idle(head))
		set_page_idle(page_tail);

	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));

	/*
	 * Always add to the tail, because some iterators expect new pages
	 * to show up after the currently processed elements, e.g.
	 * migrate_pages().
	 */
	lru_add_page_tail(head, page_tail, lruvec, list);
}

static void __split_huge_page(struct page *page, struct list_head *list,
		pgoff_t end)
{
	struct folio *folio = page_folio(page);
	struct page *head = &folio->page;
	struct lruvec *lruvec;
	struct address_space *swap_cache = NULL;
	unsigned long offset = 0;
	unsigned int nr = thp_nr_pages(head);
	int i, nr_dropped = 0;

	/* Complete memcg work before adding pages to the LRU. */
	split_page_memcg(head, nr);

	if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
		offset = swp_offset(folio->swap);
		swap_cache = swap_address_space(folio->swap);
		xa_lock(&swap_cache->i_pages);
	}

	/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
	lruvec = folio_lruvec_lock(folio);

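	/*
	 * The head's "has a HWPoisoned subpage" marker is stale once the
	 * tails go their own way; an individually poisoned subpage still
	 * carries its own PG_hwpoison bit.
	 */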
	ClearPageHasHWPoisoned(head);

	for (i = nr - 1; i >= 1; i--) {
		__split_huge_page_tail(folio, i, lruvec, list);
		/* Some pages can be beyond EOF: drop them from page cache */
		if (head[i].index >= end) {
			struct folio *tail = page_folio(head + i);

			if (shmem_mapping(head->mapping))
				nr_dropped++;
			else if (folio_test_clear_dirty(tail))
				folio_account_cleaned(tail,
					inode_to_wb(folio->mapping->host));
			__filemap_remove_folio(tail, NULL);
			folio_put(tail);
		} else if (!PageAnon(page)) {
			__xa_store(&head->mapping->i_pages, head[i].index,
					head + i, 0);
		} else if (swap_cache) {
			__xa_store(&swap_cache->i_pages, offset + i,
					head + i, 0);
		}
	}

	ClearPageCompound(head);
	unlock_page_lruvec(lruvec);
	/* Caller disabled irqs, so they are still disabled here */

	split_page_owner(head, nr);

	/* See comment in __split_huge_page_tail() */
	if (PageAnon(head)) {
		/* Additional pin to swap cache */
		if (PageSwapCache(head)) {
			page_ref_add(head, 2);
			xa_unlock(&swap_cache->i_pages);
		} else {
			page_ref_inc(head);
		}
	} else {
		/* Additional pin to page cache */
		page_ref_add(head, 2);
		xa_unlock(&head->mapping->i_pages);
	}
	local_irq_enable();

	if (nr_dropped)
		shmem_uncharge(head->mapping->host, nr_dropped);
	remap_page(folio, nr);

	if (folio_test_swapcache(folio))
		split_swap_cluster(folio->swap);

	for (i = 0; i < nr; i++) {
		struct page *subpage = head + i;
		if (subpage == page)
			continue;
		unlock_page(subpage);

		/*
		 * Subpages may be freed if there wasn't any mapping, e.g. if
		 * add_to_swap() is running on an LRU page that had its
		 * mapping zapped. Freeing such pages requires taking the
		 * lru_lock, so we do the put_page of the tail pages after
		 * the split is complete.
		 */
		free_page_and_swap_cache(subpage);
	}
}

/* Racy check whether the huge page can be split */
bool can_split_folio(struct folio *folio, int *pextra_pins)
{
	int extra_pins;

	/* Additional pins from page cache */
	if (folio_test_anon(folio))
		extra_pins = folio_test_swapcache(folio) ?
				folio_nr_pages(folio) : 0;
	else
		extra_pins = folio_nr_pages(folio);
	if (pextra_pins)
		*pextra_pins = extra_pins;
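	/*
	 * The "- 1" accounts for the reference the caller must hold; every
	 * other reference has to come from mappings or from the cache pins
	 * counted in extra_pins, or the split would be unsafe.
	 */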
	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
}

/*
 * This function splits a huge page into normal pages. @page can point to
 * any subpage of the huge page to split. The split doesn't change the
 * position of @page.
 *
 * Only the caller may hold a pin on the @page; otherwise the split fails
 * with -EBUSY. The huge page must be locked.
 *
 * If @list is null, tail pages will be added to the LRU list, otherwise
 * to @list.
 *
 * Both the head page and the tail pages will inherit the mapping, flags,
 * and so on from the hugepage.
 *
 * The GUP pin and PG_locked are transferred to @page. The rest of the
 * subpages can be freed if they are not mapped.
 *
 * Returns 0 if the hugepage is split successfully.
 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from
 * under us.
 */
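/*
 * Minimal usage sketch, mirroring the pattern in deferred_split_scan()
 * below (the split_folio() helper used elsewhere in this file is assumed
 * to end up here with a NULL @list):
 *
 *	if (folio_trylock(folio)) {
 *		if (!split_folio(folio))
 *			split++;	// tail pages went to the LRU
 *		folio_unlock(folio);
 *	}
 */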
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
	struct folio *folio = page_folio(page);
	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
	XA_STATE(xas, &folio->mapping->i_pages, folio->index);
	struct anon_vma *anon_vma = NULL;
	struct address_space *mapping = NULL;
	int extra_pins, ret;
	pgoff_t end;
	bool is_hzp;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

	is_hzp = is_huge_zero_page(&folio->page);
	if (is_hzp) {
		pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
		return -EBUSY;
	}

	if (folio_test_writeback(folio))
		return -EBUSY;

	if (folio_test_anon(folio)) {
		/*
		 * The caller does not necessarily hold an mmap_lock that
		 * would prevent the anon_vma from disappearing, so we first
		 * take a reference to it and then lock the anon_vma for
		 * write. This is similar to folio_lock_anon_vma_read(),
		 * except the write lock is taken to serialise against
		 * parallel split or collapse operations.
		 */
		anon_vma = folio_get_anon_vma(folio);
		if (!anon_vma) {
			ret = -EBUSY;
			goto out;
		}
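		/*
		 * Anonymous memory has no EOF to trim against: -1 is the
		 * largest possible pgoff_t, so no tail page is ever "beyond
		 * end".
		 */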
		end = -1;
		mapping = NULL;
		anon_vma_lock_write(anon_vma);
	} else {
		gfp_t gfp;

		mapping = folio->mapping;

		/* Truncated? */
		if (!mapping) {
			ret = -EBUSY;
			goto out;
		}

		gfp = current_gfp_context(mapping_gfp_mask(mapping) &
							GFP_RECLAIM_MASK);

		if (!filemap_release_folio(folio, gfp)) {
			ret = -EBUSY;
			goto out;
		}

		xas_split_alloc(&xas, folio, folio_order(folio), gfp);
		if (xas_error(&xas)) {
			ret = xas_error(&xas);
			goto out;
		}

		anon_vma = NULL;
		i_mmap_lock_read(mapping);

		/*
		 * __split_huge_page() may need to trim off pages beyond EOF:
		 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
		 * which cannot be nested inside the page tree lock. So note
		 * end now: i_size itself may be changed at any moment, but
		 * folio lock is good enough to serialize the trimming.
		 */
		end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
		if (shmem_mapping(mapping))
			end = shmem_fallocend(mapping->host, end);
	}

	/*
	 * Racy check if we can split the page, before unmap_folio() splits
	 * the PMDs.
	 */
	if (!can_split_folio(folio, &extra_pins)) {
		ret = -EAGAIN;
		goto out_unlock;
	}

	unmap_folio(folio);

	/* block interrupt reentry in xa_lock and spinlock */
	local_irq_disable();
	if (mapping) {
		/*
		 * Check if the folio is present in the page cache. We assume
		 * all tail pages are present too, if the folio is there.
		 */
		xas_lock(&xas);
		xas_reset(&xas);
		if (xas_load(&xas) != folio)
			goto fail;
	}

	/* Prevent deferred_split_scan() touching ->_refcount */
	spin_lock(&ds_queue->split_queue_lock);
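	/*
	 * Freezing succeeds only if the refcount is exactly the caller's
	 * reference plus the cache pins counted in extra_pins; any other
	 * transient pin means the split must be retried.
	 */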
	if (folio_ref_freeze(folio, 1 + extra_pins)) {
		if (!list_empty(&folio->_deferred_list)) {
			ds_queue->split_queue_len--;
			list_del(&folio->_deferred_list);
		}
		spin_unlock(&ds_queue->split_queue_lock);
		if (mapping) {
			int nr = folio_nr_pages(folio);

			xas_split(&xas, folio, folio_order(folio));
			if (folio_test_pmd_mappable(folio)) {
				if (folio_test_swapbacked(folio)) {
					__lruvec_stat_mod_folio(folio,
							NR_SHMEM_THPS, -nr);
				} else {
					__lruvec_stat_mod_folio(folio,
							NR_FILE_THPS, -nr);
					filemap_nr_thps_dec(mapping);
				}
			}
		}

		__split_huge_page(page, list, end);
		ret = 0;
	} else {
		spin_unlock(&ds_queue->split_queue_lock);
fail:
		if (mapping)
			xas_unlock(&xas);
		local_irq_enable();
		remap_page(folio, folio_nr_pages(folio));
		ret = -EAGAIN;
	}

out_unlock:
	if (anon_vma) {
		anon_vma_unlock_write(anon_vma);
		put_anon_vma(anon_vma);
	}
	if (mapping)
		i_mmap_unlock_read(mapping);
out:
	xas_destroy(&xas);
	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
	return ret;
}

void folio_undo_large_rmappable(struct folio *folio)
{
	struct deferred_split *ds_queue;
	unsigned long flags;

	/*
	 * At this point, there is no one trying to add the folio to
	 * deferred_list. If the folio is not in deferred_list, it's safe
	 * to check without acquiring the split_queue_lock.
	 */
	if (data_race(list_empty(&folio->_deferred_list)))
		return;

	ds_queue = get_deferred_split_queue(folio);
	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	if (!list_empty(&folio->_deferred_list)) {
		ds_queue->split_queue_len--;
		list_del(&folio->_deferred_list);
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}

void deferred_split_folio(struct folio *folio)
{
	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
#ifdef CONFIG_MEMCG
	struct mem_cgroup *memcg = folio_memcg(folio);
#endif
	unsigned long flags;

	VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);

	/*
	 * The try_to_unmap() in the page reclaim path might reach here too;
	 * this may cause a race condition corrupting the deferred split
	 * queue. And if page reclaim is already handling the same folio,
	 * it is unnecessary to handle it again in the shrinker.
	 *
	 * Check the swapcache flag to determine if the folio is being
	 * handled by page reclaim, since THP swap would add the folio into
	 * the swap cache before calling try_to_unmap().
	 */
	if (folio_test_swapcache(folio))
		return;

	if (!list_empty(&folio->_deferred_list))
		return;

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	if (list_empty(&folio->_deferred_list)) {
		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
		ds_queue->split_queue_len++;
#ifdef CONFIG_MEMCG
		if (memcg)
			set_shrinker_bit(memcg, folio_nid(folio),
					 deferred_split_shrinker.id);
#endif
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}

static unsigned long deferred_split_count(struct shrinker *shrink,
		struct shrink_control *sc)
{
	struct pglist_data *pgdata = NODE_DATA(sc->nid);
	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;

#ifdef CONFIG_MEMCG
	if (sc->memcg)
		ds_queue = &sc->memcg->deferred_split_queue;
#endif
	return READ_ONCE(ds_queue->split_queue_len);
}

static unsigned long deferred_split_scan(struct shrinker *shrink,
		struct shrink_control *sc)
{
	struct pglist_data *pgdata = NODE_DATA(sc->nid);
	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
	unsigned long flags;
	LIST_HEAD(list);
	struct folio *folio, *next;
	int split = 0;

#ifdef CONFIG_MEMCG
	if (sc->memcg)
		ds_queue = &sc->memcg->deferred_split_queue;
#endif

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	/* Take a pin on all head pages to avoid freeing them under us */
	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
							_deferred_list) {
		if (folio_try_get(folio)) {
			list_move(&folio->_deferred_list, &list);
		} else {
			/* We lost the race with folio_put() */
			list_del_init(&folio->_deferred_list);
			ds_queue->split_queue_len--;
		}
		if (!--sc->nr_to_scan)
			break;
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

	list_for_each_entry_safe(folio, next, &list, _deferred_list) {
		if (!folio_trylock(folio))
			goto next;
		/* split_huge_page() removes the page from the list on success */
		if (!split_folio(folio))
			split++;
		folio_unlock(folio);
next:
		folio_put(folio);
	}

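	/*
	 * Requeue the folios we could not lock or split, so a later scan
	 * can retry them.
	 */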
	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	list_splice_tail(&list, &ds_queue->split_queue);
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

	/*
	 * Stop the shrinker if we didn't split any page and the queue is
	 * empty. This can happen if the pages were freed under us.
	 */
	if (!split && list_empty(&ds_queue->split_queue))
		return SHRINK_STOP;
	return split;
}

static struct shrinker deferred_split_shrinker = {
	.count_objects = deferred_split_count,
	.scan_objects = deferred_split_scan,
	.seeks = DEFAULT_SEEKS,
	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
		 SHRINKER_NONSLAB,
};

#ifdef CONFIG_DEBUG_FS
static void split_huge_pages_all(void)
{
	struct zone *zone;
	struct page *page;
	struct folio *folio;
	unsigned long pfn, max_zone_pfn;
	unsigned long total = 0, split = 0;

	pr_debug("Split all THPs\n");
	for_each_zone(zone) {
		if (!managed_zone(zone))
			continue;
		max_zone_pfn = zone_end_pfn(zone);
		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
			int nr_pages;

			page = pfn_to_online_page(pfn);
			if (!page || PageTail(page))
				continue;
			folio = page_folio(page);
			if (!folio_try_get(folio))
				continue;

			if (unlikely(page_folio(page) != folio))
				goto next;

			if (zone != folio_zone(folio))
				goto next;

			if (!folio_test_large(folio)
				|| folio_test_hugetlb(folio)
				|| !folio_test_lru(folio))
				goto next;

			total++;
			folio_lock(folio);
			nr_pages = folio_nr_pages(folio);
			if (!split_folio(folio))
				split++;
			pfn += nr_pages - 1;
			folio_unlock(folio);
next:
			folio_put(folio);
			cond_resched();
		}
	}

	pr_debug("%lu of %lu THP split\n", split, total);
}

static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
{
	return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
		    is_vm_hugetlb_page(vma);
}

static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
				unsigned long vaddr_end)
{
	int ret = 0;
	struct task_struct *task;
	struct mm_struct *mm;
	unsigned long total = 0, split = 0;
	unsigned long addr;

	vaddr_start &= PAGE_MASK;
	vaddr_end &= PAGE_MASK;

	/* Find the task_struct from the pid */
	rcu_read_lock();
	task = find_task_by_vpid(pid);
	if (!task) {
		rcu_read_unlock();
		ret = -ESRCH;
		goto out;
	}
	get_task_struct(task);
	rcu_read_unlock();

	/* Find the mm_struct */
	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		ret = -EINVAL;
		goto out;
	}

	pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
		 pid, vaddr_start, vaddr_end);

	mmap_read_lock(mm);
	/*
	 * Always increase addr by PAGE_SIZE, since we could have a PTE page
	 * table filled with PTE-mapped THPs, each of which is distinct.
	 */
	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
		struct vm_area_struct *vma = vma_lookup(mm, addr);
		struct page *page;
		struct folio *folio;

		if (!vma)
			break;

		/* skip special VMAs and hugetlb VMAs */
		if (vma_not_suitable_for_thp_split(vma)) {
			addr = vma->vm_end;
			continue;
		}

		/* FOLL_DUMP to ignore special (like zero) pages */
		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);

		if (IS_ERR_OR_NULL(page))
			continue;

		folio = page_folio(page);
		if (!is_transparent_hugepage(folio))
			goto next;

		total++;
		if (!can_split_folio(folio, NULL))
			goto next;

		if (!folio_trylock(folio))
			goto next;

		if (!split_folio(folio))
			split++;

		folio_unlock(folio);
next:
		folio_put(folio);
		cond_resched();
	}
	mmap_read_unlock(mm);
	mmput(mm);

	pr_debug("%lu of %lu THP split\n", split, total);

out:
	return ret;
}

static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
				pgoff_t off_end)
{
	struct filename *file;
	struct file *candidate;
	struct address_space *mapping;
	int ret = -EINVAL;
	pgoff_t index;
	int nr_pages = 1;
	unsigned long total = 0, split = 0;

	file = getname_kernel(file_path);
	if (IS_ERR(file))
		return ret;

	candidate = file_open_name(file, O_RDONLY, 0);
	if (IS_ERR(candidate))
		goto out;

	pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
		 file_path, off_start, off_end);

	mapping = candidate->f_mapping;

	for (index = off_start; index < off_end; index += nr_pages) {
		struct folio *folio = filemap_get_folio(mapping, index);

		nr_pages = 1;
		if (IS_ERR(folio))
			continue;

		if (!folio_test_large(folio))
			goto next;

		total++;
		nr_pages = folio_nr_pages(folio);

		if (!folio_trylock(folio))
			goto next;

		if (!split_folio(folio))
			split++;

		folio_unlock(folio);
next:
		folio_put(folio);
		cond_resched();
	}

	filp_close(candidate, NULL);
	ret = 0;

	pr_debug("%lu of %lu file-backed THP split\n", split, total);
out:
	putname(file);
	return ret;
}

#define MAX_INPUT_BUF_SZ 255

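/*
 * The debugfs file accepts three input forms, matching the parsing in
 * split_huge_pages_write() below (addresses and offsets in hex):
 *	"1"                                 - split all THPs in the system
 *	"<pid>,0x<start>,0x<end>"           - split THPs in a process range
 *	"<path>,0x<off_start>,0x<off_end>"  - split THPs of a file
 */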
static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppos)
{
	static DEFINE_MUTEX(split_debug_mutex);
	ssize_t ret;
	/* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
	char input_buf[MAX_INPUT_BUF_SZ];
	int pid;
	unsigned long vaddr_start, vaddr_end;

	ret = mutex_lock_interruptible(&split_debug_mutex);
	if (ret)
		return ret;

	ret = -EFAULT;

	memset(input_buf, 0, MAX_INPUT_BUF_SZ);
	if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
		goto out;

	input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';

	if (input_buf[0] == '/') {
		char *tok;
		char *buf = input_buf;
		char file_path[MAX_INPUT_BUF_SZ];
		pgoff_t off_start = 0, off_end = 0;
		size_t input_len = strlen(input_buf);

		/*
		 * A path with no trailing ",<off_start>,<off_end>" leaves
		 * buf NULL and is malformed; reject it instead of passing
		 * NULL to sscanf() below.
		 */
		tok = strsep(&buf, ",");
		if (tok && buf) {
			strscpy(file_path, tok, sizeof(file_path));
		} else {
			ret = -EINVAL;
			goto out;
		}

		ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
		if (ret != 2) {
			ret = -EINVAL;
			goto out;
		}
		ret = split_huge_pages_in_file(file_path, off_start, off_end);
		if (!ret)
			ret = input_len;

		goto out;
	}

	ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
	if (ret == 1 && pid == 1) {
		split_huge_pages_all();
		ret = strlen(input_buf);
		goto out;
	} else if (ret != 3) {
		ret = -EINVAL;
		goto out;
	}

	ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
	if (!ret)
		ret = strlen(input_buf);
out:
	mutex_unlock(&split_debug_mutex);
	return ret;
}

static const struct file_operations split_huge_pages_fops = {
	.owner	 = THIS_MODULE,
	.write	 = split_huge_pages_write,
	.llseek  = no_llseek,
};

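/*
 * Register the write-only (0200) knob at the debugfs root, i.e.
 * <debugfs>/split_huge_pages; the handler keeps no per-file private
 * state, so both the parent and data arguments are NULL.
 */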
static int __init split_huge_pages_debugfs(void)
{
	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
			    &split_huge_pages_fops);
	return 0;
}
late_initcall(split_huge_pages_debugfs);
#endif

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
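/*
 * Replace the present PMD mapping of @page with a PMD migration entry,
 * preserving the write, young, dirty, soft-dirty and uffd-wp state so
 * that remove_migration_pmd() can later reconstruct an equivalent
 * mapping. Returns 0 if pvmw does not map a PMD, and -EBUSY (with the
 * original PMD restored) if an anon-exclusive page cannot give up its
 * exclusivity.
 */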
int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
		struct page *page)
{
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	bool anon_exclusive;
	pmd_t pmdval;
	swp_entry_t entry;
	pmd_t pmdswp;

	if (!(pvmw->pmd && !pvmw->pte))
		return 0;

	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);

	/* See page_try_share_anon_rmap(): invalidate PMD first. */
	anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
	if (anon_exclusive && page_try_share_anon_rmap(page)) {
		set_pmd_at(mm, address, pvmw->pmd, pmdval);
		return -EBUSY;
	}

	if (pmd_dirty(pmdval))
		set_page_dirty(page);
	if (pmd_write(pmdval))
		entry = make_writable_migration_entry(page_to_pfn(page));
	else if (anon_exclusive)
		entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
	else
		entry = make_readable_migration_entry(page_to_pfn(page));
	if (pmd_young(pmdval))
		entry = make_migration_entry_young(entry);
	if (pmd_dirty(pmdval))
		entry = make_migration_entry_dirty(entry);
	pmdswp = swp_entry_to_pmd(entry);
	if (pmd_soft_dirty(pmdval))
		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
	if (pmd_uffd_wp(pmdval))
		pmdswp = pmd_swp_mkuffd_wp(pmdswp);
	set_pmd_at(mm, address, pvmw->pmd, pmdswp);
	page_remove_rmap(page, vma, true);
	put_page(page);
	trace_set_migration_pmd(address, pmd_val(pmdswp));

	return 0;
}

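/*
 * Counterpart of set_pmd_migration_entry(): turn the PMD migration
 * entry found by the page-vma-mapped walk back into a present PMD
 * mapping @new, restoring the write, young, dirty, soft-dirty and
 * uffd-wp state encoded in the entry and re-establishing the anon or
 * file rmap.
 */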
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
{
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	unsigned long haddr = address & HPAGE_PMD_MASK;
	pmd_t pmde;
	swp_entry_t entry;

	if (!(pvmw->pmd && !pvmw->pte))
		return;

	entry = pmd_to_swp_entry(*pvmw->pmd);
	get_page(new);
	pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
	if (pmd_swp_soft_dirty(*pvmw->pmd))
		pmde = pmd_mksoft_dirty(pmde);
	if (is_writable_migration_entry(entry))
		pmde = pmd_mkwrite(pmde, vma);
	if (pmd_swp_uffd_wp(*pvmw->pmd))
		pmde = pmd_mkuffd_wp(pmde);
	if (!is_migration_entry_young(entry))
		pmde = pmd_mkold(pmde);
	/* NOTE: pmd_mkdirty() may also set soft-dirty on some archs */
	if (PageDirty(new) && is_migration_entry_dirty(entry))
		pmde = pmd_mkdirty(pmde);

	if (PageAnon(new)) {
		rmap_t rmap_flags = RMAP_COMPOUND;

		if (!is_readable_migration_entry(entry))
			rmap_flags |= RMAP_EXCLUSIVE;

		page_add_anon_rmap(new, vma, haddr, rmap_flags);
	} else {
		page_add_file_rmap(new, vma, true);
	}
	VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new));
	set_pmd_at(mm, haddr, pvmw->pmd, pmde);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache_pmd(vma, address, pvmw->pmd);
	trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif