162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
362306a36Sopenharmony_ci
462306a36Sopenharmony_ci#include <linux/mm.h>
562306a36Sopenharmony_ci#include <linux/sched.h>
662306a36Sopenharmony_ci#include <linux/sched/mm.h>
762306a36Sopenharmony_ci#include <linux/sched/coredump.h>
862306a36Sopenharmony_ci#include <linux/mmu_notifier.h>
962306a36Sopenharmony_ci#include <linux/rmap.h>
1062306a36Sopenharmony_ci#include <linux/swap.h>
1162306a36Sopenharmony_ci#include <linux/mm_inline.h>
1262306a36Sopenharmony_ci#include <linux/kthread.h>
1362306a36Sopenharmony_ci#include <linux/khugepaged.h>
1462306a36Sopenharmony_ci#include <linux/freezer.h>
1562306a36Sopenharmony_ci#include <linux/mman.h>
1662306a36Sopenharmony_ci#include <linux/hashtable.h>
1762306a36Sopenharmony_ci#include <linux/userfaultfd_k.h>
1862306a36Sopenharmony_ci#include <linux/page_idle.h>
1962306a36Sopenharmony_ci#include <linux/page_table_check.h>
2062306a36Sopenharmony_ci#include <linux/swapops.h>
2162306a36Sopenharmony_ci#include <linux/shmem_fs.h>
2262306a36Sopenharmony_ci#include <linux/ksm.h>
2362306a36Sopenharmony_ci
2462306a36Sopenharmony_ci#include <asm/tlb.h>
2562306a36Sopenharmony_ci#include <asm/pgalloc.h>
2662306a36Sopenharmony_ci#include "internal.h"
2762306a36Sopenharmony_ci#include "mm_slot.h"
2862306a36Sopenharmony_ci
2962306a36Sopenharmony_cienum scan_result {
3062306a36Sopenharmony_ci	SCAN_FAIL,
3162306a36Sopenharmony_ci	SCAN_SUCCEED,
3262306a36Sopenharmony_ci	SCAN_PMD_NULL,
3362306a36Sopenharmony_ci	SCAN_PMD_NONE,
3462306a36Sopenharmony_ci	SCAN_PMD_MAPPED,
3562306a36Sopenharmony_ci	SCAN_EXCEED_NONE_PTE,
3662306a36Sopenharmony_ci	SCAN_EXCEED_SWAP_PTE,
3762306a36Sopenharmony_ci	SCAN_EXCEED_SHARED_PTE,
3862306a36Sopenharmony_ci	SCAN_PTE_NON_PRESENT,
3962306a36Sopenharmony_ci	SCAN_PTE_UFFD_WP,
4062306a36Sopenharmony_ci	SCAN_PTE_MAPPED_HUGEPAGE,
4162306a36Sopenharmony_ci	SCAN_PAGE_RO,
4262306a36Sopenharmony_ci	SCAN_LACK_REFERENCED_PAGE,
4362306a36Sopenharmony_ci	SCAN_PAGE_NULL,
4462306a36Sopenharmony_ci	SCAN_SCAN_ABORT,
4562306a36Sopenharmony_ci	SCAN_PAGE_COUNT,
4662306a36Sopenharmony_ci	SCAN_PAGE_LRU,
4762306a36Sopenharmony_ci	SCAN_PAGE_LOCK,
4862306a36Sopenharmony_ci	SCAN_PAGE_ANON,
4962306a36Sopenharmony_ci	SCAN_PAGE_COMPOUND,
5062306a36Sopenharmony_ci	SCAN_ANY_PROCESS,
5162306a36Sopenharmony_ci	SCAN_VMA_NULL,
5262306a36Sopenharmony_ci	SCAN_VMA_CHECK,
5362306a36Sopenharmony_ci	SCAN_ADDRESS_RANGE,
5462306a36Sopenharmony_ci	SCAN_DEL_PAGE_LRU,
5562306a36Sopenharmony_ci	SCAN_ALLOC_HUGE_PAGE_FAIL,
5662306a36Sopenharmony_ci	SCAN_CGROUP_CHARGE_FAIL,
5762306a36Sopenharmony_ci	SCAN_TRUNCATED,
5862306a36Sopenharmony_ci	SCAN_PAGE_HAS_PRIVATE,
5962306a36Sopenharmony_ci	SCAN_STORE_FAILED,
6062306a36Sopenharmony_ci	SCAN_COPY_MC,
6162306a36Sopenharmony_ci	SCAN_PAGE_FILLED,
6262306a36Sopenharmony_ci};
6362306a36Sopenharmony_ci
6462306a36Sopenharmony_ci#define CREATE_TRACE_POINTS
6562306a36Sopenharmony_ci#include <trace/events/huge_memory.h>
6662306a36Sopenharmony_ci
/* The single khugepaged daemon task. */
static struct task_struct *khugepaged_thread __read_mostly;
/* NOTE(review): presumably serializes khugepaged start/stop — confirm at call sites. */
static DEFINE_MUTEX(khugepaged_mutex);

/* default scan 8*512 pte (or vmas) every 30 second */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
/* Cleared to 0 when a sleep tunable changes so the new period takes effect. */
static unsigned long khugepaged_sleep_expire;
/* Protects mm_slots_hash and khugepaged_scan.mm_head (see enter/exit paths). */
static DEFINE_SPINLOCK(khugepaged_mm_lock);
/* Woken when new work is queued or tunables change. */
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
 * default collapse hugepages if there is at least one pte mapped like
 * it would have happened if the vma was large enough during page
 * fault.
 *
 * Note that these are only respected if collapse was initiated by khugepaged.
 */
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;

#define MM_SLOTS_HASH_BITS 10
static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

/* Slab cache for struct khugepaged_mm_slot; created in khugepaged_init(). */
static struct kmem_cache *mm_slot_cache __read_mostly;
9562306a36Sopenharmony_ci
/*
 * Per-collapse-attempt state shared between khugepaged and madvise-driven
 * collapse paths.
 */
struct collapse_control {
	/* true if collapse was initiated by khugepaged (tunables apply) */
	bool is_khugepaged;

	/* Num pages scanned per node */
	u32 node_load[MAX_NUMNODES];

	/* nodemask for allocation fallback */
	nodemask_t alloc_nmask;
};
10562306a36Sopenharmony_ci
/**
 * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
 * @slot: hash lookup from mm to mm_slot
 *
 * Embeds the generic mm_slot (see mm_slot.h) so the shared hash/list
 * helpers can be used.
 */
struct khugepaged_mm_slot {
	struct mm_slot slot;
};
11362306a36Sopenharmony_ci
/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 *
 * There is only the one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
	struct list_head mm_head;
	struct khugepaged_mm_slot *mm_slot;
	unsigned long address;
};

/* The single scan cursor; protected by khugepaged_mm_lock where noted. */
static struct khugepaged_scan khugepaged_scan = {
	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
13162306a36Sopenharmony_ci
13262306a36Sopenharmony_ci#ifdef CONFIG_SYSFS
/* sysfs: report the per-pass scan sleep period in milliseconds. */
static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}
13962306a36Sopenharmony_ci
14062306a36Sopenharmony_cistatic ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
14162306a36Sopenharmony_ci					  struct kobj_attribute *attr,
14262306a36Sopenharmony_ci					  const char *buf, size_t count)
14362306a36Sopenharmony_ci{
14462306a36Sopenharmony_ci	unsigned int msecs;
14562306a36Sopenharmony_ci	int err;
14662306a36Sopenharmony_ci
14762306a36Sopenharmony_ci	err = kstrtouint(buf, 10, &msecs);
14862306a36Sopenharmony_ci	if (err)
14962306a36Sopenharmony_ci		return -EINVAL;
15062306a36Sopenharmony_ci
15162306a36Sopenharmony_ci	khugepaged_scan_sleep_millisecs = msecs;
15262306a36Sopenharmony_ci	khugepaged_sleep_expire = 0;
15362306a36Sopenharmony_ci	wake_up_interruptible(&khugepaged_wait);
15462306a36Sopenharmony_ci
15562306a36Sopenharmony_ci	return count;
15662306a36Sopenharmony_ci}
15762306a36Sopenharmony_cistatic struct kobj_attribute scan_sleep_millisecs_attr =
15862306a36Sopenharmony_ci	__ATTR_RW(scan_sleep_millisecs);
15962306a36Sopenharmony_ci
/* sysfs: report the allocation-failure backoff sleep in milliseconds. */
static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}
16662306a36Sopenharmony_ci
16762306a36Sopenharmony_cistatic ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
16862306a36Sopenharmony_ci					   struct kobj_attribute *attr,
16962306a36Sopenharmony_ci					   const char *buf, size_t count)
17062306a36Sopenharmony_ci{
17162306a36Sopenharmony_ci	unsigned int msecs;
17262306a36Sopenharmony_ci	int err;
17362306a36Sopenharmony_ci
17462306a36Sopenharmony_ci	err = kstrtouint(buf, 10, &msecs);
17562306a36Sopenharmony_ci	if (err)
17662306a36Sopenharmony_ci		return -EINVAL;
17762306a36Sopenharmony_ci
17862306a36Sopenharmony_ci	khugepaged_alloc_sleep_millisecs = msecs;
17962306a36Sopenharmony_ci	khugepaged_sleep_expire = 0;
18062306a36Sopenharmony_ci	wake_up_interruptible(&khugepaged_wait);
18162306a36Sopenharmony_ci
18262306a36Sopenharmony_ci	return count;
18362306a36Sopenharmony_ci}
18462306a36Sopenharmony_cistatic struct kobj_attribute alloc_sleep_millisecs_attr =
18562306a36Sopenharmony_ci	__ATTR_RW(alloc_sleep_millisecs);
18662306a36Sopenharmony_ci
/* sysfs: report how many ptes khugepaged scans per pass. */
static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
}
19362306a36Sopenharmony_cistatic ssize_t pages_to_scan_store(struct kobject *kobj,
19462306a36Sopenharmony_ci				   struct kobj_attribute *attr,
19562306a36Sopenharmony_ci				   const char *buf, size_t count)
19662306a36Sopenharmony_ci{
19762306a36Sopenharmony_ci	unsigned int pages;
19862306a36Sopenharmony_ci	int err;
19962306a36Sopenharmony_ci
20062306a36Sopenharmony_ci	err = kstrtouint(buf, 10, &pages);
20162306a36Sopenharmony_ci	if (err || !pages)
20262306a36Sopenharmony_ci		return -EINVAL;
20362306a36Sopenharmony_ci
20462306a36Sopenharmony_ci	khugepaged_pages_to_scan = pages;
20562306a36Sopenharmony_ci
20662306a36Sopenharmony_ci	return count;
20762306a36Sopenharmony_ci}
20862306a36Sopenharmony_cistatic struct kobj_attribute pages_to_scan_attr =
20962306a36Sopenharmony_ci	__ATTR_RW(pages_to_scan);
21062306a36Sopenharmony_ci
/* sysfs (read-only): total hugepages collapsed by khugepaged so far. */
static ssize_t pages_collapsed_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
	__ATTR_RO(pages_collapsed);
21962306a36Sopenharmony_ci
/* sysfs (read-only): number of completed passes over the full mm list. */
static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
}
static struct kobj_attribute full_scans_attr =
	__ATTR_RO(full_scans);
22862306a36Sopenharmony_ci
/*
 * sysfs: defrag knob.  Thin wrappers delegating to the shared single-flag
 * helpers for TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG.
 */
static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
	__ATTR_RW(defrag);
24462306a36Sopenharmony_ci
24562306a36Sopenharmony_ci/*
24662306a36Sopenharmony_ci * max_ptes_none controls if khugepaged should collapse hugepages over
24762306a36Sopenharmony_ci * any unmapped ptes in turn potentially increasing the memory
24862306a36Sopenharmony_ci * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
24962306a36Sopenharmony_ci * reduce the available free memory in the system as it
25062306a36Sopenharmony_ci * runs. Increasing max_ptes_none will instead potentially reduce the
25162306a36Sopenharmony_ci * free memory in the system during the khugepaged scan.
25262306a36Sopenharmony_ci */
/* sysfs: report max_ptes_none (see the policy comment above). */
static ssize_t max_ptes_none_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
}
25962306a36Sopenharmony_cistatic ssize_t max_ptes_none_store(struct kobject *kobj,
26062306a36Sopenharmony_ci				   struct kobj_attribute *attr,
26162306a36Sopenharmony_ci				   const char *buf, size_t count)
26262306a36Sopenharmony_ci{
26362306a36Sopenharmony_ci	int err;
26462306a36Sopenharmony_ci	unsigned long max_ptes_none;
26562306a36Sopenharmony_ci
26662306a36Sopenharmony_ci	err = kstrtoul(buf, 10, &max_ptes_none);
26762306a36Sopenharmony_ci	if (err || max_ptes_none > HPAGE_PMD_NR - 1)
26862306a36Sopenharmony_ci		return -EINVAL;
26962306a36Sopenharmony_ci
27062306a36Sopenharmony_ci	khugepaged_max_ptes_none = max_ptes_none;
27162306a36Sopenharmony_ci
27262306a36Sopenharmony_ci	return count;
27362306a36Sopenharmony_ci}
27462306a36Sopenharmony_cistatic struct kobj_attribute khugepaged_max_ptes_none_attr =
27562306a36Sopenharmony_ci	__ATTR_RW(max_ptes_none);
27662306a36Sopenharmony_ci
/* sysfs: report max_ptes_swap (max swapped-out ptes tolerated per pmd). */
static ssize_t max_ptes_swap_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
}
28362306a36Sopenharmony_ci
28462306a36Sopenharmony_cistatic ssize_t max_ptes_swap_store(struct kobject *kobj,
28562306a36Sopenharmony_ci				   struct kobj_attribute *attr,
28662306a36Sopenharmony_ci				   const char *buf, size_t count)
28762306a36Sopenharmony_ci{
28862306a36Sopenharmony_ci	int err;
28962306a36Sopenharmony_ci	unsigned long max_ptes_swap;
29062306a36Sopenharmony_ci
29162306a36Sopenharmony_ci	err  = kstrtoul(buf, 10, &max_ptes_swap);
29262306a36Sopenharmony_ci	if (err || max_ptes_swap > HPAGE_PMD_NR - 1)
29362306a36Sopenharmony_ci		return -EINVAL;
29462306a36Sopenharmony_ci
29562306a36Sopenharmony_ci	khugepaged_max_ptes_swap = max_ptes_swap;
29662306a36Sopenharmony_ci
29762306a36Sopenharmony_ci	return count;
29862306a36Sopenharmony_ci}
29962306a36Sopenharmony_ci
30062306a36Sopenharmony_cistatic struct kobj_attribute khugepaged_max_ptes_swap_attr =
30162306a36Sopenharmony_ci	__ATTR_RW(max_ptes_swap);
30262306a36Sopenharmony_ci
/* sysfs: report max_ptes_shared (max shared ptes tolerated per pmd). */
static ssize_t max_ptes_shared_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
}
30962306a36Sopenharmony_ci
31062306a36Sopenharmony_cistatic ssize_t max_ptes_shared_store(struct kobject *kobj,
31162306a36Sopenharmony_ci				     struct kobj_attribute *attr,
31262306a36Sopenharmony_ci				     const char *buf, size_t count)
31362306a36Sopenharmony_ci{
31462306a36Sopenharmony_ci	int err;
31562306a36Sopenharmony_ci	unsigned long max_ptes_shared;
31662306a36Sopenharmony_ci
31762306a36Sopenharmony_ci	err  = kstrtoul(buf, 10, &max_ptes_shared);
31862306a36Sopenharmony_ci	if (err || max_ptes_shared > HPAGE_PMD_NR - 1)
31962306a36Sopenharmony_ci		return -EINVAL;
32062306a36Sopenharmony_ci
32162306a36Sopenharmony_ci	khugepaged_max_ptes_shared = max_ptes_shared;
32262306a36Sopenharmony_ci
32362306a36Sopenharmony_ci	return count;
32462306a36Sopenharmony_ci}
32562306a36Sopenharmony_ci
32662306a36Sopenharmony_cistatic struct kobj_attribute khugepaged_max_ptes_shared_attr =
32762306a36Sopenharmony_ci	__ATTR_RW(max_ptes_shared);
32862306a36Sopenharmony_ci
/* All khugepaged tunables exposed under /sys/kernel/mm/transparent_hugepage/khugepaged/. */
static struct attribute *khugepaged_attr[] = {
	&khugepaged_defrag_attr.attr,
	&khugepaged_max_ptes_none_attr.attr,
	&khugepaged_max_ptes_swap_attr.attr,
	&khugepaged_max_ptes_shared_attr.attr,
	&pages_to_scan_attr.attr,
	&pages_collapsed_attr.attr,
	&full_scans_attr.attr,
	&scan_sleep_millisecs_attr.attr,
	&alloc_sleep_millisecs_attr.attr,
	NULL,
};

/* Registered by the THP sysfs setup code (not in this file). */
struct attribute_group khugepaged_attr_group = {
	.attrs = khugepaged_attr,
	.name = "khugepaged",
};
34662306a36Sopenharmony_ci#endif /* CONFIG_SYSFS */
34762306a36Sopenharmony_ci
/**
 * hugepage_madvise - apply MADV_HUGEPAGE / MADV_NOHUGEPAGE to a vma's flags
 * @vma: the vma being advised
 * @vm_flags: in/out copy of the vma's flags to update
 * @advice: MADV_HUGEPAGE or MADV_NOHUGEPAGE
 *
 * Updates *vm_flags so exactly one of VM_HUGEPAGE/VM_NOHUGEPAGE is set,
 * and on MADV_HUGEPAGE eagerly registers the mm with khugepaged.
 * Always returns 0.
 */
int hugepage_madvise(struct vm_area_struct *vma,
		     unsigned long *vm_flags, int advice)
{
	switch (advice) {
	case MADV_HUGEPAGE:
#ifdef CONFIG_S390
		/*
		 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
		 * can't handle this properly after s390_enable_sie, so we simply
		 * ignore the madvise to prevent qemu from causing a SIGSEGV.
		 */
		if (mm_has_pgste(vma->vm_mm))
			return 0;
#endif
		/* Clear the opposite flag before setting ours. */
		*vm_flags &= ~VM_NOHUGEPAGE;
		*vm_flags |= VM_HUGEPAGE;
		/*
		 * If the vma become good for khugepaged to scan,
		 * register it here without waiting a page fault that
		 * may not happen any time soon.
		 */
		khugepaged_enter_vma(vma, *vm_flags);
		break;
	case MADV_NOHUGEPAGE:
		*vm_flags &= ~VM_HUGEPAGE;
		*vm_flags |= VM_NOHUGEPAGE;
		/*
		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
		 * this vma even if we leave the mm registered in khugepaged if
		 * it got registered before VM_NOHUGEPAGE was set.
		 */
		break;
	}

	return 0;
}
38462306a36Sopenharmony_ci
/*
 * Boot-time setup: create the mm_slot slab cache and pick default
 * tunables (scan 8 pmds worth of ptes per pass; allow up to all-but-one
 * none ptes, 1/8 swap ptes and 1/2 shared ptes per pmd).
 * Returns 0 or -ENOMEM if the cache cannot be created.
 */
int __init khugepaged_init(void)
{
	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
					  sizeof(struct khugepaged_mm_slot),
					  __alignof__(struct khugepaged_mm_slot),
					  0, NULL);
	if (!mm_slot_cache)
		return -ENOMEM;

	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
	khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;

	return 0;
}
40162306a36Sopenharmony_ci
/* Teardown counterpart of khugepaged_init(): release the slab cache. */
void __init khugepaged_destroy(void)
{
	kmem_cache_destroy(mm_slot_cache);
}
40662306a36Sopenharmony_ci
40762306a36Sopenharmony_cistatic inline int hpage_collapse_test_exit(struct mm_struct *mm)
40862306a36Sopenharmony_ci{
40962306a36Sopenharmony_ci	return atomic_read(&mm->mm_users) == 0;
41062306a36Sopenharmony_ci}
41162306a36Sopenharmony_ci
/*
 * Register @mm with khugepaged: allocate an mm_slot, hash it and queue it
 * on the scan list, taking an mm_count reference (mmgrab) that is dropped
 * when the slot is released in __khugepaged_exit() or by the scanner.
 * Idempotent via the MMF_VM_HUGEPAGE bit; silently gives up on slot
 * allocation failure (the mm just won't be scanned).
 */
void __khugepaged_enter(struct mm_struct *mm)
{
	struct khugepaged_mm_slot *mm_slot;
	struct mm_slot *slot;
	int wakeup;

	/* __khugepaged_exit() must not run from under us */
	VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
		return;

	mm_slot = mm_slot_alloc(mm_slot_cache);
	if (!mm_slot)
		return;

	slot = &mm_slot->slot;

	spin_lock(&khugepaged_mm_lock);
	mm_slot_insert(mm_slots_hash, mm, slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little.
	 */
	wakeup = list_empty(&khugepaged_scan.mm_head);
	list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
	spin_unlock(&khugepaged_mm_lock);

	mmgrab(mm);
	/* Only wake the daemon if it had nothing to do before. */
	if (wakeup)
		wake_up_interruptible(&khugepaged_wait);
}
44362306a36Sopenharmony_ci
44462306a36Sopenharmony_civoid khugepaged_enter_vma(struct vm_area_struct *vma,
44562306a36Sopenharmony_ci			  unsigned long vm_flags)
44662306a36Sopenharmony_ci{
44762306a36Sopenharmony_ci	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
44862306a36Sopenharmony_ci	    hugepage_flags_enabled()) {
44962306a36Sopenharmony_ci		if (hugepage_vma_check(vma, vm_flags, false, false, true))
45062306a36Sopenharmony_ci			__khugepaged_enter(vma->vm_mm);
45162306a36Sopenharmony_ci	}
45262306a36Sopenharmony_ci}
45362306a36Sopenharmony_ci
/*
 * Unregister @mm from khugepaged at exit time.  If khugepaged is not
 * currently scanning this mm, the slot is freed immediately and the
 * mm_count reference from __khugepaged_enter() dropped.  If khugepaged
 * IS scanning it, the slot is left for the scanner to release, and the
 * mmap_lock write lock/unlock pair below serializes against it.
 */
void __khugepaged_exit(struct mm_struct *mm)
{
	struct khugepaged_mm_slot *mm_slot;
	struct mm_slot *slot;
	int free = 0;

	spin_lock(&khugepaged_mm_lock);
	slot = mm_slot_lookup(mm_slots_hash, mm);
	mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
		hash_del(&slot->hash);
		list_del(&slot->mm_node);
		free = 1;
	}
	spin_unlock(&khugepaged_mm_lock);

	if (free) {
		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		mm_slot_free(mm_slot_cache, mm_slot);
		mmdrop(mm);
	} else if (mm_slot) {
		/*
		 * This is required to serialize against
		 * hpage_collapse_test_exit() (which is guaranteed to run
		 * under mmap sem read mode). Stop here (after we return all
		 * pagetables will be destroyed) until khugepaged has finished
		 * working on the pagetables under the mmap_lock.
		 */
		mmap_write_lock(mm);
		mmap_write_unlock(mm);
	}
}
48662306a36Sopenharmony_ci
48762306a36Sopenharmony_cistatic void release_pte_folio(struct folio *folio)
48862306a36Sopenharmony_ci{
48962306a36Sopenharmony_ci	node_stat_mod_folio(folio,
49062306a36Sopenharmony_ci			NR_ISOLATED_ANON + folio_is_file_lru(folio),
49162306a36Sopenharmony_ci			-folio_nr_pages(folio));
49262306a36Sopenharmony_ci	folio_unlock(folio);
49362306a36Sopenharmony_ci	folio_putback_lru(folio);
49462306a36Sopenharmony_ci}
49562306a36Sopenharmony_ci
/* struct page wrapper around release_pte_folio(). */
static void release_pte_page(struct page *page)
{
	struct folio *folio = page_folio(page);

	release_pte_folio(folio);
}
50062306a36Sopenharmony_ci
/*
 * Release every folio isolated so far by a failed collapse attempt.
 * Walks the pte range [pte, _pte) backwards releasing small folios,
 * then releases the large folios queued on @compound_pagelist
 * (large folios are skipped in the pte walk precisely because they
 * are tracked on that list).
 */
static void release_pte_pages(pte_t *pte, pte_t *_pte,
		struct list_head *compound_pagelist)
{
	struct folio *folio, *tmp;

	while (--_pte >= pte) {
		pte_t pteval = ptep_get(_pte);
		unsigned long pfn;

		if (pte_none(pteval))
			continue;
		pfn = pte_pfn(pteval);
		if (is_zero_pfn(pfn))
			continue;
		folio = pfn_folio(pfn);
		/* Large folios are released via compound_pagelist below. */
		if (folio_test_large(folio))
			continue;
		release_pte_folio(folio);
	}

	list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
		list_del(&folio->lru);
		release_pte_folio(folio);
	}
}
52662306a36Sopenharmony_ci
52762306a36Sopenharmony_cistatic bool is_refcount_suitable(struct page *page)
52862306a36Sopenharmony_ci{
52962306a36Sopenharmony_ci	int expected_refcount;
53062306a36Sopenharmony_ci
53162306a36Sopenharmony_ci	expected_refcount = total_mapcount(page);
53262306a36Sopenharmony_ci	if (PageSwapCache(page))
53362306a36Sopenharmony_ci		expected_refcount += compound_nr(page);
53462306a36Sopenharmony_ci
53562306a36Sopenharmony_ci	return page_count(page) == expected_refcount;
53662306a36Sopenharmony_ci}
53762306a36Sopenharmony_ci
53862306a36Sopenharmony_cistatic int __collapse_huge_page_isolate(struct vm_area_struct *vma,
53962306a36Sopenharmony_ci					unsigned long address,
54062306a36Sopenharmony_ci					pte_t *pte,
54162306a36Sopenharmony_ci					struct collapse_control *cc,
54262306a36Sopenharmony_ci					struct list_head *compound_pagelist)
54362306a36Sopenharmony_ci{
54462306a36Sopenharmony_ci	struct page *page = NULL;
54562306a36Sopenharmony_ci	pte_t *_pte;
54662306a36Sopenharmony_ci	int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
54762306a36Sopenharmony_ci	bool writable = false;
54862306a36Sopenharmony_ci
54962306a36Sopenharmony_ci	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
55062306a36Sopenharmony_ci	     _pte++, address += PAGE_SIZE) {
55162306a36Sopenharmony_ci		pte_t pteval = ptep_get(_pte);
55262306a36Sopenharmony_ci		if (pte_none(pteval) || (pte_present(pteval) &&
55362306a36Sopenharmony_ci				is_zero_pfn(pte_pfn(pteval)))) {
55462306a36Sopenharmony_ci			++none_or_zero;
55562306a36Sopenharmony_ci			if (!userfaultfd_armed(vma) &&
55662306a36Sopenharmony_ci			    (!cc->is_khugepaged ||
55762306a36Sopenharmony_ci			     none_or_zero <= khugepaged_max_ptes_none)) {
55862306a36Sopenharmony_ci				continue;
55962306a36Sopenharmony_ci			} else {
56062306a36Sopenharmony_ci				result = SCAN_EXCEED_NONE_PTE;
56162306a36Sopenharmony_ci				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
56262306a36Sopenharmony_ci				goto out;
56362306a36Sopenharmony_ci			}
56462306a36Sopenharmony_ci		}
56562306a36Sopenharmony_ci		if (!pte_present(pteval)) {
56662306a36Sopenharmony_ci			result = SCAN_PTE_NON_PRESENT;
56762306a36Sopenharmony_ci			goto out;
56862306a36Sopenharmony_ci		}
56962306a36Sopenharmony_ci		if (pte_uffd_wp(pteval)) {
57062306a36Sopenharmony_ci			result = SCAN_PTE_UFFD_WP;
57162306a36Sopenharmony_ci			goto out;
57262306a36Sopenharmony_ci		}
57362306a36Sopenharmony_ci		page = vm_normal_page(vma, address, pteval);
57462306a36Sopenharmony_ci		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
57562306a36Sopenharmony_ci			result = SCAN_PAGE_NULL;
57662306a36Sopenharmony_ci			goto out;
57762306a36Sopenharmony_ci		}
57862306a36Sopenharmony_ci
57962306a36Sopenharmony_ci		VM_BUG_ON_PAGE(!PageAnon(page), page);
58062306a36Sopenharmony_ci
58162306a36Sopenharmony_ci		if (page_mapcount(page) > 1) {
58262306a36Sopenharmony_ci			++shared;
58362306a36Sopenharmony_ci			if (cc->is_khugepaged &&
58462306a36Sopenharmony_ci			    shared > khugepaged_max_ptes_shared) {
58562306a36Sopenharmony_ci				result = SCAN_EXCEED_SHARED_PTE;
58662306a36Sopenharmony_ci				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
58762306a36Sopenharmony_ci				goto out;
58862306a36Sopenharmony_ci			}
58962306a36Sopenharmony_ci		}
59062306a36Sopenharmony_ci
59162306a36Sopenharmony_ci		if (PageCompound(page)) {
59262306a36Sopenharmony_ci			struct page *p;
59362306a36Sopenharmony_ci			page = compound_head(page);
59462306a36Sopenharmony_ci
59562306a36Sopenharmony_ci			/*
59662306a36Sopenharmony_ci			 * Check if we have dealt with the compound page
59762306a36Sopenharmony_ci			 * already
59862306a36Sopenharmony_ci			 */
59962306a36Sopenharmony_ci			list_for_each_entry(p, compound_pagelist, lru) {
60062306a36Sopenharmony_ci				if (page == p)
60162306a36Sopenharmony_ci					goto next;
60262306a36Sopenharmony_ci			}
60362306a36Sopenharmony_ci		}
60462306a36Sopenharmony_ci
60562306a36Sopenharmony_ci		/*
60662306a36Sopenharmony_ci		 * We can do it before isolate_lru_page because the
60762306a36Sopenharmony_ci		 * page can't be freed from under us. NOTE: PG_lock
60862306a36Sopenharmony_ci		 * is needed to serialize against split_huge_page
60962306a36Sopenharmony_ci		 * when invoked from the VM.
61062306a36Sopenharmony_ci		 */
61162306a36Sopenharmony_ci		if (!trylock_page(page)) {
61262306a36Sopenharmony_ci			result = SCAN_PAGE_LOCK;
61362306a36Sopenharmony_ci			goto out;
61462306a36Sopenharmony_ci		}
61562306a36Sopenharmony_ci
61662306a36Sopenharmony_ci		/*
61762306a36Sopenharmony_ci		 * Check if the page has any GUP (or other external) pins.
61862306a36Sopenharmony_ci		 *
61962306a36Sopenharmony_ci		 * The page table that maps the page has been already unlinked
62062306a36Sopenharmony_ci		 * from the page table tree and this process cannot get
62162306a36Sopenharmony_ci		 * an additional pin on the page.
62262306a36Sopenharmony_ci		 *
62362306a36Sopenharmony_ci		 * New pins can come later if the page is shared across fork,
62462306a36Sopenharmony_ci		 * but not from this process. The other process cannot write to
62562306a36Sopenharmony_ci		 * the page, only trigger CoW.
62662306a36Sopenharmony_ci		 */
62762306a36Sopenharmony_ci		if (!is_refcount_suitable(page)) {
62862306a36Sopenharmony_ci			unlock_page(page);
62962306a36Sopenharmony_ci			result = SCAN_PAGE_COUNT;
63062306a36Sopenharmony_ci			goto out;
63162306a36Sopenharmony_ci		}
63262306a36Sopenharmony_ci
63362306a36Sopenharmony_ci		/*
63462306a36Sopenharmony_ci		 * Isolate the page to avoid collapsing an hugepage
63562306a36Sopenharmony_ci		 * currently in use by the VM.
63662306a36Sopenharmony_ci		 */
63762306a36Sopenharmony_ci		if (!isolate_lru_page(page)) {
63862306a36Sopenharmony_ci			unlock_page(page);
63962306a36Sopenharmony_ci			result = SCAN_DEL_PAGE_LRU;
64062306a36Sopenharmony_ci			goto out;
64162306a36Sopenharmony_ci		}
64262306a36Sopenharmony_ci		mod_node_page_state(page_pgdat(page),
64362306a36Sopenharmony_ci				NR_ISOLATED_ANON + page_is_file_lru(page),
64462306a36Sopenharmony_ci				compound_nr(page));
64562306a36Sopenharmony_ci		VM_BUG_ON_PAGE(!PageLocked(page), page);
64662306a36Sopenharmony_ci		VM_BUG_ON_PAGE(PageLRU(page), page);
64762306a36Sopenharmony_ci
64862306a36Sopenharmony_ci		if (PageCompound(page))
64962306a36Sopenharmony_ci			list_add_tail(&page->lru, compound_pagelist);
65062306a36Sopenharmony_cinext:
65162306a36Sopenharmony_ci		/*
65262306a36Sopenharmony_ci		 * If collapse was initiated by khugepaged, check that there is
65362306a36Sopenharmony_ci		 * enough young pte to justify collapsing the page
65462306a36Sopenharmony_ci		 */
65562306a36Sopenharmony_ci		if (cc->is_khugepaged &&
65662306a36Sopenharmony_ci		    (pte_young(pteval) || page_is_young(page) ||
65762306a36Sopenharmony_ci		     PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
65862306a36Sopenharmony_ci								     address)))
65962306a36Sopenharmony_ci			referenced++;
66062306a36Sopenharmony_ci
66162306a36Sopenharmony_ci		if (pte_write(pteval))
66262306a36Sopenharmony_ci			writable = true;
66362306a36Sopenharmony_ci	}
66462306a36Sopenharmony_ci
66562306a36Sopenharmony_ci	if (unlikely(!writable)) {
66662306a36Sopenharmony_ci		result = SCAN_PAGE_RO;
66762306a36Sopenharmony_ci	} else if (unlikely(cc->is_khugepaged && !referenced)) {
66862306a36Sopenharmony_ci		result = SCAN_LACK_REFERENCED_PAGE;
66962306a36Sopenharmony_ci	} else {
67062306a36Sopenharmony_ci		result = SCAN_SUCCEED;
67162306a36Sopenharmony_ci		trace_mm_collapse_huge_page_isolate(page, none_or_zero,
67262306a36Sopenharmony_ci						    referenced, writable, result);
67362306a36Sopenharmony_ci		return result;
67462306a36Sopenharmony_ci	}
67562306a36Sopenharmony_ciout:
67662306a36Sopenharmony_ci	release_pte_pages(pte, _pte, compound_pagelist);
67762306a36Sopenharmony_ci	trace_mm_collapse_huge_page_isolate(page, none_or_zero,
67862306a36Sopenharmony_ci					    referenced, writable, result);
67962306a36Sopenharmony_ci	return result;
68062306a36Sopenharmony_ci}
68162306a36Sopenharmony_ci
68262306a36Sopenharmony_cistatic void __collapse_huge_page_copy_succeeded(pte_t *pte,
68362306a36Sopenharmony_ci						struct vm_area_struct *vma,
68462306a36Sopenharmony_ci						unsigned long address,
68562306a36Sopenharmony_ci						spinlock_t *ptl,
68662306a36Sopenharmony_ci						struct list_head *compound_pagelist)
68762306a36Sopenharmony_ci{
68862306a36Sopenharmony_ci	struct page *src_page;
68962306a36Sopenharmony_ci	struct page *tmp;
69062306a36Sopenharmony_ci	pte_t *_pte;
69162306a36Sopenharmony_ci	pte_t pteval;
69262306a36Sopenharmony_ci
69362306a36Sopenharmony_ci	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
69462306a36Sopenharmony_ci	     _pte++, address += PAGE_SIZE) {
69562306a36Sopenharmony_ci		pteval = ptep_get(_pte);
69662306a36Sopenharmony_ci		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
69762306a36Sopenharmony_ci			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
69862306a36Sopenharmony_ci			if (is_zero_pfn(pte_pfn(pteval))) {
69962306a36Sopenharmony_ci				/*
70062306a36Sopenharmony_ci				 * ptl mostly unnecessary.
70162306a36Sopenharmony_ci				 */
70262306a36Sopenharmony_ci				spin_lock(ptl);
70362306a36Sopenharmony_ci				ptep_clear(vma->vm_mm, address, _pte);
70462306a36Sopenharmony_ci				spin_unlock(ptl);
70562306a36Sopenharmony_ci				ksm_might_unmap_zero_page(vma->vm_mm, pteval);
70662306a36Sopenharmony_ci			}
70762306a36Sopenharmony_ci		} else {
70862306a36Sopenharmony_ci			src_page = pte_page(pteval);
70962306a36Sopenharmony_ci			if (!PageCompound(src_page))
71062306a36Sopenharmony_ci				release_pte_page(src_page);
71162306a36Sopenharmony_ci			/*
71262306a36Sopenharmony_ci			 * ptl mostly unnecessary, but preempt has to
71362306a36Sopenharmony_ci			 * be disabled to update the per-cpu stats
71462306a36Sopenharmony_ci			 * inside page_remove_rmap().
71562306a36Sopenharmony_ci			 */
71662306a36Sopenharmony_ci			spin_lock(ptl);
71762306a36Sopenharmony_ci			ptep_clear(vma->vm_mm, address, _pte);
71862306a36Sopenharmony_ci			page_remove_rmap(src_page, vma, false);
71962306a36Sopenharmony_ci			spin_unlock(ptl);
72062306a36Sopenharmony_ci			free_page_and_swap_cache(src_page);
72162306a36Sopenharmony_ci		}
72262306a36Sopenharmony_ci	}
72362306a36Sopenharmony_ci
72462306a36Sopenharmony_ci	list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
72562306a36Sopenharmony_ci		list_del(&src_page->lru);
72662306a36Sopenharmony_ci		mod_node_page_state(page_pgdat(src_page),
72762306a36Sopenharmony_ci				    NR_ISOLATED_ANON + page_is_file_lru(src_page),
72862306a36Sopenharmony_ci				    -compound_nr(src_page));
72962306a36Sopenharmony_ci		unlock_page(src_page);
73062306a36Sopenharmony_ci		free_swap_cache(src_page);
73162306a36Sopenharmony_ci		putback_lru_page(src_page);
73262306a36Sopenharmony_ci	}
73362306a36Sopenharmony_ci}
73462306a36Sopenharmony_ci
73562306a36Sopenharmony_cistatic void __collapse_huge_page_copy_failed(pte_t *pte,
73662306a36Sopenharmony_ci					     pmd_t *pmd,
73762306a36Sopenharmony_ci					     pmd_t orig_pmd,
73862306a36Sopenharmony_ci					     struct vm_area_struct *vma,
73962306a36Sopenharmony_ci					     struct list_head *compound_pagelist)
74062306a36Sopenharmony_ci{
74162306a36Sopenharmony_ci	spinlock_t *pmd_ptl;
74262306a36Sopenharmony_ci
74362306a36Sopenharmony_ci	/*
74462306a36Sopenharmony_ci	 * Re-establish the PMD to point to the original page table
74562306a36Sopenharmony_ci	 * entry. Restoring PMD needs to be done prior to releasing
74662306a36Sopenharmony_ci	 * pages. Since pages are still isolated and locked here,
74762306a36Sopenharmony_ci	 * acquiring anon_vma_lock_write is unnecessary.
74862306a36Sopenharmony_ci	 */
74962306a36Sopenharmony_ci	pmd_ptl = pmd_lock(vma->vm_mm, pmd);
75062306a36Sopenharmony_ci	pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
75162306a36Sopenharmony_ci	spin_unlock(pmd_ptl);
75262306a36Sopenharmony_ci	/*
75362306a36Sopenharmony_ci	 * Release both raw and compound pages isolated
75462306a36Sopenharmony_ci	 * in __collapse_huge_page_isolate.
75562306a36Sopenharmony_ci	 */
75662306a36Sopenharmony_ci	release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
75762306a36Sopenharmony_ci}
75862306a36Sopenharmony_ci
75962306a36Sopenharmony_ci/*
76062306a36Sopenharmony_ci * __collapse_huge_page_copy - attempts to copy memory contents from raw
76162306a36Sopenharmony_ci * pages to a hugepage. Cleans up the raw pages if copying succeeds;
76262306a36Sopenharmony_ci * otherwise restores the original page table and releases isolated raw pages.
76362306a36Sopenharmony_ci * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
76462306a36Sopenharmony_ci *
76562306a36Sopenharmony_ci * @pte: starting of the PTEs to copy from
76662306a36Sopenharmony_ci * @page: the new hugepage to copy contents to
76762306a36Sopenharmony_ci * @pmd: pointer to the new hugepage's PMD
76862306a36Sopenharmony_ci * @orig_pmd: the original raw pages' PMD
76962306a36Sopenharmony_ci * @vma: the original raw pages' virtual memory area
77062306a36Sopenharmony_ci * @address: starting address to copy
77162306a36Sopenharmony_ci * @ptl: lock on raw pages' PTEs
77262306a36Sopenharmony_ci * @compound_pagelist: list that stores compound pages
77362306a36Sopenharmony_ci */
77462306a36Sopenharmony_cistatic int __collapse_huge_page_copy(pte_t *pte,
77562306a36Sopenharmony_ci				     struct page *page,
77662306a36Sopenharmony_ci				     pmd_t *pmd,
77762306a36Sopenharmony_ci				     pmd_t orig_pmd,
77862306a36Sopenharmony_ci				     struct vm_area_struct *vma,
77962306a36Sopenharmony_ci				     unsigned long address,
78062306a36Sopenharmony_ci				     spinlock_t *ptl,
78162306a36Sopenharmony_ci				     struct list_head *compound_pagelist)
78262306a36Sopenharmony_ci{
78362306a36Sopenharmony_ci	struct page *src_page;
78462306a36Sopenharmony_ci	pte_t *_pte;
78562306a36Sopenharmony_ci	pte_t pteval;
78662306a36Sopenharmony_ci	unsigned long _address;
78762306a36Sopenharmony_ci	int result = SCAN_SUCCEED;
78862306a36Sopenharmony_ci
78962306a36Sopenharmony_ci	/*
79062306a36Sopenharmony_ci	 * Copying pages' contents is subject to memory poison at any iteration.
79162306a36Sopenharmony_ci	 */
79262306a36Sopenharmony_ci	for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR;
79362306a36Sopenharmony_ci	     _pte++, page++, _address += PAGE_SIZE) {
79462306a36Sopenharmony_ci		pteval = ptep_get(_pte);
79562306a36Sopenharmony_ci		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
79662306a36Sopenharmony_ci			clear_user_highpage(page, _address);
79762306a36Sopenharmony_ci			continue;
79862306a36Sopenharmony_ci		}
79962306a36Sopenharmony_ci		src_page = pte_page(pteval);
80062306a36Sopenharmony_ci		if (copy_mc_user_highpage(page, src_page, _address, vma) > 0) {
80162306a36Sopenharmony_ci			result = SCAN_COPY_MC;
80262306a36Sopenharmony_ci			break;
80362306a36Sopenharmony_ci		}
80462306a36Sopenharmony_ci	}
80562306a36Sopenharmony_ci
80662306a36Sopenharmony_ci	if (likely(result == SCAN_SUCCEED))
80762306a36Sopenharmony_ci		__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
80862306a36Sopenharmony_ci						    compound_pagelist);
80962306a36Sopenharmony_ci	else
81062306a36Sopenharmony_ci		__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
81162306a36Sopenharmony_ci						 compound_pagelist);
81262306a36Sopenharmony_ci
81362306a36Sopenharmony_ci	return result;
81462306a36Sopenharmony_ci}
81562306a36Sopenharmony_ci
81662306a36Sopenharmony_cistatic void khugepaged_alloc_sleep(void)
81762306a36Sopenharmony_ci{
81862306a36Sopenharmony_ci	DEFINE_WAIT(wait);
81962306a36Sopenharmony_ci
82062306a36Sopenharmony_ci	add_wait_queue(&khugepaged_wait, &wait);
82162306a36Sopenharmony_ci	__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
82262306a36Sopenharmony_ci	schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
82362306a36Sopenharmony_ci	remove_wait_queue(&khugepaged_wait, &wait);
82462306a36Sopenharmony_ci}
82562306a36Sopenharmony_ci
82662306a36Sopenharmony_cistruct collapse_control khugepaged_collapse_control = {
82762306a36Sopenharmony_ci	.is_khugepaged = true,
82862306a36Sopenharmony_ci};
82962306a36Sopenharmony_ci
83062306a36Sopenharmony_cistatic bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
83162306a36Sopenharmony_ci{
83262306a36Sopenharmony_ci	int i;
83362306a36Sopenharmony_ci
83462306a36Sopenharmony_ci	/*
83562306a36Sopenharmony_ci	 * If node_reclaim_mode is disabled, then no extra effort is made to
83662306a36Sopenharmony_ci	 * allocate memory locally.
83762306a36Sopenharmony_ci	 */
83862306a36Sopenharmony_ci	if (!node_reclaim_enabled())
83962306a36Sopenharmony_ci		return false;
84062306a36Sopenharmony_ci
84162306a36Sopenharmony_ci	/* If there is a count for this node already, it must be acceptable */
84262306a36Sopenharmony_ci	if (cc->node_load[nid])
84362306a36Sopenharmony_ci		return false;
84462306a36Sopenharmony_ci
84562306a36Sopenharmony_ci	for (i = 0; i < MAX_NUMNODES; i++) {
84662306a36Sopenharmony_ci		if (!cc->node_load[i])
84762306a36Sopenharmony_ci			continue;
84862306a36Sopenharmony_ci		if (node_distance(nid, i) > node_reclaim_distance)
84962306a36Sopenharmony_ci			return true;
85062306a36Sopenharmony_ci	}
85162306a36Sopenharmony_ci	return false;
85262306a36Sopenharmony_ci}
85362306a36Sopenharmony_ci
85462306a36Sopenharmony_ci#define khugepaged_defrag()					\
85562306a36Sopenharmony_ci	(transparent_hugepage_flags &				\
85662306a36Sopenharmony_ci	 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
85762306a36Sopenharmony_ci
85862306a36Sopenharmony_ci/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
85962306a36Sopenharmony_cistatic inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
86062306a36Sopenharmony_ci{
86162306a36Sopenharmony_ci	return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
86262306a36Sopenharmony_ci}
86362306a36Sopenharmony_ci
86462306a36Sopenharmony_ci#ifdef CONFIG_NUMA
86562306a36Sopenharmony_cistatic int hpage_collapse_find_target_node(struct collapse_control *cc)
86662306a36Sopenharmony_ci{
86762306a36Sopenharmony_ci	int nid, target_node = 0, max_value = 0;
86862306a36Sopenharmony_ci
86962306a36Sopenharmony_ci	/* find first node with max normal pages hit */
87062306a36Sopenharmony_ci	for (nid = 0; nid < MAX_NUMNODES; nid++)
87162306a36Sopenharmony_ci		if (cc->node_load[nid] > max_value) {
87262306a36Sopenharmony_ci			max_value = cc->node_load[nid];
87362306a36Sopenharmony_ci			target_node = nid;
87462306a36Sopenharmony_ci		}
87562306a36Sopenharmony_ci
87662306a36Sopenharmony_ci	for_each_online_node(nid) {
87762306a36Sopenharmony_ci		if (max_value == cc->node_load[nid])
87862306a36Sopenharmony_ci			node_set(nid, cc->alloc_nmask);
87962306a36Sopenharmony_ci	}
88062306a36Sopenharmony_ci
88162306a36Sopenharmony_ci	return target_node;
88262306a36Sopenharmony_ci}
88362306a36Sopenharmony_ci#else
88462306a36Sopenharmony_cistatic int hpage_collapse_find_target_node(struct collapse_control *cc)
88562306a36Sopenharmony_ci{
88662306a36Sopenharmony_ci	return 0;
88762306a36Sopenharmony_ci}
88862306a36Sopenharmony_ci#endif
88962306a36Sopenharmony_ci
89062306a36Sopenharmony_cistatic bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
89162306a36Sopenharmony_ci				      nodemask_t *nmask)
89262306a36Sopenharmony_ci{
89362306a36Sopenharmony_ci	*hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
89462306a36Sopenharmony_ci	if (unlikely(!*hpage)) {
89562306a36Sopenharmony_ci		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
89662306a36Sopenharmony_ci		return false;
89762306a36Sopenharmony_ci	}
89862306a36Sopenharmony_ci
89962306a36Sopenharmony_ci	folio_prep_large_rmappable((struct folio *)*hpage);
90062306a36Sopenharmony_ci	count_vm_event(THP_COLLAPSE_ALLOC);
90162306a36Sopenharmony_ci	return true;
90262306a36Sopenharmony_ci}
90362306a36Sopenharmony_ci
90462306a36Sopenharmony_ci/*
90562306a36Sopenharmony_ci * If mmap_lock temporarily dropped, revalidate vma
90662306a36Sopenharmony_ci * before taking mmap_lock.
90762306a36Sopenharmony_ci * Returns enum scan_result value.
90862306a36Sopenharmony_ci */
90962306a36Sopenharmony_ci
91062306a36Sopenharmony_cistatic int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
91162306a36Sopenharmony_ci				   bool expect_anon,
91262306a36Sopenharmony_ci				   struct vm_area_struct **vmap,
91362306a36Sopenharmony_ci				   struct collapse_control *cc)
91462306a36Sopenharmony_ci{
91562306a36Sopenharmony_ci	struct vm_area_struct *vma;
91662306a36Sopenharmony_ci
91762306a36Sopenharmony_ci	if (unlikely(hpage_collapse_test_exit(mm)))
91862306a36Sopenharmony_ci		return SCAN_ANY_PROCESS;
91962306a36Sopenharmony_ci
92062306a36Sopenharmony_ci	*vmap = vma = find_vma(mm, address);
92162306a36Sopenharmony_ci	if (!vma)
92262306a36Sopenharmony_ci		return SCAN_VMA_NULL;
92362306a36Sopenharmony_ci
92462306a36Sopenharmony_ci	if (!transhuge_vma_suitable(vma, address))
92562306a36Sopenharmony_ci		return SCAN_ADDRESS_RANGE;
92662306a36Sopenharmony_ci	if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
92762306a36Sopenharmony_ci				cc->is_khugepaged))
92862306a36Sopenharmony_ci		return SCAN_VMA_CHECK;
92962306a36Sopenharmony_ci	/*
93062306a36Sopenharmony_ci	 * Anon VMA expected, the address may be unmapped then
93162306a36Sopenharmony_ci	 * remapped to file after khugepaged reaquired the mmap_lock.
93262306a36Sopenharmony_ci	 *
93362306a36Sopenharmony_ci	 * hugepage_vma_check may return true for qualified file
93462306a36Sopenharmony_ci	 * vmas.
93562306a36Sopenharmony_ci	 */
93662306a36Sopenharmony_ci	if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
93762306a36Sopenharmony_ci		return SCAN_PAGE_ANON;
93862306a36Sopenharmony_ci	return SCAN_SUCCEED;
93962306a36Sopenharmony_ci}
94062306a36Sopenharmony_ci
94162306a36Sopenharmony_cistatic int find_pmd_or_thp_or_none(struct mm_struct *mm,
94262306a36Sopenharmony_ci				   unsigned long address,
94362306a36Sopenharmony_ci				   pmd_t **pmd)
94462306a36Sopenharmony_ci{
94562306a36Sopenharmony_ci	pmd_t pmde;
94662306a36Sopenharmony_ci
94762306a36Sopenharmony_ci	*pmd = mm_find_pmd(mm, address);
94862306a36Sopenharmony_ci	if (!*pmd)
94962306a36Sopenharmony_ci		return SCAN_PMD_NULL;
95062306a36Sopenharmony_ci
95162306a36Sopenharmony_ci	pmde = pmdp_get_lockless(*pmd);
95262306a36Sopenharmony_ci	if (pmd_none(pmde))
95362306a36Sopenharmony_ci		return SCAN_PMD_NONE;
95462306a36Sopenharmony_ci	if (!pmd_present(pmde))
95562306a36Sopenharmony_ci		return SCAN_PMD_NULL;
95662306a36Sopenharmony_ci	if (pmd_trans_huge(pmde))
95762306a36Sopenharmony_ci		return SCAN_PMD_MAPPED;
95862306a36Sopenharmony_ci	if (pmd_devmap(pmde))
95962306a36Sopenharmony_ci		return SCAN_PMD_NULL;
96062306a36Sopenharmony_ci	if (pmd_bad(pmde))
96162306a36Sopenharmony_ci		return SCAN_PMD_NULL;
96262306a36Sopenharmony_ci	return SCAN_SUCCEED;
96362306a36Sopenharmony_ci}
96462306a36Sopenharmony_ci
96562306a36Sopenharmony_cistatic int check_pmd_still_valid(struct mm_struct *mm,
96662306a36Sopenharmony_ci				 unsigned long address,
96762306a36Sopenharmony_ci				 pmd_t *pmd)
96862306a36Sopenharmony_ci{
96962306a36Sopenharmony_ci	pmd_t *new_pmd;
97062306a36Sopenharmony_ci	int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);
97162306a36Sopenharmony_ci
97262306a36Sopenharmony_ci	if (result != SCAN_SUCCEED)
97362306a36Sopenharmony_ci		return result;
97462306a36Sopenharmony_ci	if (new_pmd != pmd)
97562306a36Sopenharmony_ci		return SCAN_FAIL;
97662306a36Sopenharmony_ci	return SCAN_SUCCEED;
97762306a36Sopenharmony_ci}
97862306a36Sopenharmony_ci
97962306a36Sopenharmony_ci/*
98062306a36Sopenharmony_ci * Bring missing pages in from swap, to complete THP collapse.
98162306a36Sopenharmony_ci * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
98262306a36Sopenharmony_ci *
98362306a36Sopenharmony_ci * Called and returns without pte mapped or spinlocks held.
98462306a36Sopenharmony_ci * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
98562306a36Sopenharmony_ci */
98662306a36Sopenharmony_cistatic int __collapse_huge_page_swapin(struct mm_struct *mm,
98762306a36Sopenharmony_ci				       struct vm_area_struct *vma,
98862306a36Sopenharmony_ci				       unsigned long haddr, pmd_t *pmd,
98962306a36Sopenharmony_ci				       int referenced)
99062306a36Sopenharmony_ci{
99162306a36Sopenharmony_ci	int swapped_in = 0;
99262306a36Sopenharmony_ci	vm_fault_t ret = 0;
99362306a36Sopenharmony_ci	unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
99462306a36Sopenharmony_ci	int result;
99562306a36Sopenharmony_ci	pte_t *pte = NULL;
99662306a36Sopenharmony_ci	spinlock_t *ptl;
99762306a36Sopenharmony_ci
99862306a36Sopenharmony_ci	for (address = haddr; address < end; address += PAGE_SIZE) {
99962306a36Sopenharmony_ci		struct vm_fault vmf = {
100062306a36Sopenharmony_ci			.vma = vma,
100162306a36Sopenharmony_ci			.address = address,
100262306a36Sopenharmony_ci			.pgoff = linear_page_index(vma, address),
100362306a36Sopenharmony_ci			.flags = FAULT_FLAG_ALLOW_RETRY,
100462306a36Sopenharmony_ci			.pmd = pmd,
100562306a36Sopenharmony_ci		};
100662306a36Sopenharmony_ci
100762306a36Sopenharmony_ci		if (!pte++) {
100862306a36Sopenharmony_ci			pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
100962306a36Sopenharmony_ci			if (!pte) {
101062306a36Sopenharmony_ci				mmap_read_unlock(mm);
101162306a36Sopenharmony_ci				result = SCAN_PMD_NULL;
101262306a36Sopenharmony_ci				goto out;
101362306a36Sopenharmony_ci			}
101462306a36Sopenharmony_ci		}
101562306a36Sopenharmony_ci
101662306a36Sopenharmony_ci		vmf.orig_pte = ptep_get_lockless(pte);
101762306a36Sopenharmony_ci		if (!is_swap_pte(vmf.orig_pte))
101862306a36Sopenharmony_ci			continue;
101962306a36Sopenharmony_ci
102062306a36Sopenharmony_ci		vmf.pte = pte;
102162306a36Sopenharmony_ci		vmf.ptl = ptl;
102262306a36Sopenharmony_ci		ret = do_swap_page(&vmf);
102362306a36Sopenharmony_ci		/* Which unmaps pte (after perhaps re-checking the entry) */
102462306a36Sopenharmony_ci		pte = NULL;
102562306a36Sopenharmony_ci
102662306a36Sopenharmony_ci		/*
102762306a36Sopenharmony_ci		 * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
102862306a36Sopenharmony_ci		 * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
102962306a36Sopenharmony_ci		 * we do not retry here and swap entry will remain in pagetable
103062306a36Sopenharmony_ci		 * resulting in later failure.
103162306a36Sopenharmony_ci		 */
103262306a36Sopenharmony_ci		if (ret & VM_FAULT_RETRY) {
103362306a36Sopenharmony_ci			/* Likely, but not guaranteed, that page lock failed */
103462306a36Sopenharmony_ci			result = SCAN_PAGE_LOCK;
103562306a36Sopenharmony_ci			goto out;
103662306a36Sopenharmony_ci		}
103762306a36Sopenharmony_ci		if (ret & VM_FAULT_ERROR) {
103862306a36Sopenharmony_ci			mmap_read_unlock(mm);
103962306a36Sopenharmony_ci			result = SCAN_FAIL;
104062306a36Sopenharmony_ci			goto out;
104162306a36Sopenharmony_ci		}
104262306a36Sopenharmony_ci		swapped_in++;
104362306a36Sopenharmony_ci	}
104462306a36Sopenharmony_ci
104562306a36Sopenharmony_ci	if (pte)
104662306a36Sopenharmony_ci		pte_unmap(pte);
104762306a36Sopenharmony_ci
104862306a36Sopenharmony_ci	/* Drain LRU cache to remove extra pin on the swapped in pages */
104962306a36Sopenharmony_ci	if (swapped_in)
105062306a36Sopenharmony_ci		lru_add_drain();
105162306a36Sopenharmony_ci
105262306a36Sopenharmony_ci	result = SCAN_SUCCEED;
105362306a36Sopenharmony_ciout:
105462306a36Sopenharmony_ci	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
105562306a36Sopenharmony_ci	return result;
105662306a36Sopenharmony_ci}
105762306a36Sopenharmony_ci
105862306a36Sopenharmony_cistatic int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
105962306a36Sopenharmony_ci			      struct collapse_control *cc)
106062306a36Sopenharmony_ci{
106162306a36Sopenharmony_ci	gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
106262306a36Sopenharmony_ci		     GFP_TRANSHUGE);
106362306a36Sopenharmony_ci	int node = hpage_collapse_find_target_node(cc);
106462306a36Sopenharmony_ci	struct folio *folio;
106562306a36Sopenharmony_ci
106662306a36Sopenharmony_ci	if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
106762306a36Sopenharmony_ci		return SCAN_ALLOC_HUGE_PAGE_FAIL;
106862306a36Sopenharmony_ci
106962306a36Sopenharmony_ci	folio = page_folio(*hpage);
107062306a36Sopenharmony_ci	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
107162306a36Sopenharmony_ci		folio_put(folio);
107262306a36Sopenharmony_ci		*hpage = NULL;
107362306a36Sopenharmony_ci		return SCAN_CGROUP_CHARGE_FAIL;
107462306a36Sopenharmony_ci	}
107562306a36Sopenharmony_ci	count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
107662306a36Sopenharmony_ci
107762306a36Sopenharmony_ci	return SCAN_SUCCEED;
107862306a36Sopenharmony_ci}
107962306a36Sopenharmony_ci
108062306a36Sopenharmony_cistatic int collapse_huge_page(struct mm_struct *mm, unsigned long address,
108162306a36Sopenharmony_ci			      int referenced, int unmapped,
108262306a36Sopenharmony_ci			      struct collapse_control *cc)
108362306a36Sopenharmony_ci{
108462306a36Sopenharmony_ci	LIST_HEAD(compound_pagelist);
108562306a36Sopenharmony_ci	pmd_t *pmd, _pmd;
108662306a36Sopenharmony_ci	pte_t *pte;
108762306a36Sopenharmony_ci	pgtable_t pgtable;
108862306a36Sopenharmony_ci	struct page *hpage;
108962306a36Sopenharmony_ci	spinlock_t *pmd_ptl, *pte_ptl;
109062306a36Sopenharmony_ci	int result = SCAN_FAIL;
109162306a36Sopenharmony_ci	struct vm_area_struct *vma;
109262306a36Sopenharmony_ci	struct mmu_notifier_range range;
109362306a36Sopenharmony_ci
109462306a36Sopenharmony_ci	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
109562306a36Sopenharmony_ci
109662306a36Sopenharmony_ci	/*
109762306a36Sopenharmony_ci	 * Before allocating the hugepage, release the mmap_lock read lock.
109862306a36Sopenharmony_ci	 * The allocation can take potentially a long time if it involves
109962306a36Sopenharmony_ci	 * sync compaction, and we do not need to hold the mmap_lock during
110062306a36Sopenharmony_ci	 * that. We will recheck the vma after taking it again in write mode.
110162306a36Sopenharmony_ci	 */
110262306a36Sopenharmony_ci	mmap_read_unlock(mm);
110362306a36Sopenharmony_ci
110462306a36Sopenharmony_ci	result = alloc_charge_hpage(&hpage, mm, cc);
110562306a36Sopenharmony_ci	if (result != SCAN_SUCCEED)
110662306a36Sopenharmony_ci		goto out_nolock;
110762306a36Sopenharmony_ci
110862306a36Sopenharmony_ci	mmap_read_lock(mm);
110962306a36Sopenharmony_ci	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
111062306a36Sopenharmony_ci	if (result != SCAN_SUCCEED) {
111162306a36Sopenharmony_ci		mmap_read_unlock(mm);
111262306a36Sopenharmony_ci		goto out_nolock;
111362306a36Sopenharmony_ci	}
111462306a36Sopenharmony_ci
111562306a36Sopenharmony_ci	result = find_pmd_or_thp_or_none(mm, address, &pmd);
111662306a36Sopenharmony_ci	if (result != SCAN_SUCCEED) {
111762306a36Sopenharmony_ci		mmap_read_unlock(mm);
111862306a36Sopenharmony_ci		goto out_nolock;
111962306a36Sopenharmony_ci	}
112062306a36Sopenharmony_ci
112162306a36Sopenharmony_ci	if (unmapped) {
112262306a36Sopenharmony_ci		/*
112362306a36Sopenharmony_ci		 * __collapse_huge_page_swapin will return with mmap_lock
112462306a36Sopenharmony_ci		 * released when it fails. So we jump out_nolock directly in
112562306a36Sopenharmony_ci		 * that case.  Continuing to collapse causes inconsistency.
112662306a36Sopenharmony_ci		 */
112762306a36Sopenharmony_ci		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
112862306a36Sopenharmony_ci						     referenced);
112962306a36Sopenharmony_ci		if (result != SCAN_SUCCEED)
113062306a36Sopenharmony_ci			goto out_nolock;
113162306a36Sopenharmony_ci	}
113262306a36Sopenharmony_ci
113362306a36Sopenharmony_ci	mmap_read_unlock(mm);
113462306a36Sopenharmony_ci	/*
113562306a36Sopenharmony_ci	 * Prevent all access to pagetables with the exception of
113662306a36Sopenharmony_ci	 * gup_fast later handled by the ptep_clear_flush and the VM
113762306a36Sopenharmony_ci	 * handled by the anon_vma lock + PG_lock.
113862306a36Sopenharmony_ci	 */
113962306a36Sopenharmony_ci	mmap_write_lock(mm);
114062306a36Sopenharmony_ci	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
114162306a36Sopenharmony_ci	if (result != SCAN_SUCCEED)
114262306a36Sopenharmony_ci		goto out_up_write;
114362306a36Sopenharmony_ci	/* check if the pmd is still valid */
114462306a36Sopenharmony_ci	result = check_pmd_still_valid(mm, address, pmd);
114562306a36Sopenharmony_ci	if (result != SCAN_SUCCEED)
114662306a36Sopenharmony_ci		goto out_up_write;
114762306a36Sopenharmony_ci
114862306a36Sopenharmony_ci	vma_start_write(vma);
114962306a36Sopenharmony_ci	anon_vma_lock_write(vma->anon_vma);
115062306a36Sopenharmony_ci
115162306a36Sopenharmony_ci	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
115262306a36Sopenharmony_ci				address + HPAGE_PMD_SIZE);
115362306a36Sopenharmony_ci	mmu_notifier_invalidate_range_start(&range);
115462306a36Sopenharmony_ci
115562306a36Sopenharmony_ci	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
115662306a36Sopenharmony_ci	/*
115762306a36Sopenharmony_ci	 * This removes any huge TLB entry from the CPU so we won't allow
115862306a36Sopenharmony_ci	 * huge and small TLB entries for the same virtual address to
115962306a36Sopenharmony_ci	 * avoid the risk of CPU bugs in that area.
116062306a36Sopenharmony_ci	 *
116162306a36Sopenharmony_ci	 * Parallel fast GUP is fine since fast GUP will back off when
116262306a36Sopenharmony_ci	 * it detects PMD is changed.
116362306a36Sopenharmony_ci	 */
116462306a36Sopenharmony_ci	_pmd = pmdp_collapse_flush(vma, address, pmd);
116562306a36Sopenharmony_ci	spin_unlock(pmd_ptl);
116662306a36Sopenharmony_ci	mmu_notifier_invalidate_range_end(&range);
116762306a36Sopenharmony_ci	tlb_remove_table_sync_one();
116862306a36Sopenharmony_ci
116962306a36Sopenharmony_ci	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
117062306a36Sopenharmony_ci	if (pte) {
117162306a36Sopenharmony_ci		result = __collapse_huge_page_isolate(vma, address, pte, cc,
117262306a36Sopenharmony_ci						      &compound_pagelist);
117362306a36Sopenharmony_ci		spin_unlock(pte_ptl);
117462306a36Sopenharmony_ci	} else {
117562306a36Sopenharmony_ci		result = SCAN_PMD_NULL;
117662306a36Sopenharmony_ci	}
117762306a36Sopenharmony_ci
117862306a36Sopenharmony_ci	if (unlikely(result != SCAN_SUCCEED)) {
117962306a36Sopenharmony_ci		if (pte)
118062306a36Sopenharmony_ci			pte_unmap(pte);
118162306a36Sopenharmony_ci		spin_lock(pmd_ptl);
118262306a36Sopenharmony_ci		BUG_ON(!pmd_none(*pmd));
118362306a36Sopenharmony_ci		/*
118462306a36Sopenharmony_ci		 * We can only use set_pmd_at when establishing
118562306a36Sopenharmony_ci		 * hugepmds and never for establishing regular pmds that
118662306a36Sopenharmony_ci		 * points to regular pagetables. Use pmd_populate for that
118762306a36Sopenharmony_ci		 */
118862306a36Sopenharmony_ci		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
118962306a36Sopenharmony_ci		spin_unlock(pmd_ptl);
119062306a36Sopenharmony_ci		anon_vma_unlock_write(vma->anon_vma);
119162306a36Sopenharmony_ci		goto out_up_write;
119262306a36Sopenharmony_ci	}
119362306a36Sopenharmony_ci
119462306a36Sopenharmony_ci	/*
119562306a36Sopenharmony_ci	 * All pages are isolated and locked so anon_vma rmap
119662306a36Sopenharmony_ci	 * can't run anymore.
119762306a36Sopenharmony_ci	 */
119862306a36Sopenharmony_ci	anon_vma_unlock_write(vma->anon_vma);
119962306a36Sopenharmony_ci
120062306a36Sopenharmony_ci	result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd,
120162306a36Sopenharmony_ci					   vma, address, pte_ptl,
120262306a36Sopenharmony_ci					   &compound_pagelist);
120362306a36Sopenharmony_ci	pte_unmap(pte);
120462306a36Sopenharmony_ci	if (unlikely(result != SCAN_SUCCEED))
120562306a36Sopenharmony_ci		goto out_up_write;
120662306a36Sopenharmony_ci
120762306a36Sopenharmony_ci	/*
120862306a36Sopenharmony_ci	 * spin_lock() below is not the equivalent of smp_wmb(), but
120962306a36Sopenharmony_ci	 * the smp_wmb() inside __SetPageUptodate() can be reused to
121062306a36Sopenharmony_ci	 * avoid the copy_huge_page writes to become visible after
121162306a36Sopenharmony_ci	 * the set_pmd_at() write.
121262306a36Sopenharmony_ci	 */
121362306a36Sopenharmony_ci	__SetPageUptodate(hpage);
121462306a36Sopenharmony_ci	pgtable = pmd_pgtable(_pmd);
121562306a36Sopenharmony_ci
121662306a36Sopenharmony_ci	_pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
121762306a36Sopenharmony_ci	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
121862306a36Sopenharmony_ci
121962306a36Sopenharmony_ci	spin_lock(pmd_ptl);
122062306a36Sopenharmony_ci	BUG_ON(!pmd_none(*pmd));
122162306a36Sopenharmony_ci	page_add_new_anon_rmap(hpage, vma, address);
122262306a36Sopenharmony_ci	lru_cache_add_inactive_or_unevictable(hpage, vma);
122362306a36Sopenharmony_ci	pgtable_trans_huge_deposit(mm, pmd, pgtable);
122462306a36Sopenharmony_ci	set_pmd_at(mm, address, pmd, _pmd);
122562306a36Sopenharmony_ci	update_mmu_cache_pmd(vma, address, pmd);
122662306a36Sopenharmony_ci	spin_unlock(pmd_ptl);
122762306a36Sopenharmony_ci
122862306a36Sopenharmony_ci	hpage = NULL;
122962306a36Sopenharmony_ci
123062306a36Sopenharmony_ci	result = SCAN_SUCCEED;
123162306a36Sopenharmony_ciout_up_write:
123262306a36Sopenharmony_ci	mmap_write_unlock(mm);
123362306a36Sopenharmony_ciout_nolock:
123462306a36Sopenharmony_ci	if (hpage)
123562306a36Sopenharmony_ci		put_page(hpage);
123662306a36Sopenharmony_ci	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
123762306a36Sopenharmony_ci	return result;
123862306a36Sopenharmony_ci}
123962306a36Sopenharmony_ci
/*
 * hpage_collapse_scan_pmd - scan one pmd-sized, pmd-aligned range of anon
 * memory and decide whether it is suitable for collapse into a huge page.
 *
 * @mm:          process address space being scanned
 * @vma:         vma containing @address
 * @address:     start of the range; must be HPAGE_PMD aligned (VM_BUG_ON)
 * @mmap_locked: cleared if collapse_huge_page() was invoked, since that
 *               returns with mmap_lock released
 * @cc:          collapse context/scratchpad (node loads, khugepaged limits)
 *
 * If every pte in the range passes the checks, collapse_huge_page() is
 * called and its result returned; otherwise the SCAN_* code describing
 * why the range was rejected is returned.
 */
static int hpage_collapse_scan_pmd(struct mm_struct *mm,
				   struct vm_area_struct *vma,
				   unsigned long address, bool *mmap_locked,
				   struct collapse_control *cc)
{
	pmd_t *pmd;
	pte_t *pte, *_pte;
	int result = SCAN_FAIL, referenced = 0;
	int none_or_zero = 0, shared = 0;
	struct page *page = NULL;
	unsigned long _address;
	spinlock_t *ptl;
	int node = NUMA_NO_NODE, unmapped = 0;
	bool writable = false;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	result = find_pmd_or_thp_or_none(mm, address, &pmd);
	if (result != SCAN_SUCCEED)
		goto out;

	/* Reset per-node hit counters and allocation mask for this scan */
	memset(cc->node_load, 0, sizeof(cc->node_load));
	nodes_clear(cc->alloc_nmask);
	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (!pte) {
		result = SCAN_PMD_NULL;
		goto out;
	}

	/* Walk every pte in the pmd range, classifying each entry */
	for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, _address += PAGE_SIZE) {
		pte_t pteval = ptep_get(_pte);
		if (is_swap_pte(pteval)) {
			++unmapped;
			if (!cc->is_khugepaged ||
			    unmapped <= khugepaged_max_ptes_swap) {
				/*
				 * Always be strict with uffd-wp
				 * enabled swap entries.  Please see
				 * comment below for pte_uffd_wp().
				 */
				if (pte_swp_uffd_wp_any(pteval)) {
					result = SCAN_PTE_UFFD_WP;
					goto out_unmap;
				}
				continue;
			} else {
				result = SCAN_EXCEED_SWAP_PTE;
				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
				goto out_unmap;
			}
		}
		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
			++none_or_zero;
			if (!userfaultfd_armed(vma) &&
			    (!cc->is_khugepaged ||
			     none_or_zero <= khugepaged_max_ptes_none)) {
				continue;
			} else {
				result = SCAN_EXCEED_NONE_PTE;
				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
				goto out_unmap;
			}
		}
		if (pte_uffd_wp(pteval)) {
			/*
			 * Don't collapse the page if any of the small
			 * PTEs are armed with uffd write protection.
			 * Here we can also mark the new huge pmd as
			 * write protected if any of the small ones is
			 * marked but that could bring unknown
			 * userfault messages that falls outside of
			 * the registered range.  So, just be simple.
			 */
			result = SCAN_PTE_UFFD_WP;
			goto out_unmap;
		}
		if (pte_write(pteval))
			writable = true;

		page = vm_normal_page(vma, _address, pteval);
		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
			result = SCAN_PAGE_NULL;
			goto out_unmap;
		}

		/* Bound the number of shared ptes when run from khugepaged */
		if (page_mapcount(page) > 1) {
			++shared;
			if (cc->is_khugepaged &&
			    shared > khugepaged_max_ptes_shared) {
				result = SCAN_EXCEED_SHARED_PTE;
				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
				goto out_unmap;
			}
		}

		page = compound_head(page);

		/*
		 * Record which node the original page is from and save this
		 * information to cc->node_load[].
		 * Khugepaged will allocate hugepage from the node has the max
		 * hit record.
		 */
		node = page_to_nid(page);
		if (hpage_collapse_scan_abort(node, cc)) {
			result = SCAN_SCAN_ABORT;
			goto out_unmap;
		}
		cc->node_load[node]++;
		if (!PageLRU(page)) {
			result = SCAN_PAGE_LRU;
			goto out_unmap;
		}
		if (PageLocked(page)) {
			result = SCAN_PAGE_LOCK;
			goto out_unmap;
		}
		if (!PageAnon(page)) {
			result = SCAN_PAGE_ANON;
			goto out_unmap;
		}

		/*
		 * Check if the page has any GUP (or other external) pins.
		 *
		 * Here the check may be racy:
		 * it may see total_mapcount > refcount in some cases?
		 * But such case is ephemeral we could always retry collapse
		 * later.  However it may report false positive if the page
		 * has excessive GUP pins (i.e. 512).  Anyway the same check
		 * will be done again later the risk seems low.
		 */
		if (!is_refcount_suitable(page)) {
			result = SCAN_PAGE_COUNT;
			goto out_unmap;
		}

		/*
		 * If collapse was initiated by khugepaged, check that there is
		 * enough young pte to justify collapsing the page
		 */
		if (cc->is_khugepaged &&
		    (pte_young(pteval) || page_is_young(page) ||
		     PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
								     address)))
			referenced++;
	}
	/*
	 * Collapse requires at least one writable pte; khugepaged
	 * additionally requires the range to be sufficiently referenced.
	 */
	if (!writable) {
		result = SCAN_PAGE_RO;
	} else if (cc->is_khugepaged &&
		   (!referenced ||
		    (unmapped && referenced < HPAGE_PMD_NR / 2))) {
		result = SCAN_LACK_REFERENCED_PAGE;
	} else {
		result = SCAN_SUCCEED;
	}
out_unmap:
	pte_unmap_unlock(pte, ptl);
	if (result == SCAN_SUCCEED) {
		result = collapse_huge_page(mm, address, referenced,
					    unmapped, cc);
		/* collapse_huge_page will return with the mmap_lock released */
		*mmap_locked = false;
	}
out:
	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
				     none_or_zero, result, unmapped);
	return result;
}
141062306a36Sopenharmony_ci
/*
 * collect_mm_slot - free @mm_slot if its mm has already exited.
 *
 * Caller must hold khugepaged_mm_lock (asserted below).  When the mm has
 * exited, the slot is unhashed, unlinked from the scan list and freed,
 * and the mm reference is dropped via mmdrop().  Otherwise this is a
 * no-op.
 */
static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
{
	struct mm_slot *slot = &mm_slot->slot;
	struct mm_struct *mm = slot->mm;

	lockdep_assert_held(&khugepaged_mm_lock);

	if (hpage_collapse_test_exit(mm)) {
		/* free mm_slot */
		hash_del(&slot->hash);
		list_del(&slot->mm_node);

		/*
		 * Not strictly needed because the mm exited already.
		 *
		 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		 */

		/* khugepaged_mm_lock actually not necessary for the below */
		mm_slot_free(mm_slot_cache, mm_slot);
		mmdrop(mm);
	}
}
143462306a36Sopenharmony_ci
143562306a36Sopenharmony_ci#ifdef CONFIG_SHMEM
143662306a36Sopenharmony_ci/* hpage must be locked, and mmap_lock must be held */
143762306a36Sopenharmony_cistatic int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
143862306a36Sopenharmony_ci			pmd_t *pmdp, struct page *hpage)
143962306a36Sopenharmony_ci{
144062306a36Sopenharmony_ci	struct vm_fault vmf = {
144162306a36Sopenharmony_ci		.vma = vma,
144262306a36Sopenharmony_ci		.address = addr,
144362306a36Sopenharmony_ci		.flags = 0,
144462306a36Sopenharmony_ci		.pmd = pmdp,
144562306a36Sopenharmony_ci	};
144662306a36Sopenharmony_ci
144762306a36Sopenharmony_ci	VM_BUG_ON(!PageTransHuge(hpage));
144862306a36Sopenharmony_ci	mmap_assert_locked(vma->vm_mm);
144962306a36Sopenharmony_ci
145062306a36Sopenharmony_ci	if (do_set_pmd(&vmf, hpage))
145162306a36Sopenharmony_ci		return SCAN_FAIL;
145262306a36Sopenharmony_ci
145362306a36Sopenharmony_ci	get_page(hpage);
145462306a36Sopenharmony_ci	return SCAN_SUCCEED;
145562306a36Sopenharmony_ci}
145662306a36Sopenharmony_ci
/**
 * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
 * address haddr.
 *
 * @mm: process address space where collapse happens
 * @addr: THP collapse address
 * @install_pmd: If a huge PMD should be installed
 *
 * This function checks whether all the PTEs in the PMD are pointing to the
 * right THP. If so, retract the page table so the THP can refault in with
 * as pmd-mapped. Possibly install a huge PMD mapping the THP.
 *
 * Return: SCAN_SUCCEED on success, or another SCAN_* code describing why
 * the collapse was not performed.
 */
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
			    bool install_pmd)
{
	struct mmu_notifier_range range;
	bool notified = false;
	unsigned long haddr = addr & HPAGE_PMD_MASK;
	struct vm_area_struct *vma = vma_lookup(mm, haddr);
	struct page *hpage;
	pte_t *start_pte, *pte;
	pmd_t *pmd, pgt_pmd;
	spinlock_t *pml = NULL, *ptl;
	int nr_ptes = 0, result = SCAN_FAIL;
	int i;

	mmap_assert_locked(mm);

	/* First check VMA found, in case page tables are being torn down */
	if (!vma || !vma->vm_file ||
	    !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
		return SCAN_VMA_CHECK;

	/* Fast check before locking page if already PMD-mapped */
	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
	if (result == SCAN_PMD_MAPPED)
		return result;

	/*
	 * If we are here, we've succeeded in replacing all the native pages
	 * in the page cache with a single hugepage. If a mm were to fault-in
	 * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
	 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
	 * analogously elide sysfs THP settings here.
	 */
	if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
		return SCAN_VMA_CHECK;

	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
	if (userfaultfd_wp(vma))
		return SCAN_PTE_UFFD_WP;

	hpage = find_lock_page(vma->vm_file->f_mapping,
			       linear_page_index(vma, haddr));
	if (!hpage)
		return SCAN_PAGE_NULL;

	/* The page cache entry must be the head page of a PMD-order THP */
	if (!PageHead(hpage)) {
		result = SCAN_FAIL;
		goto drop_hpage;
	}

	if (compound_order(hpage) != HPAGE_PMD_ORDER) {
		result = SCAN_PAGE_COMPOUND;
		goto drop_hpage;
	}

	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
	switch (result) {
	case SCAN_SUCCEED:
		break;
	case SCAN_PMD_NONE:
		/*
		 * All pte entries have been removed and pmd cleared.
		 * Skip all the pte checks and just update the pmd mapping.
		 */
		goto maybe_install_pmd;
	default:
		goto drop_hpage;
	}

	result = SCAN_FAIL;
	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
	if (!start_pte)		/* mmap_lock + page lock should prevent this */
		goto drop_hpage;

	/* step 1: check all mapped PTEs are to the right huge page */
	for (i = 0, addr = haddr, pte = start_pte;
	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
		struct page *page;
		pte_t ptent = ptep_get(pte);

		/* empty pte, skip */
		if (pte_none(ptent))
			continue;

		/* page swapped out, abort */
		if (!pte_present(ptent)) {
			result = SCAN_PTE_NON_PRESENT;
			goto abort;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
			page = NULL;
		/*
		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
		 * page table, but the new page will not be a subpage of hpage.
		 */
		if (hpage + i != page)
			goto abort;
	}

	pte_unmap_unlock(start_pte, ptl);
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				haddr, haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	notified = true;

	/*
	 * pmd_lock covers a wider range than ptl, and (if split from mm's
	 * page_table_lock) ptl nests inside pml. The less time we hold pml,
	 * the better; but userfaultfd's mfill_atomic_pte() on a private VMA
	 * inserts a valid as-if-COWed PTE without even looking up page cache.
	 * So page lock of hpage does not protect from it, so we must not drop
	 * ptl before pgt_pmd is removed, so uffd private needs pml taken now.
	 */
	if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
		pml = pmd_lock(mm, pmd);

	start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
	if (!start_pte)		/* mmap_lock + page lock should prevent this */
		goto abort;
	if (!pml)
		spin_lock(ptl);
	else if (ptl != pml)
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

	/* step 2: clear page table and adjust rmap */
	for (i = 0, addr = haddr, pte = start_pte;
	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
		struct page *page;
		pte_t ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;
		/*
		 * We dropped ptl after the first scan, to do the mmu_notifier:
		 * page lock stops more PTEs of the hpage being faulted in, but
		 * does not stop write faults COWing anon copies from existing
		 * PTEs; and does not stop those being swapped out or migrated.
		 */
		if (!pte_present(ptent)) {
			result = SCAN_PTE_NON_PRESENT;
			goto abort;
		}
		page = vm_normal_page(vma, addr, ptent);
		if (hpage + i != page)
			goto abort;

		/*
		 * Must clear entry, or a racing truncate may re-remove it.
		 * TLB flush can be left until pmdp_collapse_flush() does it.
		 * PTE dirty? Shmem page is already dirty; file is read-only.
		 */
		ptep_clear(mm, addr, pte);
		page_remove_rmap(page, vma, false);
		nr_ptes++;
	}

	pte_unmap(start_pte);
	if (!pml)
		spin_unlock(ptl);

	/* step 3: set proper refcount and mm_counters. */
	if (nr_ptes) {
		page_ref_sub(hpage, nr_ptes);
		add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
	}

	/* step 4: remove empty page table */
	if (!pml) {
		pml = pmd_lock(mm, pmd);
		if (ptl != pml)
			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
	}
	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
	pmdp_get_lockless_sync();
	if (ptl != pml)
		spin_unlock(ptl);
	spin_unlock(pml);

	mmu_notifier_invalidate_range_end(&range);

	mm_dec_nr_ptes(mm);
	page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
	pte_free_defer(mm, pmd_pgtable(pgt_pmd));

maybe_install_pmd:
	/* step 5: install pmd entry */
	result = install_pmd
			? set_huge_pmd(vma, haddr, pmd, hpage)
			: SCAN_SUCCEED;
	goto drop_hpage;
abort:
	/*
	 * Failed part-way: flush TLB entries for any ptes cleared in step 2
	 * and apply the same refcount/mm_counter adjustments as step 3.
	 */
	if (nr_ptes) {
		flush_tlb_mm(mm);
		page_ref_sub(hpage, nr_ptes);
		add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
	}
	if (start_pte)
		pte_unmap_unlock(start_pte, ptl);
	if (pml && pml != ptl)
		spin_unlock(pml);
	if (notified)
		mmu_notifier_invalidate_range_end(&range);
drop_hpage:
	unlock_page(hpage);
	put_page(hpage);
	return result;
}
167862306a36Sopenharmony_ci
/*
 * retract_page_tables - for each suitably aligned, file-backed vma mapping
 * @pgoff in @mapping, clear the pmd and free the expected-empty page table.
 *
 * Runs under i_mmap read lock only, without mmap_lock; vmas with an
 * anon_vma or with uffd-wp armed are skipped (see the comments inline
 * for why, and for the re-checks done under the page table locks).
 */
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
	struct vm_area_struct *vma;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
		struct mmu_notifier_range range;
		struct mm_struct *mm;
		unsigned long addr;
		pmd_t *pmd, pgt_pmd;
		spinlock_t *pml;
		spinlock_t *ptl;
		bool skipped_uffd = false;

		/*
		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
		 * got written to. These VMAs are likely not worth removing
		 * page tables from, as PMD-mapping is likely to be split later.
		 */
		if (READ_ONCE(vma->anon_vma))
			continue;

		/* Range must be pmd-aligned and fully inside the vma */
		addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
		if (addr & ~HPAGE_PMD_MASK ||
		    vma->vm_end < addr + HPAGE_PMD_SIZE)
			continue;

		mm = vma->vm_mm;
		if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
			continue;

		if (hpage_collapse_test_exit(mm))
			continue;
		/*
		 * When a vma is registered with uffd-wp, we cannot recycle
		 * the page table because there may be pte markers installed.
		 * Other vmas can still have the same file mapped hugely, but
		 * skip this one: it will always be mapped in small page size
		 * for uffd-wp registered ranges.
		 */
		if (userfaultfd_wp(vma))
			continue;

		/* PTEs were notified when unmapped; but now for the PMD? */
		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
					addr, addr + HPAGE_PMD_SIZE);
		mmu_notifier_invalidate_range_start(&range);

		pml = pmd_lock(mm, pmd);
		ptl = pte_lockptr(mm, pmd);
		if (ptl != pml)
			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

		/*
		 * Huge page lock is still held, so normally the page table
		 * must remain empty; and we have already skipped anon_vma
		 * and userfaultfd_wp() vmas.  But since the mmap_lock is not
		 * held, it is still possible for a racing userfaultfd_ioctl()
		 * to have inserted ptes or markers.  Now that we hold ptlock,
		 * repeating the anon_vma check protects from one category,
		 * and repeating the userfaultfd_wp() check from another.
		 */
		if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
			skipped_uffd = true;
		} else {
			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
			pmdp_get_lockless_sync();
		}

		if (ptl != pml)
			spin_unlock(ptl);
		spin_unlock(pml);

		mmu_notifier_invalidate_range_end(&range);

		/* Only free the page table if we actually detached it above */
		if (!skipped_uffd) {
			mm_dec_nr_ptes(mm);
			page_table_check_pte_clear_range(mm, addr, pgt_pmd);
			pte_free_defer(mm, pmd_pgtable(pgt_pmd));
		}
	}
	i_mmap_unlock_read(mapping);
}
176262306a36Sopenharmony_ci
176362306a36Sopenharmony_ci/**
176462306a36Sopenharmony_ci * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
176562306a36Sopenharmony_ci *
176662306a36Sopenharmony_ci * @mm: process address space where collapse happens
176762306a36Sopenharmony_ci * @addr: virtual collapse start address
176862306a36Sopenharmony_ci * @file: file that collapse on
176962306a36Sopenharmony_ci * @start: collapse start address
177062306a36Sopenharmony_ci * @cc: collapse context and scratchpad
177162306a36Sopenharmony_ci *
177262306a36Sopenharmony_ci * Basic scheme is simple, details are more complex:
177362306a36Sopenharmony_ci *  - allocate and lock a new huge page;
177462306a36Sopenharmony_ci *  - scan page cache, locking old pages
177562306a36Sopenharmony_ci *    + swap/gup in pages if necessary;
177662306a36Sopenharmony_ci *  - copy data to new page
177762306a36Sopenharmony_ci *  - handle shmem holes
177862306a36Sopenharmony_ci *    + re-validate that holes weren't filled by someone else
177962306a36Sopenharmony_ci *    + check for userfaultfd
178062306a36Sopenharmony_ci *  - finalize updates to the page cache;
178162306a36Sopenharmony_ci *  - if replacing succeeds:
178262306a36Sopenharmony_ci *    + unlock huge page;
178362306a36Sopenharmony_ci *    + free old pages;
178462306a36Sopenharmony_ci *  - if replacing failed;
178562306a36Sopenharmony_ci *    + unlock old pages
178662306a36Sopenharmony_ci *    + unlock and free huge page;
178762306a36Sopenharmony_ci */
static int collapse_file(struct mm_struct *mm, unsigned long addr,
			 struct file *file, pgoff_t start,
			 struct collapse_control *cc)
{
	struct address_space *mapping = file->f_mapping;
	struct page *hpage;
	struct page *page;
	struct page *tmp;
	struct folio *folio;
	pgoff_t index = 0, end = start + HPAGE_PMD_NR;
	LIST_HEAD(pagelist);		/* small pages isolated for collapse */
	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
	int nr_none = 0, result = SCAN_SUCCEED;	/* nr_none: holes seen (shmem only) */
	bool is_shmem = shmem_file(file);
	int nr = 0;

	/* Non-shmem files are only collapsed with CONFIG_READ_ONLY_THP_FOR_FS. */
	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));

	/* Allocate the huge page and charge it to the memcg up front. */
	result = alloc_charge_hpage(&hpage, mm, cc);
	if (result != SCAN_SUCCEED)
		goto out;

	/*
	 * Keep hpage locked (and swap-backed for shmem) while its contents
	 * are assembled; index/mapping are set early so rmap-style checks
	 * against it behave sanely before it is inserted into the cache.
	 */
	__SetPageLocked(hpage);
	if (is_shmem)
		__SetPageSwapBacked(hpage);
	hpage->index = start;
	hpage->mapping = mapping;

	/*
	 * Ensure we have slots for all the pages in the range.  This is
	 * almost certainly a no-op because most of the pages must be present
	 */
	do {
		xas_lock_irq(&xas);
		xas_create_range(&xas);
		if (!xas_error(&xas))
			break;
		xas_unlock_irq(&xas);
		/* Retry with memory preallocated outside the lock. */
		if (!xas_nomem(&xas, GFP_KERNEL)) {
			result = SCAN_FAIL;
			goto rollback;
		}
	} while (1);

	/*
	 * Walk the range: lock, isolate and unmap each existing page,
	 * swapping in (shmem) or reading ahead (file) where necessary.
	 * Entered with xas locked; each iteration either keeps or retakes
	 * the lock before the refcount check at the bottom.
	 */
	for (index = start; index < end; index++) {
		xas_set(&xas, index);
		page = xas_load(&xas);

		VM_BUG_ON(index != xas.xa_index);
		if (is_shmem) {
			if (!page) {
				/*
				 * Stop if extent has been truncated or
				 * hole-punched, and is now completely
				 * empty.
				 */
				if (index == start) {
					if (!xas_next_entry(&xas, end - 1)) {
						result = SCAN_TRUNCATED;
						goto xa_locked;
					}
				}
				nr_none++;
				continue;
			}

			if (xa_is_value(page) || !PageUptodate(page)) {
				xas_unlock_irq(&xas);
				/* swap in or instantiate fallocated page */
				if (shmem_get_folio(mapping->host, index,
						&folio, SGP_NOALLOC)) {
					result = SCAN_FAIL;
					goto xa_unlocked;
				}
				/* drain lru cache to help isolate_lru_page() */
				lru_add_drain();
				page = folio_file_page(folio, index);
			} else if (trylock_page(page)) {
				get_page(page);
				xas_unlock_irq(&xas);
			} else {
				/* Page locked by someone else: back off. */
				result = SCAN_PAGE_LOCK;
				goto xa_locked;
			}
		} else {	/* !is_shmem */
			if (!page || xa_is_value(page)) {
				xas_unlock_irq(&xas);
				page_cache_sync_readahead(mapping, &file->f_ra,
							  file, index,
							  end - index);
				/* drain lru cache to help isolate_lru_page() */
				lru_add_drain();
				page = find_lock_page(mapping, index);
				if (unlikely(page == NULL)) {
					result = SCAN_FAIL;
					goto xa_unlocked;
				}
			} else if (PageDirty(page)) {
				/*
				 * khugepaged only works on read-only fd,
				 * so this page is dirty because it hasn't
				 * been flushed since first write. There
				 * won't be new dirty pages.
				 *
				 * Trigger async flush here and hope the
				 * writeback is done when khugepaged
				 * revisits this page.
				 *
				 * This is a one-off situation. We are not
				 * forcing writeback in loop.
				 */
				xas_unlock_irq(&xas);
				filemap_flush(mapping);
				result = SCAN_FAIL;
				goto xa_unlocked;
			} else if (PageWriteback(page)) {
				/* Writeback in flight: retry on a later scan. */
				xas_unlock_irq(&xas);
				result = SCAN_FAIL;
				goto xa_unlocked;
			} else if (trylock_page(page)) {
				get_page(page);
				xas_unlock_irq(&xas);
			} else {
				result = SCAN_PAGE_LOCK;
				goto xa_locked;
			}
		}

		/*
		 * The page must be locked, so we can drop the i_pages lock
		 * without racing with truncate.
		 */
		VM_BUG_ON_PAGE(!PageLocked(page), page);

		/* make sure the page is up to date */
		if (unlikely(!PageUptodate(page))) {
			result = SCAN_FAIL;
			goto out_unlock;
		}

		/*
		 * If file was truncated then extended, or hole-punched, before
		 * we locked the first page, then a THP might be there already.
		 * This will be discovered on the first iteration.
		 */
		if (PageTransCompound(page)) {
			struct page *head = compound_head(page);

			result = compound_order(head) == HPAGE_PMD_ORDER &&
					head->index == start
					/* Maybe PMD-mapped */
					? SCAN_PTE_MAPPED_HUGEPAGE
					: SCAN_PAGE_COMPOUND;
			goto out_unlock;
		}

		folio = page_folio(page);

		/* Re-check against truncation now that the page is locked. */
		if (folio_mapping(folio) != mapping) {
			result = SCAN_TRUNCATED;
			goto out_unlock;
		}

		if (!is_shmem && (folio_test_dirty(folio) ||
				  folio_test_writeback(folio))) {
			/*
			 * khugepaged only works on read-only fd, so this
			 * page is dirty because it hasn't been flushed
			 * since first write.
			 */
			result = SCAN_FAIL;
			goto out_unlock;
		}

		/* Take the page off the LRU so reclaim can't touch it. */
		if (!folio_isolate_lru(folio)) {
			result = SCAN_DEL_PAGE_LRU;
			goto out_unlock;
		}

		/* Drop fs-private state (buffers etc.) before replacing. */
		if (!filemap_release_folio(folio, GFP_KERNEL)) {
			result = SCAN_PAGE_HAS_PRIVATE;
			folio_putback_lru(folio);
			goto out_unlock;
		}

		/* TLB flush is batched; flushed below via try_to_unmap_flush(). */
		if (folio_mapped(folio))
			try_to_unmap(folio,
					TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);

		xas_lock_irq(&xas);

		VM_BUG_ON_PAGE(page != xa_load(xas.xa, index), page);

		/*
		 * We control three references to the page:
		 *  - we hold a pin on it;
		 *  - one reference from page cache;
		 *  - one from isolate_lru_page;
		 * If those are the only references, then any new usage of the
		 * page will have to fetch it from the page cache. That requires
		 * locking the page to handle truncate, so any new usage will be
		 * blocked until we unlock page after collapse/during rollback.
		 */
		if (page_count(page) != 3) {
			result = SCAN_PAGE_COUNT;
			xas_unlock_irq(&xas);
			putback_lru_page(page);
			goto out_unlock;
		}

		/*
		 * Accumulate the pages that are being collapsed.
		 */
		list_add_tail(&page->lru, &pagelist);
		continue;
out_unlock:
		unlock_page(page);
		put_page(page);
		goto xa_unlocked;
	}

	if (!is_shmem) {
		filemap_nr_thps_inc(mapping);
		/*
		 * Paired with smp_mb() in do_dentry_open() to ensure
		 * i_writecount is up to date and the update to nr_thps is
		 * visible. Ensures the page cache will be truncated if the
		 * file is opened writable.
		 */
		smp_mb();
		if (inode_is_open_for_write(mapping->host)) {
			result = SCAN_FAIL;
			filemap_nr_thps_dec(mapping);
		}
	}

xa_locked:
	xas_unlock_irq(&xas);
xa_unlocked:

	/*
	 * If collapse is successful, flush must be done now before copying.
	 * If collapse is unsuccessful, does flush actually need to be done?
	 * Do it anyway, to clear the state.
	 */
	try_to_unmap_flush();

	/* Charge the holes to shmem; on failure the holes were never ours. */
	if (result == SCAN_SUCCEED && nr_none &&
	    !shmem_charge(mapping->host, nr_none))
		result = SCAN_FAIL;
	if (result != SCAN_SUCCEED) {
		nr_none = 0;
		goto rollback;
	}

	/*
	 * The old pages are locked, so they won't change anymore.
	 */
	index = start;
	list_for_each_entry(page, &pagelist, lru) {
		/* Zero-fill any holes preceding this page in the range. */
		while (index < page->index) {
			clear_highpage(hpage + (index % HPAGE_PMD_NR));
			index++;
		}
		/* Machine-check-safe copy: bail out on poisoned source. */
		if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page) > 0) {
			result = SCAN_COPY_MC;
			goto rollback;
		}
		index++;
	}
	/* Zero-fill trailing holes after the last present page. */
	while (index < end) {
		clear_highpage(hpage + (index % HPAGE_PMD_NR));
		index++;
	}

	if (nr_none) {
		struct vm_area_struct *vma;
		int nr_none_check = 0;

		/* Lock order: i_mmap_rwsem (read) outside the xarray lock. */
		i_mmap_lock_read(mapping);
		xas_lock_irq(&xas);

		/*
		 * Plug every hole with a RETRY entry so concurrent lookups
		 * block, and count them to re-validate nr_none below.
		 */
		xas_set(&xas, start);
		for (index = start; index < end; index++) {
			if (!xas_next(&xas)) {
				xas_store(&xas, XA_RETRY_ENTRY);
				if (xas_error(&xas)) {
					result = SCAN_STORE_FAILED;
					goto immap_locked;
				}
				nr_none_check++;
			}
		}

		/* Someone filled a hole since we dropped the lock. */
		if (nr_none != nr_none_check) {
			result = SCAN_PAGE_FILLED;
			goto immap_locked;
		}

		/*
		 * If userspace observed a missing page in a VMA with a MODE_MISSING
		 * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that
		 * page. If so, we need to roll back to avoid suppressing such an
		 * event. Since wp/minor userfaultfds don't give userspace any
		 * guarantees that the kernel doesn't fill a missing page with a zero
		 * page, so they don't matter here.
		 *
		 * Any userfaultfds registered after this point will not be able to
		 * observe any missing pages due to the previously inserted retry
		 * entries.
		 */
		vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
			if (userfaultfd_missing(vma)) {
				result = SCAN_EXCEED_NONE_PTE;
				goto immap_locked;
			}
		}

immap_locked:
		i_mmap_unlock_read(mapping);
		if (result != SCAN_SUCCEED) {
			/* Remove our RETRY entries again before rolling back. */
			xas_set(&xas, start);
			for (index = start; index < end; index++) {
				if (xas_next(&xas) == XA_RETRY_ENTRY)
					xas_store(&xas, NULL);
			}

			xas_unlock_irq(&xas);
			goto rollback;
		}
	} else {
		xas_lock_irq(&xas);
	}

	/*
	 * Success is now certain: account the THP and (for holes) the new
	 * file pages against hpage's node/memcg.
	 */
	nr = thp_nr_pages(hpage);
	if (is_shmem)
		__mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
	else
		__mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);

	if (nr_none) {
		__mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
		/* nr_none is always 0 for non-shmem. */
		__mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
	}

	/*
	 * Mark hpage as uptodate before inserting it into the page cache so
	 * that it isn't mistaken for an fallocated but unwritten page.
	 */
	folio = page_folio(hpage);
	folio_mark_uptodate(folio);
	/* One page-cache ref per small slot the multi-index entry covers. */
	folio_ref_add(folio, HPAGE_PMD_NR - 1);

	if (is_shmem)
		folio_mark_dirty(folio);
	folio_add_lru(folio);

	/* Join all the small entries into a single multi-index entry. */
	xas_set_order(&xas, start, HPAGE_PMD_ORDER);
	xas_store(&xas, hpage);
	/* Slots were pre-created above, so the store cannot fail. */
	WARN_ON_ONCE(xas_error(&xas));
	xas_unlock_irq(&xas);

	/*
	 * Remove pte page tables, so we can re-fault the page as huge.
	 * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
	 */
	retract_page_tables(mapping, start);
	if (cc && !cc->is_khugepaged)
		result = SCAN_PTE_MAPPED_HUGEPAGE;
	unlock_page(hpage);

	/*
	 * The collapse has succeeded, so free the old pages.
	 */
	list_for_each_entry_safe(page, tmp, &pagelist, lru) {
		list_del(&page->lru);
		page->mapping = NULL;
		ClearPageActive(page);
		ClearPageUnevictable(page);
		unlock_page(page);
		/* Drop our pin, the page-cache ref and the isolation ref. */
		folio_put_refs(page_folio(page), 3);
	}

	goto out;

rollback:
	/* Something went wrong: roll back page cache changes */
	if (nr_none) {
		xas_lock_irq(&xas);
		mapping->nrpages -= nr_none;
		xas_unlock_irq(&xas);
		shmem_uncharge(mapping->host, nr_none);
	}

	/* Put every isolated small page back where it was. */
	list_for_each_entry_safe(page, tmp, &pagelist, lru) {
		list_del(&page->lru);
		unlock_page(page);
		putback_lru_page(page);
		put_page(page);
	}
	/*
	 * Undo the updates of filemap_nr_thps_inc for non-SHMEM
	 * file only. This undo is not needed unless failure is
	 * due to SCAN_COPY_MC.
	 */
	if (!is_shmem && result == SCAN_COPY_MC) {
		filemap_nr_thps_dec(mapping);
		/*
		 * Paired with smp_mb() in do_dentry_open() to
		 * ensure the update to nr_thps is visible.
		 */
		smp_mb();
	}

	hpage->mapping = NULL;

	unlock_page(hpage);
	put_page(hpage);
out:
	VM_BUG_ON(!list_empty(&pagelist));
	trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
	return result;
}
221462306a36Sopenharmony_ci
/*
 * Scan one PMD-sized window of a file's page cache to decide whether it is
 * worth calling collapse_file() on it.  Read-only walk under RCU; returns a
 * SCAN_* result and, on SCAN_SUCCEED, the result of the actual collapse.
 */
static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
				    struct file *file, pgoff_t start,
				    struct collapse_control *cc)
{
	struct page *page = NULL;
	struct address_space *mapping = file->f_mapping;
	XA_STATE(xas, &mapping->i_pages, start);
	int present, swap;	/* counts of present pages / swap entries seen */
	int node = NUMA_NO_NODE;
	int result = SCAN_SUCCEED;

	present = 0;
	swap = 0;
	/* Reset per-scan NUMA bookkeeping used by hpage_collapse_scan_abort(). */
	memset(cc->node_load, 0, sizeof(cc->node_load));
	nodes_clear(cc->alloc_nmask);
	rcu_read_lock();
	xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
		if (xas_retry(&xas, page))
			continue;

		/* Value entry == swapped-out page; only tolerated up to a limit. */
		if (xa_is_value(page)) {
			++swap;
			if (cc->is_khugepaged &&
			    swap > khugepaged_max_ptes_swap) {
				result = SCAN_EXCEED_SWAP_PTE;
				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
				break;
			}
			continue;
		}

		/*
		 * TODO: khugepaged should compact smaller compound pages
		 * into a PMD sized page
		 */
		if (PageTransCompound(page)) {
			struct page *head = compound_head(page);

			result = compound_order(head) == HPAGE_PMD_ORDER &&
					head->index == start
					/* Maybe PMD-mapped */
					? SCAN_PTE_MAPPED_HUGEPAGE
					: SCAN_PAGE_COMPOUND;
			/*
			 * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
			 * by the caller won't touch the page cache, and so
			 * it's safe to skip LRU and refcount checks before
			 * returning.
			 */
			break;
		}

		node = page_to_nid(page);
		if (hpage_collapse_scan_abort(node, cc)) {
			result = SCAN_SCAN_ABORT;
			break;
		}
		cc->node_load[node]++;

		if (!PageLRU(page)) {
			result = SCAN_PAGE_LRU;
			break;
		}

		/* Any extra reference beyond cache + maps + private: bail out. */
		if (page_count(page) !=
		    1 + page_mapcount(page) + page_has_private(page)) {
			result = SCAN_PAGE_COUNT;
			break;
		}

		/*
		 * We probably should check if the page is referenced here, but
		 * nobody would transfer pte_young() to PageReferenced() for us.
		 * And rmap walk here is just too costly...
		 */

		present++;

		/* Drop out of the RCU walk periodically to stay preemptible. */
		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();

	if (result == SCAN_SUCCEED) {
		/* Too many holes for khugepaged's taste? Otherwise collapse. */
		if (cc->is_khugepaged &&
		    present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
			result = SCAN_EXCEED_NONE_PTE;
			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
		} else {
			result = collapse_file(mm, addr, file, start, cc);
		}
	}

	trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result);
	return result;
}
231362306a36Sopenharmony_ci#else
/*
 * Stub for configurations where file/shmem collapse is compiled out
 * (the matching #ifdef is above this view).  Must never be reachable:
 * BUILD_BUG() turns any call into a compile-time failure.
 */
static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
				    struct file *file, pgoff_t start,
				    struct collapse_control *cc)
{
	BUILD_BUG();
}
232062306a36Sopenharmony_ci#endif
232162306a36Sopenharmony_ci
232262306a36Sopenharmony_cistatic unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
232362306a36Sopenharmony_ci					    struct collapse_control *cc)
232462306a36Sopenharmony_ci	__releases(&khugepaged_mm_lock)
232562306a36Sopenharmony_ci	__acquires(&khugepaged_mm_lock)
232662306a36Sopenharmony_ci{
232762306a36Sopenharmony_ci	struct vma_iterator vmi;
232862306a36Sopenharmony_ci	struct khugepaged_mm_slot *mm_slot;
232962306a36Sopenharmony_ci	struct mm_slot *slot;
233062306a36Sopenharmony_ci	struct mm_struct *mm;
233162306a36Sopenharmony_ci	struct vm_area_struct *vma;
233262306a36Sopenharmony_ci	int progress = 0;
233362306a36Sopenharmony_ci
233462306a36Sopenharmony_ci	VM_BUG_ON(!pages);
233562306a36Sopenharmony_ci	lockdep_assert_held(&khugepaged_mm_lock);
233662306a36Sopenharmony_ci	*result = SCAN_FAIL;
233762306a36Sopenharmony_ci
233862306a36Sopenharmony_ci	if (khugepaged_scan.mm_slot) {
233962306a36Sopenharmony_ci		mm_slot = khugepaged_scan.mm_slot;
234062306a36Sopenharmony_ci		slot = &mm_slot->slot;
234162306a36Sopenharmony_ci	} else {
234262306a36Sopenharmony_ci		slot = list_entry(khugepaged_scan.mm_head.next,
234362306a36Sopenharmony_ci				     struct mm_slot, mm_node);
234462306a36Sopenharmony_ci		mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
234562306a36Sopenharmony_ci		khugepaged_scan.address = 0;
234662306a36Sopenharmony_ci		khugepaged_scan.mm_slot = mm_slot;
234762306a36Sopenharmony_ci	}
234862306a36Sopenharmony_ci	spin_unlock(&khugepaged_mm_lock);
234962306a36Sopenharmony_ci
235062306a36Sopenharmony_ci	mm = slot->mm;
235162306a36Sopenharmony_ci	/*
235262306a36Sopenharmony_ci	 * Don't wait for semaphore (to avoid long wait times).  Just move to
235362306a36Sopenharmony_ci	 * the next mm on the list.
235462306a36Sopenharmony_ci	 */
235562306a36Sopenharmony_ci	vma = NULL;
235662306a36Sopenharmony_ci	if (unlikely(!mmap_read_trylock(mm)))
235762306a36Sopenharmony_ci		goto breakouterloop_mmap_lock;
235862306a36Sopenharmony_ci
235962306a36Sopenharmony_ci	progress++;
236062306a36Sopenharmony_ci	if (unlikely(hpage_collapse_test_exit(mm)))
236162306a36Sopenharmony_ci		goto breakouterloop;
236262306a36Sopenharmony_ci
236362306a36Sopenharmony_ci	vma_iter_init(&vmi, mm, khugepaged_scan.address);
236462306a36Sopenharmony_ci	for_each_vma(vmi, vma) {
236562306a36Sopenharmony_ci		unsigned long hstart, hend;
236662306a36Sopenharmony_ci
236762306a36Sopenharmony_ci		cond_resched();
236862306a36Sopenharmony_ci		if (unlikely(hpage_collapse_test_exit(mm))) {
236962306a36Sopenharmony_ci			progress++;
237062306a36Sopenharmony_ci			break;
237162306a36Sopenharmony_ci		}
237262306a36Sopenharmony_ci		if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
237362306a36Sopenharmony_ciskip:
237462306a36Sopenharmony_ci			progress++;
237562306a36Sopenharmony_ci			continue;
237662306a36Sopenharmony_ci		}
237762306a36Sopenharmony_ci		hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
237862306a36Sopenharmony_ci		hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
237962306a36Sopenharmony_ci		if (khugepaged_scan.address > hend)
238062306a36Sopenharmony_ci			goto skip;
238162306a36Sopenharmony_ci		if (khugepaged_scan.address < hstart)
238262306a36Sopenharmony_ci			khugepaged_scan.address = hstart;
238362306a36Sopenharmony_ci		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
238462306a36Sopenharmony_ci
238562306a36Sopenharmony_ci		while (khugepaged_scan.address < hend) {
238662306a36Sopenharmony_ci			bool mmap_locked = true;
238762306a36Sopenharmony_ci
238862306a36Sopenharmony_ci			cond_resched();
238962306a36Sopenharmony_ci			if (unlikely(hpage_collapse_test_exit(mm)))
239062306a36Sopenharmony_ci				goto breakouterloop;
239162306a36Sopenharmony_ci
239262306a36Sopenharmony_ci			VM_BUG_ON(khugepaged_scan.address < hstart ||
239362306a36Sopenharmony_ci				  khugepaged_scan.address + HPAGE_PMD_SIZE >
239462306a36Sopenharmony_ci				  hend);
239562306a36Sopenharmony_ci			if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
239662306a36Sopenharmony_ci				struct file *file = get_file(vma->vm_file);
239762306a36Sopenharmony_ci				pgoff_t pgoff = linear_page_index(vma,
239862306a36Sopenharmony_ci						khugepaged_scan.address);
239962306a36Sopenharmony_ci
240062306a36Sopenharmony_ci				mmap_read_unlock(mm);
240162306a36Sopenharmony_ci				mmap_locked = false;
240262306a36Sopenharmony_ci				*result = hpage_collapse_scan_file(mm,
240362306a36Sopenharmony_ci					khugepaged_scan.address, file, pgoff, cc);
240462306a36Sopenharmony_ci				fput(file);
240562306a36Sopenharmony_ci				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
240662306a36Sopenharmony_ci					mmap_read_lock(mm);
240762306a36Sopenharmony_ci					if (hpage_collapse_test_exit(mm))
240862306a36Sopenharmony_ci						goto breakouterloop;
240962306a36Sopenharmony_ci					*result = collapse_pte_mapped_thp(mm,
241062306a36Sopenharmony_ci						khugepaged_scan.address, false);
241162306a36Sopenharmony_ci					if (*result == SCAN_PMD_MAPPED)
241262306a36Sopenharmony_ci						*result = SCAN_SUCCEED;
241362306a36Sopenharmony_ci					mmap_read_unlock(mm);
241462306a36Sopenharmony_ci				}
241562306a36Sopenharmony_ci			} else {
241662306a36Sopenharmony_ci				*result = hpage_collapse_scan_pmd(mm, vma,
241762306a36Sopenharmony_ci					khugepaged_scan.address, &mmap_locked, cc);
241862306a36Sopenharmony_ci			}
241962306a36Sopenharmony_ci
242062306a36Sopenharmony_ci			if (*result == SCAN_SUCCEED)
242162306a36Sopenharmony_ci				++khugepaged_pages_collapsed;
242262306a36Sopenharmony_ci
242362306a36Sopenharmony_ci			/* move to next address */
242462306a36Sopenharmony_ci			khugepaged_scan.address += HPAGE_PMD_SIZE;
242562306a36Sopenharmony_ci			progress += HPAGE_PMD_NR;
242662306a36Sopenharmony_ci			if (!mmap_locked)
242762306a36Sopenharmony_ci				/*
242862306a36Sopenharmony_ci				 * We released mmap_lock so break loop.  Note
242962306a36Sopenharmony_ci				 * that we drop mmap_lock before all hugepage
243062306a36Sopenharmony_ci				 * allocations, so if allocation fails, we are
243162306a36Sopenharmony_ci				 * guaranteed to break here and report the
243262306a36Sopenharmony_ci				 * correct result back to caller.
243362306a36Sopenharmony_ci				 */
243462306a36Sopenharmony_ci				goto breakouterloop_mmap_lock;
243562306a36Sopenharmony_ci			if (progress >= pages)
243662306a36Sopenharmony_ci				goto breakouterloop;
243762306a36Sopenharmony_ci		}
243862306a36Sopenharmony_ci	}
243962306a36Sopenharmony_cibreakouterloop:
244062306a36Sopenharmony_ci	mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
244162306a36Sopenharmony_cibreakouterloop_mmap_lock:
244262306a36Sopenharmony_ci
244362306a36Sopenharmony_ci	spin_lock(&khugepaged_mm_lock);
244462306a36Sopenharmony_ci	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
244562306a36Sopenharmony_ci	/*
244662306a36Sopenharmony_ci	 * Release the current mm_slot if this mm is about to die, or
244762306a36Sopenharmony_ci	 * if we scanned all vmas of this mm.
244862306a36Sopenharmony_ci	 */
244962306a36Sopenharmony_ci	if (hpage_collapse_test_exit(mm) || !vma) {
245062306a36Sopenharmony_ci		/*
245162306a36Sopenharmony_ci		 * Make sure that if mm_users is reaching zero while
245262306a36Sopenharmony_ci		 * khugepaged runs here, khugepaged_exit will find
245362306a36Sopenharmony_ci		 * mm_slot not pointing to the exiting mm.
245462306a36Sopenharmony_ci		 */
245562306a36Sopenharmony_ci		if (slot->mm_node.next != &khugepaged_scan.mm_head) {
245662306a36Sopenharmony_ci			slot = list_entry(slot->mm_node.next,
245762306a36Sopenharmony_ci					  struct mm_slot, mm_node);
245862306a36Sopenharmony_ci			khugepaged_scan.mm_slot =
245962306a36Sopenharmony_ci				mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
246062306a36Sopenharmony_ci			khugepaged_scan.address = 0;
246162306a36Sopenharmony_ci		} else {
246262306a36Sopenharmony_ci			khugepaged_scan.mm_slot = NULL;
246362306a36Sopenharmony_ci			khugepaged_full_scans++;
246462306a36Sopenharmony_ci		}
246562306a36Sopenharmony_ci
246662306a36Sopenharmony_ci		collect_mm_slot(mm_slot);
246762306a36Sopenharmony_ci	}
246862306a36Sopenharmony_ci
246962306a36Sopenharmony_ci	return progress;
247062306a36Sopenharmony_ci}
247162306a36Sopenharmony_ci
247262306a36Sopenharmony_cistatic int khugepaged_has_work(void)
247362306a36Sopenharmony_ci{
247462306a36Sopenharmony_ci	return !list_empty(&khugepaged_scan.mm_head) &&
247562306a36Sopenharmony_ci		hugepage_flags_enabled();
247662306a36Sopenharmony_ci}
247762306a36Sopenharmony_ci
247862306a36Sopenharmony_cistatic int khugepaged_wait_event(void)
247962306a36Sopenharmony_ci{
248062306a36Sopenharmony_ci	return !list_empty(&khugepaged_scan.mm_head) ||
248162306a36Sopenharmony_ci		kthread_should_stop();
248262306a36Sopenharmony_ci}
248362306a36Sopenharmony_ci
248462306a36Sopenharmony_cistatic void khugepaged_do_scan(struct collapse_control *cc)
248562306a36Sopenharmony_ci{
248662306a36Sopenharmony_ci	unsigned int progress = 0, pass_through_head = 0;
248762306a36Sopenharmony_ci	unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
248862306a36Sopenharmony_ci	bool wait = true;
248962306a36Sopenharmony_ci	int result = SCAN_SUCCEED;
249062306a36Sopenharmony_ci
249162306a36Sopenharmony_ci	lru_add_drain_all();
249262306a36Sopenharmony_ci
249362306a36Sopenharmony_ci	while (true) {
249462306a36Sopenharmony_ci		cond_resched();
249562306a36Sopenharmony_ci
249662306a36Sopenharmony_ci		if (unlikely(kthread_should_stop() || try_to_freeze()))
249762306a36Sopenharmony_ci			break;
249862306a36Sopenharmony_ci
249962306a36Sopenharmony_ci		spin_lock(&khugepaged_mm_lock);
250062306a36Sopenharmony_ci		if (!khugepaged_scan.mm_slot)
250162306a36Sopenharmony_ci			pass_through_head++;
250262306a36Sopenharmony_ci		if (khugepaged_has_work() &&
250362306a36Sopenharmony_ci		    pass_through_head < 2)
250462306a36Sopenharmony_ci			progress += khugepaged_scan_mm_slot(pages - progress,
250562306a36Sopenharmony_ci							    &result, cc);
250662306a36Sopenharmony_ci		else
250762306a36Sopenharmony_ci			progress = pages;
250862306a36Sopenharmony_ci		spin_unlock(&khugepaged_mm_lock);
250962306a36Sopenharmony_ci
251062306a36Sopenharmony_ci		if (progress >= pages)
251162306a36Sopenharmony_ci			break;
251262306a36Sopenharmony_ci
251362306a36Sopenharmony_ci		if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
251462306a36Sopenharmony_ci			/*
251562306a36Sopenharmony_ci			 * If fail to allocate the first time, try to sleep for
251662306a36Sopenharmony_ci			 * a while.  When hit again, cancel the scan.
251762306a36Sopenharmony_ci			 */
251862306a36Sopenharmony_ci			if (!wait)
251962306a36Sopenharmony_ci				break;
252062306a36Sopenharmony_ci			wait = false;
252162306a36Sopenharmony_ci			khugepaged_alloc_sleep();
252262306a36Sopenharmony_ci		}
252362306a36Sopenharmony_ci	}
252462306a36Sopenharmony_ci}
252562306a36Sopenharmony_ci
252662306a36Sopenharmony_cistatic bool khugepaged_should_wakeup(void)
252762306a36Sopenharmony_ci{
252862306a36Sopenharmony_ci	return kthread_should_stop() ||
252962306a36Sopenharmony_ci	       time_after_eq(jiffies, khugepaged_sleep_expire);
253062306a36Sopenharmony_ci}
253162306a36Sopenharmony_ci
253262306a36Sopenharmony_cistatic void khugepaged_wait_work(void)
253362306a36Sopenharmony_ci{
253462306a36Sopenharmony_ci	if (khugepaged_has_work()) {
253562306a36Sopenharmony_ci		const unsigned long scan_sleep_jiffies =
253662306a36Sopenharmony_ci			msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
253762306a36Sopenharmony_ci
253862306a36Sopenharmony_ci		if (!scan_sleep_jiffies)
253962306a36Sopenharmony_ci			return;
254062306a36Sopenharmony_ci
254162306a36Sopenharmony_ci		khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
254262306a36Sopenharmony_ci		wait_event_freezable_timeout(khugepaged_wait,
254362306a36Sopenharmony_ci					     khugepaged_should_wakeup(),
254462306a36Sopenharmony_ci					     scan_sleep_jiffies);
254562306a36Sopenharmony_ci		return;
254662306a36Sopenharmony_ci	}
254762306a36Sopenharmony_ci
254862306a36Sopenharmony_ci	if (hugepage_flags_enabled())
254962306a36Sopenharmony_ci		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
255062306a36Sopenharmony_ci}
255162306a36Sopenharmony_ci
255262306a36Sopenharmony_cistatic int khugepaged(void *none)
255362306a36Sopenharmony_ci{
255462306a36Sopenharmony_ci	struct khugepaged_mm_slot *mm_slot;
255562306a36Sopenharmony_ci
255662306a36Sopenharmony_ci	set_freezable();
255762306a36Sopenharmony_ci	set_user_nice(current, MAX_NICE);
255862306a36Sopenharmony_ci
255962306a36Sopenharmony_ci	while (!kthread_should_stop()) {
256062306a36Sopenharmony_ci		khugepaged_do_scan(&khugepaged_collapse_control);
256162306a36Sopenharmony_ci		khugepaged_wait_work();
256262306a36Sopenharmony_ci	}
256362306a36Sopenharmony_ci
256462306a36Sopenharmony_ci	spin_lock(&khugepaged_mm_lock);
256562306a36Sopenharmony_ci	mm_slot = khugepaged_scan.mm_slot;
256662306a36Sopenharmony_ci	khugepaged_scan.mm_slot = NULL;
256762306a36Sopenharmony_ci	if (mm_slot)
256862306a36Sopenharmony_ci		collect_mm_slot(mm_slot);
256962306a36Sopenharmony_ci	spin_unlock(&khugepaged_mm_lock);
257062306a36Sopenharmony_ci	return 0;
257162306a36Sopenharmony_ci}
257262306a36Sopenharmony_ci
257362306a36Sopenharmony_cistatic void set_recommended_min_free_kbytes(void)
257462306a36Sopenharmony_ci{
257562306a36Sopenharmony_ci	struct zone *zone;
257662306a36Sopenharmony_ci	int nr_zones = 0;
257762306a36Sopenharmony_ci	unsigned long recommended_min;
257862306a36Sopenharmony_ci
257962306a36Sopenharmony_ci	if (!hugepage_flags_enabled()) {
258062306a36Sopenharmony_ci		calculate_min_free_kbytes();
258162306a36Sopenharmony_ci		goto update_wmarks;
258262306a36Sopenharmony_ci	}
258362306a36Sopenharmony_ci
258462306a36Sopenharmony_ci	for_each_populated_zone(zone) {
258562306a36Sopenharmony_ci		/*
258662306a36Sopenharmony_ci		 * We don't need to worry about fragmentation of
258762306a36Sopenharmony_ci		 * ZONE_MOVABLE since it only has movable pages.
258862306a36Sopenharmony_ci		 */
258962306a36Sopenharmony_ci		if (zone_idx(zone) > gfp_zone(GFP_USER))
259062306a36Sopenharmony_ci			continue;
259162306a36Sopenharmony_ci
259262306a36Sopenharmony_ci		nr_zones++;
259362306a36Sopenharmony_ci	}
259462306a36Sopenharmony_ci
259562306a36Sopenharmony_ci	/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
259662306a36Sopenharmony_ci	recommended_min = pageblock_nr_pages * nr_zones * 2;
259762306a36Sopenharmony_ci
259862306a36Sopenharmony_ci	/*
259962306a36Sopenharmony_ci	 * Make sure that on average at least two pageblocks are almost free
260062306a36Sopenharmony_ci	 * of another type, one for a migratetype to fall back to and a
260162306a36Sopenharmony_ci	 * second to avoid subsequent fallbacks of other types There are 3
260262306a36Sopenharmony_ci	 * MIGRATE_TYPES we care about.
260362306a36Sopenharmony_ci	 */
260462306a36Sopenharmony_ci	recommended_min += pageblock_nr_pages * nr_zones *
260562306a36Sopenharmony_ci			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
260662306a36Sopenharmony_ci
260762306a36Sopenharmony_ci	/* don't ever allow to reserve more than 5% of the lowmem */
260862306a36Sopenharmony_ci	recommended_min = min(recommended_min,
260962306a36Sopenharmony_ci			      (unsigned long) nr_free_buffer_pages() / 20);
261062306a36Sopenharmony_ci	recommended_min <<= (PAGE_SHIFT-10);
261162306a36Sopenharmony_ci
261262306a36Sopenharmony_ci	if (recommended_min > min_free_kbytes) {
261362306a36Sopenharmony_ci		if (user_min_free_kbytes >= 0)
261462306a36Sopenharmony_ci			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
261562306a36Sopenharmony_ci				min_free_kbytes, recommended_min);
261662306a36Sopenharmony_ci
261762306a36Sopenharmony_ci		min_free_kbytes = recommended_min;
261862306a36Sopenharmony_ci	}
261962306a36Sopenharmony_ci
262062306a36Sopenharmony_ciupdate_wmarks:
262162306a36Sopenharmony_ci	setup_per_zone_wmarks();
262262306a36Sopenharmony_ci}
262362306a36Sopenharmony_ci
262462306a36Sopenharmony_ciint start_stop_khugepaged(void)
262562306a36Sopenharmony_ci{
262662306a36Sopenharmony_ci	int err = 0;
262762306a36Sopenharmony_ci
262862306a36Sopenharmony_ci	mutex_lock(&khugepaged_mutex);
262962306a36Sopenharmony_ci	if (hugepage_flags_enabled()) {
263062306a36Sopenharmony_ci		if (!khugepaged_thread)
263162306a36Sopenharmony_ci			khugepaged_thread = kthread_run(khugepaged, NULL,
263262306a36Sopenharmony_ci							"khugepaged");
263362306a36Sopenharmony_ci		if (IS_ERR(khugepaged_thread)) {
263462306a36Sopenharmony_ci			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
263562306a36Sopenharmony_ci			err = PTR_ERR(khugepaged_thread);
263662306a36Sopenharmony_ci			khugepaged_thread = NULL;
263762306a36Sopenharmony_ci			goto fail;
263862306a36Sopenharmony_ci		}
263962306a36Sopenharmony_ci
264062306a36Sopenharmony_ci		if (!list_empty(&khugepaged_scan.mm_head))
264162306a36Sopenharmony_ci			wake_up_interruptible(&khugepaged_wait);
264262306a36Sopenharmony_ci	} else if (khugepaged_thread) {
264362306a36Sopenharmony_ci		kthread_stop(khugepaged_thread);
264462306a36Sopenharmony_ci		khugepaged_thread = NULL;
264562306a36Sopenharmony_ci	}
264662306a36Sopenharmony_ci	set_recommended_min_free_kbytes();
264762306a36Sopenharmony_cifail:
264862306a36Sopenharmony_ci	mutex_unlock(&khugepaged_mutex);
264962306a36Sopenharmony_ci	return err;
265062306a36Sopenharmony_ci}
265162306a36Sopenharmony_ci
265262306a36Sopenharmony_civoid khugepaged_min_free_kbytes_update(void)
265362306a36Sopenharmony_ci{
265462306a36Sopenharmony_ci	mutex_lock(&khugepaged_mutex);
265562306a36Sopenharmony_ci	if (hugepage_flags_enabled() && khugepaged_thread)
265662306a36Sopenharmony_ci		set_recommended_min_free_kbytes();
265762306a36Sopenharmony_ci	mutex_unlock(&khugepaged_mutex);
265862306a36Sopenharmony_ci}
265962306a36Sopenharmony_ci
266062306a36Sopenharmony_cibool current_is_khugepaged(void)
266162306a36Sopenharmony_ci{
266262306a36Sopenharmony_ci	return kthread_func(current) == khugepaged;
266362306a36Sopenharmony_ci}
266462306a36Sopenharmony_ci
266562306a36Sopenharmony_cistatic int madvise_collapse_errno(enum scan_result r)
266662306a36Sopenharmony_ci{
266762306a36Sopenharmony_ci	/*
266862306a36Sopenharmony_ci	 * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
266962306a36Sopenharmony_ci	 * actionable feedback to caller, so they may take an appropriate
267062306a36Sopenharmony_ci	 * fallback measure depending on the nature of the failure.
267162306a36Sopenharmony_ci	 */
267262306a36Sopenharmony_ci	switch (r) {
267362306a36Sopenharmony_ci	case SCAN_ALLOC_HUGE_PAGE_FAIL:
267462306a36Sopenharmony_ci		return -ENOMEM;
267562306a36Sopenharmony_ci	case SCAN_CGROUP_CHARGE_FAIL:
267662306a36Sopenharmony_ci	case SCAN_EXCEED_NONE_PTE:
267762306a36Sopenharmony_ci		return -EBUSY;
267862306a36Sopenharmony_ci	/* Resource temporary unavailable - trying again might succeed */
267962306a36Sopenharmony_ci	case SCAN_PAGE_COUNT:
268062306a36Sopenharmony_ci	case SCAN_PAGE_LOCK:
268162306a36Sopenharmony_ci	case SCAN_PAGE_LRU:
268262306a36Sopenharmony_ci	case SCAN_DEL_PAGE_LRU:
268362306a36Sopenharmony_ci	case SCAN_PAGE_FILLED:
268462306a36Sopenharmony_ci		return -EAGAIN;
268562306a36Sopenharmony_ci	/*
268662306a36Sopenharmony_ci	 * Other: Trying again likely not to succeed / error intrinsic to
268762306a36Sopenharmony_ci	 * specified memory range. khugepaged likely won't be able to collapse
268862306a36Sopenharmony_ci	 * either.
268962306a36Sopenharmony_ci	 */
269062306a36Sopenharmony_ci	default:
269162306a36Sopenharmony_ci		return -EINVAL;
269262306a36Sopenharmony_ci	}
269362306a36Sopenharmony_ci}
269462306a36Sopenharmony_ci
269562306a36Sopenharmony_ciint madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
269662306a36Sopenharmony_ci		     unsigned long start, unsigned long end)
269762306a36Sopenharmony_ci{
269862306a36Sopenharmony_ci	struct collapse_control *cc;
269962306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
270062306a36Sopenharmony_ci	unsigned long hstart, hend, addr;
270162306a36Sopenharmony_ci	int thps = 0, last_fail = SCAN_FAIL;
270262306a36Sopenharmony_ci	bool mmap_locked = true;
270362306a36Sopenharmony_ci
270462306a36Sopenharmony_ci	BUG_ON(vma->vm_start > start);
270562306a36Sopenharmony_ci	BUG_ON(vma->vm_end < end);
270662306a36Sopenharmony_ci
270762306a36Sopenharmony_ci	*prev = vma;
270862306a36Sopenharmony_ci
270962306a36Sopenharmony_ci	if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
271062306a36Sopenharmony_ci		return -EINVAL;
271162306a36Sopenharmony_ci
271262306a36Sopenharmony_ci	cc = kmalloc(sizeof(*cc), GFP_KERNEL);
271362306a36Sopenharmony_ci	if (!cc)
271462306a36Sopenharmony_ci		return -ENOMEM;
271562306a36Sopenharmony_ci	cc->is_khugepaged = false;
271662306a36Sopenharmony_ci
271762306a36Sopenharmony_ci	mmgrab(mm);
271862306a36Sopenharmony_ci	lru_add_drain_all();
271962306a36Sopenharmony_ci
272062306a36Sopenharmony_ci	hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
272162306a36Sopenharmony_ci	hend = end & HPAGE_PMD_MASK;
272262306a36Sopenharmony_ci
272362306a36Sopenharmony_ci	for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
272462306a36Sopenharmony_ci		int result = SCAN_FAIL;
272562306a36Sopenharmony_ci
272662306a36Sopenharmony_ci		if (!mmap_locked) {
272762306a36Sopenharmony_ci			cond_resched();
272862306a36Sopenharmony_ci			mmap_read_lock(mm);
272962306a36Sopenharmony_ci			mmap_locked = true;
273062306a36Sopenharmony_ci			result = hugepage_vma_revalidate(mm, addr, false, &vma,
273162306a36Sopenharmony_ci							 cc);
273262306a36Sopenharmony_ci			if (result  != SCAN_SUCCEED) {
273362306a36Sopenharmony_ci				last_fail = result;
273462306a36Sopenharmony_ci				goto out_nolock;
273562306a36Sopenharmony_ci			}
273662306a36Sopenharmony_ci
273762306a36Sopenharmony_ci			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
273862306a36Sopenharmony_ci		}
273962306a36Sopenharmony_ci		mmap_assert_locked(mm);
274062306a36Sopenharmony_ci		memset(cc->node_load, 0, sizeof(cc->node_load));
274162306a36Sopenharmony_ci		nodes_clear(cc->alloc_nmask);
274262306a36Sopenharmony_ci		if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
274362306a36Sopenharmony_ci			struct file *file = get_file(vma->vm_file);
274462306a36Sopenharmony_ci			pgoff_t pgoff = linear_page_index(vma, addr);
274562306a36Sopenharmony_ci
274662306a36Sopenharmony_ci			mmap_read_unlock(mm);
274762306a36Sopenharmony_ci			mmap_locked = false;
274862306a36Sopenharmony_ci			result = hpage_collapse_scan_file(mm, addr, file, pgoff,
274962306a36Sopenharmony_ci							  cc);
275062306a36Sopenharmony_ci			fput(file);
275162306a36Sopenharmony_ci		} else {
275262306a36Sopenharmony_ci			result = hpage_collapse_scan_pmd(mm, vma, addr,
275362306a36Sopenharmony_ci							 &mmap_locked, cc);
275462306a36Sopenharmony_ci		}
275562306a36Sopenharmony_ci		if (!mmap_locked)
275662306a36Sopenharmony_ci			*prev = NULL;  /* Tell caller we dropped mmap_lock */
275762306a36Sopenharmony_ci
275862306a36Sopenharmony_cihandle_result:
275962306a36Sopenharmony_ci		switch (result) {
276062306a36Sopenharmony_ci		case SCAN_SUCCEED:
276162306a36Sopenharmony_ci		case SCAN_PMD_MAPPED:
276262306a36Sopenharmony_ci			++thps;
276362306a36Sopenharmony_ci			break;
276462306a36Sopenharmony_ci		case SCAN_PTE_MAPPED_HUGEPAGE:
276562306a36Sopenharmony_ci			BUG_ON(mmap_locked);
276662306a36Sopenharmony_ci			BUG_ON(*prev);
276762306a36Sopenharmony_ci			mmap_read_lock(mm);
276862306a36Sopenharmony_ci			result = collapse_pte_mapped_thp(mm, addr, true);
276962306a36Sopenharmony_ci			mmap_read_unlock(mm);
277062306a36Sopenharmony_ci			goto handle_result;
277162306a36Sopenharmony_ci		/* Whitelisted set of results where continuing OK */
277262306a36Sopenharmony_ci		case SCAN_PMD_NULL:
277362306a36Sopenharmony_ci		case SCAN_PTE_NON_PRESENT:
277462306a36Sopenharmony_ci		case SCAN_PTE_UFFD_WP:
277562306a36Sopenharmony_ci		case SCAN_PAGE_RO:
277662306a36Sopenharmony_ci		case SCAN_LACK_REFERENCED_PAGE:
277762306a36Sopenharmony_ci		case SCAN_PAGE_NULL:
277862306a36Sopenharmony_ci		case SCAN_PAGE_COUNT:
277962306a36Sopenharmony_ci		case SCAN_PAGE_LOCK:
278062306a36Sopenharmony_ci		case SCAN_PAGE_COMPOUND:
278162306a36Sopenharmony_ci		case SCAN_PAGE_LRU:
278262306a36Sopenharmony_ci		case SCAN_DEL_PAGE_LRU:
278362306a36Sopenharmony_ci			last_fail = result;
278462306a36Sopenharmony_ci			break;
278562306a36Sopenharmony_ci		default:
278662306a36Sopenharmony_ci			last_fail = result;
278762306a36Sopenharmony_ci			/* Other error, exit */
278862306a36Sopenharmony_ci			goto out_maybelock;
278962306a36Sopenharmony_ci		}
279062306a36Sopenharmony_ci	}
279162306a36Sopenharmony_ci
279262306a36Sopenharmony_ciout_maybelock:
279362306a36Sopenharmony_ci	/* Caller expects us to hold mmap_lock on return */
279462306a36Sopenharmony_ci	if (!mmap_locked)
279562306a36Sopenharmony_ci		mmap_read_lock(mm);
279662306a36Sopenharmony_ciout_nolock:
279762306a36Sopenharmony_ci	mmap_assert_locked(mm);
279862306a36Sopenharmony_ci	mmdrop(mm);
279962306a36Sopenharmony_ci	kfree(cc);
280062306a36Sopenharmony_ci
280162306a36Sopenharmony_ci	return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
280262306a36Sopenharmony_ci			: madvise_collapse_errno(last_fail);
280362306a36Sopenharmony_ci}
2804