162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 362306a36Sopenharmony_ci 462306a36Sopenharmony_ci#include <linux/mm.h> 562306a36Sopenharmony_ci#include <linux/sched.h> 662306a36Sopenharmony_ci#include <linux/sched/mm.h> 762306a36Sopenharmony_ci#include <linux/sched/coredump.h> 862306a36Sopenharmony_ci#include <linux/mmu_notifier.h> 962306a36Sopenharmony_ci#include <linux/rmap.h> 1062306a36Sopenharmony_ci#include <linux/swap.h> 1162306a36Sopenharmony_ci#include <linux/mm_inline.h> 1262306a36Sopenharmony_ci#include <linux/kthread.h> 1362306a36Sopenharmony_ci#include <linux/khugepaged.h> 1462306a36Sopenharmony_ci#include <linux/freezer.h> 1562306a36Sopenharmony_ci#include <linux/mman.h> 1662306a36Sopenharmony_ci#include <linux/hashtable.h> 1762306a36Sopenharmony_ci#include <linux/userfaultfd_k.h> 1862306a36Sopenharmony_ci#include <linux/page_idle.h> 1962306a36Sopenharmony_ci#include <linux/page_table_check.h> 2062306a36Sopenharmony_ci#include <linux/swapops.h> 2162306a36Sopenharmony_ci#include <linux/shmem_fs.h> 2262306a36Sopenharmony_ci#include <linux/ksm.h> 2362306a36Sopenharmony_ci 2462306a36Sopenharmony_ci#include <asm/tlb.h> 2562306a36Sopenharmony_ci#include <asm/pgalloc.h> 2662306a36Sopenharmony_ci#include "internal.h" 2762306a36Sopenharmony_ci#include "mm_slot.h" 2862306a36Sopenharmony_ci 2962306a36Sopenharmony_cienum scan_result { 3062306a36Sopenharmony_ci SCAN_FAIL, 3162306a36Sopenharmony_ci SCAN_SUCCEED, 3262306a36Sopenharmony_ci SCAN_PMD_NULL, 3362306a36Sopenharmony_ci SCAN_PMD_NONE, 3462306a36Sopenharmony_ci SCAN_PMD_MAPPED, 3562306a36Sopenharmony_ci SCAN_EXCEED_NONE_PTE, 3662306a36Sopenharmony_ci SCAN_EXCEED_SWAP_PTE, 3762306a36Sopenharmony_ci SCAN_EXCEED_SHARED_PTE, 3862306a36Sopenharmony_ci SCAN_PTE_NON_PRESENT, 3962306a36Sopenharmony_ci SCAN_PTE_UFFD_WP, 4062306a36Sopenharmony_ci SCAN_PTE_MAPPED_HUGEPAGE, 4162306a36Sopenharmony_ci SCAN_PAGE_RO, 4262306a36Sopenharmony_ci 
SCAN_LACK_REFERENCED_PAGE, 4362306a36Sopenharmony_ci SCAN_PAGE_NULL, 4462306a36Sopenharmony_ci SCAN_SCAN_ABORT, 4562306a36Sopenharmony_ci SCAN_PAGE_COUNT, 4662306a36Sopenharmony_ci SCAN_PAGE_LRU, 4762306a36Sopenharmony_ci SCAN_PAGE_LOCK, 4862306a36Sopenharmony_ci SCAN_PAGE_ANON, 4962306a36Sopenharmony_ci SCAN_PAGE_COMPOUND, 5062306a36Sopenharmony_ci SCAN_ANY_PROCESS, 5162306a36Sopenharmony_ci SCAN_VMA_NULL, 5262306a36Sopenharmony_ci SCAN_VMA_CHECK, 5362306a36Sopenharmony_ci SCAN_ADDRESS_RANGE, 5462306a36Sopenharmony_ci SCAN_DEL_PAGE_LRU, 5562306a36Sopenharmony_ci SCAN_ALLOC_HUGE_PAGE_FAIL, 5662306a36Sopenharmony_ci SCAN_CGROUP_CHARGE_FAIL, 5762306a36Sopenharmony_ci SCAN_TRUNCATED, 5862306a36Sopenharmony_ci SCAN_PAGE_HAS_PRIVATE, 5962306a36Sopenharmony_ci SCAN_STORE_FAILED, 6062306a36Sopenharmony_ci SCAN_COPY_MC, 6162306a36Sopenharmony_ci SCAN_PAGE_FILLED, 6262306a36Sopenharmony_ci}; 6362306a36Sopenharmony_ci 6462306a36Sopenharmony_ci#define CREATE_TRACE_POINTS 6562306a36Sopenharmony_ci#include <trace/events/huge_memory.h> 6662306a36Sopenharmony_ci 6762306a36Sopenharmony_cistatic struct task_struct *khugepaged_thread __read_mostly; 6862306a36Sopenharmony_cistatic DEFINE_MUTEX(khugepaged_mutex); 6962306a36Sopenharmony_ci 7062306a36Sopenharmony_ci/* default scan 8*512 pte (or vmas) every 30 second */ 7162306a36Sopenharmony_cistatic unsigned int khugepaged_pages_to_scan __read_mostly; 7262306a36Sopenharmony_cistatic unsigned int khugepaged_pages_collapsed; 7362306a36Sopenharmony_cistatic unsigned int khugepaged_full_scans; 7462306a36Sopenharmony_cistatic unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; 7562306a36Sopenharmony_ci/* during fragmentation poll the hugepage allocator once every minute */ 7662306a36Sopenharmony_cistatic unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; 7762306a36Sopenharmony_cistatic unsigned long khugepaged_sleep_expire; 7862306a36Sopenharmony_cistatic DEFINE_SPINLOCK(khugepaged_mm_lock); 
7962306a36Sopenharmony_cistatic DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); 8062306a36Sopenharmony_ci/* 8162306a36Sopenharmony_ci * default collapse hugepages if there is at least one pte mapped like 8262306a36Sopenharmony_ci * it would have happened if the vma was large enough during page 8362306a36Sopenharmony_ci * fault. 8462306a36Sopenharmony_ci * 8562306a36Sopenharmony_ci * Note that these are only respected if collapse was initiated by khugepaged. 8662306a36Sopenharmony_ci */ 8762306a36Sopenharmony_cistatic unsigned int khugepaged_max_ptes_none __read_mostly; 8862306a36Sopenharmony_cistatic unsigned int khugepaged_max_ptes_swap __read_mostly; 8962306a36Sopenharmony_cistatic unsigned int khugepaged_max_ptes_shared __read_mostly; 9062306a36Sopenharmony_ci 9162306a36Sopenharmony_ci#define MM_SLOTS_HASH_BITS 10 9262306a36Sopenharmony_cistatic DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); 9362306a36Sopenharmony_ci 9462306a36Sopenharmony_cistatic struct kmem_cache *mm_slot_cache __read_mostly; 9562306a36Sopenharmony_ci 9662306a36Sopenharmony_cistruct collapse_control { 9762306a36Sopenharmony_ci bool is_khugepaged; 9862306a36Sopenharmony_ci 9962306a36Sopenharmony_ci /* Num pages scanned per node */ 10062306a36Sopenharmony_ci u32 node_load[MAX_NUMNODES]; 10162306a36Sopenharmony_ci 10262306a36Sopenharmony_ci /* nodemask for allocation fallback */ 10362306a36Sopenharmony_ci nodemask_t alloc_nmask; 10462306a36Sopenharmony_ci}; 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_ci/** 10762306a36Sopenharmony_ci * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned 10862306a36Sopenharmony_ci * @slot: hash lookup from mm to mm_slot 10962306a36Sopenharmony_ci */ 11062306a36Sopenharmony_cistruct khugepaged_mm_slot { 11162306a36Sopenharmony_ci struct mm_slot slot; 11262306a36Sopenharmony_ci}; 11362306a36Sopenharmony_ci 11462306a36Sopenharmony_ci/** 11562306a36Sopenharmony_ci * struct khugepaged_scan - cursor for scanning 
11662306a36Sopenharmony_ci * @mm_head: the head of the mm list to scan 11762306a36Sopenharmony_ci * @mm_slot: the current mm_slot we are scanning 11862306a36Sopenharmony_ci * @address: the next address inside that to be scanned 11962306a36Sopenharmony_ci * 12062306a36Sopenharmony_ci * There is only the one khugepaged_scan instance of this cursor structure. 12162306a36Sopenharmony_ci */ 12262306a36Sopenharmony_cistruct khugepaged_scan { 12362306a36Sopenharmony_ci struct list_head mm_head; 12462306a36Sopenharmony_ci struct khugepaged_mm_slot *mm_slot; 12562306a36Sopenharmony_ci unsigned long address; 12662306a36Sopenharmony_ci}; 12762306a36Sopenharmony_ci 12862306a36Sopenharmony_cistatic struct khugepaged_scan khugepaged_scan = { 12962306a36Sopenharmony_ci .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), 13062306a36Sopenharmony_ci}; 13162306a36Sopenharmony_ci 13262306a36Sopenharmony_ci#ifdef CONFIG_SYSFS 13362306a36Sopenharmony_cistatic ssize_t scan_sleep_millisecs_show(struct kobject *kobj, 13462306a36Sopenharmony_ci struct kobj_attribute *attr, 13562306a36Sopenharmony_ci char *buf) 13662306a36Sopenharmony_ci{ 13762306a36Sopenharmony_ci return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); 13862306a36Sopenharmony_ci} 13962306a36Sopenharmony_ci 14062306a36Sopenharmony_cistatic ssize_t scan_sleep_millisecs_store(struct kobject *kobj, 14162306a36Sopenharmony_ci struct kobj_attribute *attr, 14262306a36Sopenharmony_ci const char *buf, size_t count) 14362306a36Sopenharmony_ci{ 14462306a36Sopenharmony_ci unsigned int msecs; 14562306a36Sopenharmony_ci int err; 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_ci err = kstrtouint(buf, 10, &msecs); 14862306a36Sopenharmony_ci if (err) 14962306a36Sopenharmony_ci return -EINVAL; 15062306a36Sopenharmony_ci 15162306a36Sopenharmony_ci khugepaged_scan_sleep_millisecs = msecs; 15262306a36Sopenharmony_ci khugepaged_sleep_expire = 0; 15362306a36Sopenharmony_ci wake_up_interruptible(&khugepaged_wait); 
15462306a36Sopenharmony_ci 15562306a36Sopenharmony_ci return count; 15662306a36Sopenharmony_ci} 15762306a36Sopenharmony_cistatic struct kobj_attribute scan_sleep_millisecs_attr = 15862306a36Sopenharmony_ci __ATTR_RW(scan_sleep_millisecs); 15962306a36Sopenharmony_ci 16062306a36Sopenharmony_cistatic ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, 16162306a36Sopenharmony_ci struct kobj_attribute *attr, 16262306a36Sopenharmony_ci char *buf) 16362306a36Sopenharmony_ci{ 16462306a36Sopenharmony_ci return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs); 16562306a36Sopenharmony_ci} 16662306a36Sopenharmony_ci 16762306a36Sopenharmony_cistatic ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, 16862306a36Sopenharmony_ci struct kobj_attribute *attr, 16962306a36Sopenharmony_ci const char *buf, size_t count) 17062306a36Sopenharmony_ci{ 17162306a36Sopenharmony_ci unsigned int msecs; 17262306a36Sopenharmony_ci int err; 17362306a36Sopenharmony_ci 17462306a36Sopenharmony_ci err = kstrtouint(buf, 10, &msecs); 17562306a36Sopenharmony_ci if (err) 17662306a36Sopenharmony_ci return -EINVAL; 17762306a36Sopenharmony_ci 17862306a36Sopenharmony_ci khugepaged_alloc_sleep_millisecs = msecs; 17962306a36Sopenharmony_ci khugepaged_sleep_expire = 0; 18062306a36Sopenharmony_ci wake_up_interruptible(&khugepaged_wait); 18162306a36Sopenharmony_ci 18262306a36Sopenharmony_ci return count; 18362306a36Sopenharmony_ci} 18462306a36Sopenharmony_cistatic struct kobj_attribute alloc_sleep_millisecs_attr = 18562306a36Sopenharmony_ci __ATTR_RW(alloc_sleep_millisecs); 18662306a36Sopenharmony_ci 18762306a36Sopenharmony_cistatic ssize_t pages_to_scan_show(struct kobject *kobj, 18862306a36Sopenharmony_ci struct kobj_attribute *attr, 18962306a36Sopenharmony_ci char *buf) 19062306a36Sopenharmony_ci{ 19162306a36Sopenharmony_ci return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan); 19262306a36Sopenharmony_ci} 19362306a36Sopenharmony_cistatic ssize_t pages_to_scan_store(struct kobject *kobj, 
19462306a36Sopenharmony_ci struct kobj_attribute *attr, 19562306a36Sopenharmony_ci const char *buf, size_t count) 19662306a36Sopenharmony_ci{ 19762306a36Sopenharmony_ci unsigned int pages; 19862306a36Sopenharmony_ci int err; 19962306a36Sopenharmony_ci 20062306a36Sopenharmony_ci err = kstrtouint(buf, 10, &pages); 20162306a36Sopenharmony_ci if (err || !pages) 20262306a36Sopenharmony_ci return -EINVAL; 20362306a36Sopenharmony_ci 20462306a36Sopenharmony_ci khugepaged_pages_to_scan = pages; 20562306a36Sopenharmony_ci 20662306a36Sopenharmony_ci return count; 20762306a36Sopenharmony_ci} 20862306a36Sopenharmony_cistatic struct kobj_attribute pages_to_scan_attr = 20962306a36Sopenharmony_ci __ATTR_RW(pages_to_scan); 21062306a36Sopenharmony_ci 21162306a36Sopenharmony_cistatic ssize_t pages_collapsed_show(struct kobject *kobj, 21262306a36Sopenharmony_ci struct kobj_attribute *attr, 21362306a36Sopenharmony_ci char *buf) 21462306a36Sopenharmony_ci{ 21562306a36Sopenharmony_ci return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed); 21662306a36Sopenharmony_ci} 21762306a36Sopenharmony_cistatic struct kobj_attribute pages_collapsed_attr = 21862306a36Sopenharmony_ci __ATTR_RO(pages_collapsed); 21962306a36Sopenharmony_ci 22062306a36Sopenharmony_cistatic ssize_t full_scans_show(struct kobject *kobj, 22162306a36Sopenharmony_ci struct kobj_attribute *attr, 22262306a36Sopenharmony_ci char *buf) 22362306a36Sopenharmony_ci{ 22462306a36Sopenharmony_ci return sysfs_emit(buf, "%u\n", khugepaged_full_scans); 22562306a36Sopenharmony_ci} 22662306a36Sopenharmony_cistatic struct kobj_attribute full_scans_attr = 22762306a36Sopenharmony_ci __ATTR_RO(full_scans); 22862306a36Sopenharmony_ci 22962306a36Sopenharmony_cistatic ssize_t defrag_show(struct kobject *kobj, 23062306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 23162306a36Sopenharmony_ci{ 23262306a36Sopenharmony_ci return single_hugepage_flag_show(kobj, attr, buf, 23362306a36Sopenharmony_ci 
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 23462306a36Sopenharmony_ci} 23562306a36Sopenharmony_cistatic ssize_t defrag_store(struct kobject *kobj, 23662306a36Sopenharmony_ci struct kobj_attribute *attr, 23762306a36Sopenharmony_ci const char *buf, size_t count) 23862306a36Sopenharmony_ci{ 23962306a36Sopenharmony_ci return single_hugepage_flag_store(kobj, attr, buf, count, 24062306a36Sopenharmony_ci TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 24162306a36Sopenharmony_ci} 24262306a36Sopenharmony_cistatic struct kobj_attribute khugepaged_defrag_attr = 24362306a36Sopenharmony_ci __ATTR_RW(defrag); 24462306a36Sopenharmony_ci 24562306a36Sopenharmony_ci/* 24662306a36Sopenharmony_ci * max_ptes_none controls if khugepaged should collapse hugepages over 24762306a36Sopenharmony_ci * any unmapped ptes in turn potentially increasing the memory 24862306a36Sopenharmony_ci * footprint of the vmas. When max_ptes_none is 0 khugepaged will not 24962306a36Sopenharmony_ci * reduce the available free memory in the system as it 25062306a36Sopenharmony_ci * runs. Increasing max_ptes_none will instead potentially reduce the 25162306a36Sopenharmony_ci * free memory in the system during the khugepaged scan. 
25262306a36Sopenharmony_ci */ 25362306a36Sopenharmony_cistatic ssize_t max_ptes_none_show(struct kobject *kobj, 25462306a36Sopenharmony_ci struct kobj_attribute *attr, 25562306a36Sopenharmony_ci char *buf) 25662306a36Sopenharmony_ci{ 25762306a36Sopenharmony_ci return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); 25862306a36Sopenharmony_ci} 25962306a36Sopenharmony_cistatic ssize_t max_ptes_none_store(struct kobject *kobj, 26062306a36Sopenharmony_ci struct kobj_attribute *attr, 26162306a36Sopenharmony_ci const char *buf, size_t count) 26262306a36Sopenharmony_ci{ 26362306a36Sopenharmony_ci int err; 26462306a36Sopenharmony_ci unsigned long max_ptes_none; 26562306a36Sopenharmony_ci 26662306a36Sopenharmony_ci err = kstrtoul(buf, 10, &max_ptes_none); 26762306a36Sopenharmony_ci if (err || max_ptes_none > HPAGE_PMD_NR - 1) 26862306a36Sopenharmony_ci return -EINVAL; 26962306a36Sopenharmony_ci 27062306a36Sopenharmony_ci khugepaged_max_ptes_none = max_ptes_none; 27162306a36Sopenharmony_ci 27262306a36Sopenharmony_ci return count; 27362306a36Sopenharmony_ci} 27462306a36Sopenharmony_cistatic struct kobj_attribute khugepaged_max_ptes_none_attr = 27562306a36Sopenharmony_ci __ATTR_RW(max_ptes_none); 27662306a36Sopenharmony_ci 27762306a36Sopenharmony_cistatic ssize_t max_ptes_swap_show(struct kobject *kobj, 27862306a36Sopenharmony_ci struct kobj_attribute *attr, 27962306a36Sopenharmony_ci char *buf) 28062306a36Sopenharmony_ci{ 28162306a36Sopenharmony_ci return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); 28262306a36Sopenharmony_ci} 28362306a36Sopenharmony_ci 28462306a36Sopenharmony_cistatic ssize_t max_ptes_swap_store(struct kobject *kobj, 28562306a36Sopenharmony_ci struct kobj_attribute *attr, 28662306a36Sopenharmony_ci const char *buf, size_t count) 28762306a36Sopenharmony_ci{ 28862306a36Sopenharmony_ci int err; 28962306a36Sopenharmony_ci unsigned long max_ptes_swap; 29062306a36Sopenharmony_ci 29162306a36Sopenharmony_ci err = kstrtoul(buf, 10, &max_ptes_swap); 
29262306a36Sopenharmony_ci if (err || max_ptes_swap > HPAGE_PMD_NR - 1) 29362306a36Sopenharmony_ci return -EINVAL; 29462306a36Sopenharmony_ci 29562306a36Sopenharmony_ci khugepaged_max_ptes_swap = max_ptes_swap; 29662306a36Sopenharmony_ci 29762306a36Sopenharmony_ci return count; 29862306a36Sopenharmony_ci} 29962306a36Sopenharmony_ci 30062306a36Sopenharmony_cistatic struct kobj_attribute khugepaged_max_ptes_swap_attr = 30162306a36Sopenharmony_ci __ATTR_RW(max_ptes_swap); 30262306a36Sopenharmony_ci 30362306a36Sopenharmony_cistatic ssize_t max_ptes_shared_show(struct kobject *kobj, 30462306a36Sopenharmony_ci struct kobj_attribute *attr, 30562306a36Sopenharmony_ci char *buf) 30662306a36Sopenharmony_ci{ 30762306a36Sopenharmony_ci return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); 30862306a36Sopenharmony_ci} 30962306a36Sopenharmony_ci 31062306a36Sopenharmony_cistatic ssize_t max_ptes_shared_store(struct kobject *kobj, 31162306a36Sopenharmony_ci struct kobj_attribute *attr, 31262306a36Sopenharmony_ci const char *buf, size_t count) 31362306a36Sopenharmony_ci{ 31462306a36Sopenharmony_ci int err; 31562306a36Sopenharmony_ci unsigned long max_ptes_shared; 31662306a36Sopenharmony_ci 31762306a36Sopenharmony_ci err = kstrtoul(buf, 10, &max_ptes_shared); 31862306a36Sopenharmony_ci if (err || max_ptes_shared > HPAGE_PMD_NR - 1) 31962306a36Sopenharmony_ci return -EINVAL; 32062306a36Sopenharmony_ci 32162306a36Sopenharmony_ci khugepaged_max_ptes_shared = max_ptes_shared; 32262306a36Sopenharmony_ci 32362306a36Sopenharmony_ci return count; 32462306a36Sopenharmony_ci} 32562306a36Sopenharmony_ci 32662306a36Sopenharmony_cistatic struct kobj_attribute khugepaged_max_ptes_shared_attr = 32762306a36Sopenharmony_ci __ATTR_RW(max_ptes_shared); 32862306a36Sopenharmony_ci 32962306a36Sopenharmony_cistatic struct attribute *khugepaged_attr[] = { 33062306a36Sopenharmony_ci &khugepaged_defrag_attr.attr, 33162306a36Sopenharmony_ci &khugepaged_max_ptes_none_attr.attr, 33262306a36Sopenharmony_ci 
&khugepaged_max_ptes_swap_attr.attr, 33362306a36Sopenharmony_ci &khugepaged_max_ptes_shared_attr.attr, 33462306a36Sopenharmony_ci &pages_to_scan_attr.attr, 33562306a36Sopenharmony_ci &pages_collapsed_attr.attr, 33662306a36Sopenharmony_ci &full_scans_attr.attr, 33762306a36Sopenharmony_ci &scan_sleep_millisecs_attr.attr, 33862306a36Sopenharmony_ci &alloc_sleep_millisecs_attr.attr, 33962306a36Sopenharmony_ci NULL, 34062306a36Sopenharmony_ci}; 34162306a36Sopenharmony_ci 34262306a36Sopenharmony_cistruct attribute_group khugepaged_attr_group = { 34362306a36Sopenharmony_ci .attrs = khugepaged_attr, 34462306a36Sopenharmony_ci .name = "khugepaged", 34562306a36Sopenharmony_ci}; 34662306a36Sopenharmony_ci#endif /* CONFIG_SYSFS */ 34762306a36Sopenharmony_ci 34862306a36Sopenharmony_ciint hugepage_madvise(struct vm_area_struct *vma, 34962306a36Sopenharmony_ci unsigned long *vm_flags, int advice) 35062306a36Sopenharmony_ci{ 35162306a36Sopenharmony_ci switch (advice) { 35262306a36Sopenharmony_ci case MADV_HUGEPAGE: 35362306a36Sopenharmony_ci#ifdef CONFIG_S390 35462306a36Sopenharmony_ci /* 35562306a36Sopenharmony_ci * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 35662306a36Sopenharmony_ci * can't handle this properly after s390_enable_sie, so we simply 35762306a36Sopenharmony_ci * ignore the madvise to prevent qemu from causing a SIGSEGV. 35862306a36Sopenharmony_ci */ 35962306a36Sopenharmony_ci if (mm_has_pgste(vma->vm_mm)) 36062306a36Sopenharmony_ci return 0; 36162306a36Sopenharmony_ci#endif 36262306a36Sopenharmony_ci *vm_flags &= ~VM_NOHUGEPAGE; 36362306a36Sopenharmony_ci *vm_flags |= VM_HUGEPAGE; 36462306a36Sopenharmony_ci /* 36562306a36Sopenharmony_ci * If the vma become good for khugepaged to scan, 36662306a36Sopenharmony_ci * register it here without waiting a page fault that 36762306a36Sopenharmony_ci * may not happen any time soon. 
36862306a36Sopenharmony_ci */ 36962306a36Sopenharmony_ci khugepaged_enter_vma(vma, *vm_flags); 37062306a36Sopenharmony_ci break; 37162306a36Sopenharmony_ci case MADV_NOHUGEPAGE: 37262306a36Sopenharmony_ci *vm_flags &= ~VM_HUGEPAGE; 37362306a36Sopenharmony_ci *vm_flags |= VM_NOHUGEPAGE; 37462306a36Sopenharmony_ci /* 37562306a36Sopenharmony_ci * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning 37662306a36Sopenharmony_ci * this vma even if we leave the mm registered in khugepaged if 37762306a36Sopenharmony_ci * it got registered before VM_NOHUGEPAGE was set. 37862306a36Sopenharmony_ci */ 37962306a36Sopenharmony_ci break; 38062306a36Sopenharmony_ci } 38162306a36Sopenharmony_ci 38262306a36Sopenharmony_ci return 0; 38362306a36Sopenharmony_ci} 38462306a36Sopenharmony_ci 38562306a36Sopenharmony_ciint __init khugepaged_init(void) 38662306a36Sopenharmony_ci{ 38762306a36Sopenharmony_ci mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", 38862306a36Sopenharmony_ci sizeof(struct khugepaged_mm_slot), 38962306a36Sopenharmony_ci __alignof__(struct khugepaged_mm_slot), 39062306a36Sopenharmony_ci 0, NULL); 39162306a36Sopenharmony_ci if (!mm_slot_cache) 39262306a36Sopenharmony_ci return -ENOMEM; 39362306a36Sopenharmony_ci 39462306a36Sopenharmony_ci khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; 39562306a36Sopenharmony_ci khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; 39662306a36Sopenharmony_ci khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; 39762306a36Sopenharmony_ci khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; 39862306a36Sopenharmony_ci 39962306a36Sopenharmony_ci return 0; 40062306a36Sopenharmony_ci} 40162306a36Sopenharmony_ci 40262306a36Sopenharmony_civoid __init khugepaged_destroy(void) 40362306a36Sopenharmony_ci{ 40462306a36Sopenharmony_ci kmem_cache_destroy(mm_slot_cache); 40562306a36Sopenharmony_ci} 40662306a36Sopenharmony_ci 40762306a36Sopenharmony_cistatic inline int hpage_collapse_test_exit(struct mm_struct *mm) 40862306a36Sopenharmony_ci{ 
40962306a36Sopenharmony_ci return atomic_read(&mm->mm_users) == 0; 41062306a36Sopenharmony_ci} 41162306a36Sopenharmony_ci 41262306a36Sopenharmony_civoid __khugepaged_enter(struct mm_struct *mm) 41362306a36Sopenharmony_ci{ 41462306a36Sopenharmony_ci struct khugepaged_mm_slot *mm_slot; 41562306a36Sopenharmony_ci struct mm_slot *slot; 41662306a36Sopenharmony_ci int wakeup; 41762306a36Sopenharmony_ci 41862306a36Sopenharmony_ci /* __khugepaged_exit() must not run from under us */ 41962306a36Sopenharmony_ci VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); 42062306a36Sopenharmony_ci if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) 42162306a36Sopenharmony_ci return; 42262306a36Sopenharmony_ci 42362306a36Sopenharmony_ci mm_slot = mm_slot_alloc(mm_slot_cache); 42462306a36Sopenharmony_ci if (!mm_slot) 42562306a36Sopenharmony_ci return; 42662306a36Sopenharmony_ci 42762306a36Sopenharmony_ci slot = &mm_slot->slot; 42862306a36Sopenharmony_ci 42962306a36Sopenharmony_ci spin_lock(&khugepaged_mm_lock); 43062306a36Sopenharmony_ci mm_slot_insert(mm_slots_hash, mm, slot); 43162306a36Sopenharmony_ci /* 43262306a36Sopenharmony_ci * Insert just behind the scanning cursor, to let the area settle 43362306a36Sopenharmony_ci * down a little. 
43462306a36Sopenharmony_ci */ 43562306a36Sopenharmony_ci wakeup = list_empty(&khugepaged_scan.mm_head); 43662306a36Sopenharmony_ci list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); 43762306a36Sopenharmony_ci spin_unlock(&khugepaged_mm_lock); 43862306a36Sopenharmony_ci 43962306a36Sopenharmony_ci mmgrab(mm); 44062306a36Sopenharmony_ci if (wakeup) 44162306a36Sopenharmony_ci wake_up_interruptible(&khugepaged_wait); 44262306a36Sopenharmony_ci} 44362306a36Sopenharmony_ci 44462306a36Sopenharmony_civoid khugepaged_enter_vma(struct vm_area_struct *vma, 44562306a36Sopenharmony_ci unsigned long vm_flags) 44662306a36Sopenharmony_ci{ 44762306a36Sopenharmony_ci if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && 44862306a36Sopenharmony_ci hugepage_flags_enabled()) { 44962306a36Sopenharmony_ci if (hugepage_vma_check(vma, vm_flags, false, false, true)) 45062306a36Sopenharmony_ci __khugepaged_enter(vma->vm_mm); 45162306a36Sopenharmony_ci } 45262306a36Sopenharmony_ci} 45362306a36Sopenharmony_ci 45462306a36Sopenharmony_civoid __khugepaged_exit(struct mm_struct *mm) 45562306a36Sopenharmony_ci{ 45662306a36Sopenharmony_ci struct khugepaged_mm_slot *mm_slot; 45762306a36Sopenharmony_ci struct mm_slot *slot; 45862306a36Sopenharmony_ci int free = 0; 45962306a36Sopenharmony_ci 46062306a36Sopenharmony_ci spin_lock(&khugepaged_mm_lock); 46162306a36Sopenharmony_ci slot = mm_slot_lookup(mm_slots_hash, mm); 46262306a36Sopenharmony_ci mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); 46362306a36Sopenharmony_ci if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { 46462306a36Sopenharmony_ci hash_del(&slot->hash); 46562306a36Sopenharmony_ci list_del(&slot->mm_node); 46662306a36Sopenharmony_ci free = 1; 46762306a36Sopenharmony_ci } 46862306a36Sopenharmony_ci spin_unlock(&khugepaged_mm_lock); 46962306a36Sopenharmony_ci 47062306a36Sopenharmony_ci if (free) { 47162306a36Sopenharmony_ci clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 47262306a36Sopenharmony_ci mm_slot_free(mm_slot_cache, 
mm_slot); 47362306a36Sopenharmony_ci mmdrop(mm); 47462306a36Sopenharmony_ci } else if (mm_slot) { 47562306a36Sopenharmony_ci /* 47662306a36Sopenharmony_ci * This is required to serialize against 47762306a36Sopenharmony_ci * hpage_collapse_test_exit() (which is guaranteed to run 47862306a36Sopenharmony_ci * under mmap sem read mode). Stop here (after we return all 47962306a36Sopenharmony_ci * pagetables will be destroyed) until khugepaged has finished 48062306a36Sopenharmony_ci * working on the pagetables under the mmap_lock. 48162306a36Sopenharmony_ci */ 48262306a36Sopenharmony_ci mmap_write_lock(mm); 48362306a36Sopenharmony_ci mmap_write_unlock(mm); 48462306a36Sopenharmony_ci } 48562306a36Sopenharmony_ci} 48662306a36Sopenharmony_ci 48762306a36Sopenharmony_cistatic void release_pte_folio(struct folio *folio) 48862306a36Sopenharmony_ci{ 48962306a36Sopenharmony_ci node_stat_mod_folio(folio, 49062306a36Sopenharmony_ci NR_ISOLATED_ANON + folio_is_file_lru(folio), 49162306a36Sopenharmony_ci -folio_nr_pages(folio)); 49262306a36Sopenharmony_ci folio_unlock(folio); 49362306a36Sopenharmony_ci folio_putback_lru(folio); 49462306a36Sopenharmony_ci} 49562306a36Sopenharmony_ci 49662306a36Sopenharmony_cistatic void release_pte_page(struct page *page) 49762306a36Sopenharmony_ci{ 49862306a36Sopenharmony_ci release_pte_folio(page_folio(page)); 49962306a36Sopenharmony_ci} 50062306a36Sopenharmony_ci 50162306a36Sopenharmony_cistatic void release_pte_pages(pte_t *pte, pte_t *_pte, 50262306a36Sopenharmony_ci struct list_head *compound_pagelist) 50362306a36Sopenharmony_ci{ 50462306a36Sopenharmony_ci struct folio *folio, *tmp; 50562306a36Sopenharmony_ci 50662306a36Sopenharmony_ci while (--_pte >= pte) { 50762306a36Sopenharmony_ci pte_t pteval = ptep_get(_pte); 50862306a36Sopenharmony_ci unsigned long pfn; 50962306a36Sopenharmony_ci 51062306a36Sopenharmony_ci if (pte_none(pteval)) 51162306a36Sopenharmony_ci continue; 51262306a36Sopenharmony_ci pfn = pte_pfn(pteval); 
51362306a36Sopenharmony_ci if (is_zero_pfn(pfn)) 51462306a36Sopenharmony_ci continue; 51562306a36Sopenharmony_ci folio = pfn_folio(pfn); 51662306a36Sopenharmony_ci if (folio_test_large(folio)) 51762306a36Sopenharmony_ci continue; 51862306a36Sopenharmony_ci release_pte_folio(folio); 51962306a36Sopenharmony_ci } 52062306a36Sopenharmony_ci 52162306a36Sopenharmony_ci list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { 52262306a36Sopenharmony_ci list_del(&folio->lru); 52362306a36Sopenharmony_ci release_pte_folio(folio); 52462306a36Sopenharmony_ci } 52562306a36Sopenharmony_ci} 52662306a36Sopenharmony_ci 52762306a36Sopenharmony_cistatic bool is_refcount_suitable(struct page *page) 52862306a36Sopenharmony_ci{ 52962306a36Sopenharmony_ci int expected_refcount; 53062306a36Sopenharmony_ci 53162306a36Sopenharmony_ci expected_refcount = total_mapcount(page); 53262306a36Sopenharmony_ci if (PageSwapCache(page)) 53362306a36Sopenharmony_ci expected_refcount += compound_nr(page); 53462306a36Sopenharmony_ci 53562306a36Sopenharmony_ci return page_count(page) == expected_refcount; 53662306a36Sopenharmony_ci} 53762306a36Sopenharmony_ci 53862306a36Sopenharmony_cistatic int __collapse_huge_page_isolate(struct vm_area_struct *vma, 53962306a36Sopenharmony_ci unsigned long address, 54062306a36Sopenharmony_ci pte_t *pte, 54162306a36Sopenharmony_ci struct collapse_control *cc, 54262306a36Sopenharmony_ci struct list_head *compound_pagelist) 54362306a36Sopenharmony_ci{ 54462306a36Sopenharmony_ci struct page *page = NULL; 54562306a36Sopenharmony_ci pte_t *_pte; 54662306a36Sopenharmony_ci int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; 54762306a36Sopenharmony_ci bool writable = false; 54862306a36Sopenharmony_ci 54962306a36Sopenharmony_ci for (_pte = pte; _pte < pte + HPAGE_PMD_NR; 55062306a36Sopenharmony_ci _pte++, address += PAGE_SIZE) { 55162306a36Sopenharmony_ci pte_t pteval = ptep_get(_pte); 55262306a36Sopenharmony_ci if (pte_none(pteval) || (pte_present(pteval) 
&& 55362306a36Sopenharmony_ci is_zero_pfn(pte_pfn(pteval)))) { 55462306a36Sopenharmony_ci ++none_or_zero; 55562306a36Sopenharmony_ci if (!userfaultfd_armed(vma) && 55662306a36Sopenharmony_ci (!cc->is_khugepaged || 55762306a36Sopenharmony_ci none_or_zero <= khugepaged_max_ptes_none)) { 55862306a36Sopenharmony_ci continue; 55962306a36Sopenharmony_ci } else { 56062306a36Sopenharmony_ci result = SCAN_EXCEED_NONE_PTE; 56162306a36Sopenharmony_ci count_vm_event(THP_SCAN_EXCEED_NONE_PTE); 56262306a36Sopenharmony_ci goto out; 56362306a36Sopenharmony_ci } 56462306a36Sopenharmony_ci } 56562306a36Sopenharmony_ci if (!pte_present(pteval)) { 56662306a36Sopenharmony_ci result = SCAN_PTE_NON_PRESENT; 56762306a36Sopenharmony_ci goto out; 56862306a36Sopenharmony_ci } 56962306a36Sopenharmony_ci if (pte_uffd_wp(pteval)) { 57062306a36Sopenharmony_ci result = SCAN_PTE_UFFD_WP; 57162306a36Sopenharmony_ci goto out; 57262306a36Sopenharmony_ci } 57362306a36Sopenharmony_ci page = vm_normal_page(vma, address, pteval); 57462306a36Sopenharmony_ci if (unlikely(!page) || unlikely(is_zone_device_page(page))) { 57562306a36Sopenharmony_ci result = SCAN_PAGE_NULL; 57662306a36Sopenharmony_ci goto out; 57762306a36Sopenharmony_ci } 57862306a36Sopenharmony_ci 57962306a36Sopenharmony_ci VM_BUG_ON_PAGE(!PageAnon(page), page); 58062306a36Sopenharmony_ci 58162306a36Sopenharmony_ci if (page_mapcount(page) > 1) { 58262306a36Sopenharmony_ci ++shared; 58362306a36Sopenharmony_ci if (cc->is_khugepaged && 58462306a36Sopenharmony_ci shared > khugepaged_max_ptes_shared) { 58562306a36Sopenharmony_ci result = SCAN_EXCEED_SHARED_PTE; 58662306a36Sopenharmony_ci count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); 58762306a36Sopenharmony_ci goto out; 58862306a36Sopenharmony_ci } 58962306a36Sopenharmony_ci } 59062306a36Sopenharmony_ci 59162306a36Sopenharmony_ci if (PageCompound(page)) { 59262306a36Sopenharmony_ci struct page *p; 59362306a36Sopenharmony_ci page = compound_head(page); 59462306a36Sopenharmony_ci 
59562306a36Sopenharmony_ci /* 59662306a36Sopenharmony_ci * Check if we have dealt with the compound page 59762306a36Sopenharmony_ci * already 59862306a36Sopenharmony_ci */ 59962306a36Sopenharmony_ci list_for_each_entry(p, compound_pagelist, lru) { 60062306a36Sopenharmony_ci if (page == p) 60162306a36Sopenharmony_ci goto next; 60262306a36Sopenharmony_ci } 60362306a36Sopenharmony_ci } 60462306a36Sopenharmony_ci 60562306a36Sopenharmony_ci /* 60662306a36Sopenharmony_ci * We can do it before isolate_lru_page because the 60762306a36Sopenharmony_ci * page can't be freed from under us. NOTE: PG_lock 60862306a36Sopenharmony_ci * is needed to serialize against split_huge_page 60962306a36Sopenharmony_ci * when invoked from the VM. 61062306a36Sopenharmony_ci */ 61162306a36Sopenharmony_ci if (!trylock_page(page)) { 61262306a36Sopenharmony_ci result = SCAN_PAGE_LOCK; 61362306a36Sopenharmony_ci goto out; 61462306a36Sopenharmony_ci } 61562306a36Sopenharmony_ci 61662306a36Sopenharmony_ci /* 61762306a36Sopenharmony_ci * Check if the page has any GUP (or other external) pins. 61862306a36Sopenharmony_ci * 61962306a36Sopenharmony_ci * The page table that maps the page has been already unlinked 62062306a36Sopenharmony_ci * from the page table tree and this process cannot get 62162306a36Sopenharmony_ci * an additional pin on the page. 62262306a36Sopenharmony_ci * 62362306a36Sopenharmony_ci * New pins can come later if the page is shared across fork, 62462306a36Sopenharmony_ci * but not from this process. The other process cannot write to 62562306a36Sopenharmony_ci * the page, only trigger CoW. 
62662306a36Sopenharmony_ci */ 62762306a36Sopenharmony_ci if (!is_refcount_suitable(page)) { 62862306a36Sopenharmony_ci unlock_page(page); 62962306a36Sopenharmony_ci result = SCAN_PAGE_COUNT; 63062306a36Sopenharmony_ci goto out; 63162306a36Sopenharmony_ci } 63262306a36Sopenharmony_ci 63362306a36Sopenharmony_ci /* 63462306a36Sopenharmony_ci * Isolate the page to avoid collapsing an hugepage 63562306a36Sopenharmony_ci * currently in use by the VM. 63662306a36Sopenharmony_ci */ 63762306a36Sopenharmony_ci if (!isolate_lru_page(page)) { 63862306a36Sopenharmony_ci unlock_page(page); 63962306a36Sopenharmony_ci result = SCAN_DEL_PAGE_LRU; 64062306a36Sopenharmony_ci goto out; 64162306a36Sopenharmony_ci } 64262306a36Sopenharmony_ci mod_node_page_state(page_pgdat(page), 64362306a36Sopenharmony_ci NR_ISOLATED_ANON + page_is_file_lru(page), 64462306a36Sopenharmony_ci compound_nr(page)); 64562306a36Sopenharmony_ci VM_BUG_ON_PAGE(!PageLocked(page), page); 64662306a36Sopenharmony_ci VM_BUG_ON_PAGE(PageLRU(page), page); 64762306a36Sopenharmony_ci 64862306a36Sopenharmony_ci if (PageCompound(page)) 64962306a36Sopenharmony_ci list_add_tail(&page->lru, compound_pagelist); 65062306a36Sopenharmony_cinext: 65162306a36Sopenharmony_ci /* 65262306a36Sopenharmony_ci * If collapse was initiated by khugepaged, check that there is 65362306a36Sopenharmony_ci * enough young pte to justify collapsing the page 65462306a36Sopenharmony_ci */ 65562306a36Sopenharmony_ci if (cc->is_khugepaged && 65662306a36Sopenharmony_ci (pte_young(pteval) || page_is_young(page) || 65762306a36Sopenharmony_ci PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, 65862306a36Sopenharmony_ci address))) 65962306a36Sopenharmony_ci referenced++; 66062306a36Sopenharmony_ci 66162306a36Sopenharmony_ci if (pte_write(pteval)) 66262306a36Sopenharmony_ci writable = true; 66362306a36Sopenharmony_ci } 66462306a36Sopenharmony_ci 66562306a36Sopenharmony_ci if (unlikely(!writable)) { 66662306a36Sopenharmony_ci result = SCAN_PAGE_RO; 
66762306a36Sopenharmony_ci } else if (unlikely(cc->is_khugepaged && !referenced)) { 66862306a36Sopenharmony_ci result = SCAN_LACK_REFERENCED_PAGE; 66962306a36Sopenharmony_ci } else { 67062306a36Sopenharmony_ci result = SCAN_SUCCEED; 67162306a36Sopenharmony_ci trace_mm_collapse_huge_page_isolate(page, none_or_zero, 67262306a36Sopenharmony_ci referenced, writable, result); 67362306a36Sopenharmony_ci return result; 67462306a36Sopenharmony_ci } 67562306a36Sopenharmony_ciout: 67662306a36Sopenharmony_ci release_pte_pages(pte, _pte, compound_pagelist); 67762306a36Sopenharmony_ci trace_mm_collapse_huge_page_isolate(page, none_or_zero, 67862306a36Sopenharmony_ci referenced, writable, result); 67962306a36Sopenharmony_ci return result; 68062306a36Sopenharmony_ci} 68162306a36Sopenharmony_ci 68262306a36Sopenharmony_cistatic void __collapse_huge_page_copy_succeeded(pte_t *pte, 68362306a36Sopenharmony_ci struct vm_area_struct *vma, 68462306a36Sopenharmony_ci unsigned long address, 68562306a36Sopenharmony_ci spinlock_t *ptl, 68662306a36Sopenharmony_ci struct list_head *compound_pagelist) 68762306a36Sopenharmony_ci{ 68862306a36Sopenharmony_ci struct page *src_page; 68962306a36Sopenharmony_ci struct page *tmp; 69062306a36Sopenharmony_ci pte_t *_pte; 69162306a36Sopenharmony_ci pte_t pteval; 69262306a36Sopenharmony_ci 69362306a36Sopenharmony_ci for (_pte = pte; _pte < pte + HPAGE_PMD_NR; 69462306a36Sopenharmony_ci _pte++, address += PAGE_SIZE) { 69562306a36Sopenharmony_ci pteval = ptep_get(_pte); 69662306a36Sopenharmony_ci if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 69762306a36Sopenharmony_ci add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); 69862306a36Sopenharmony_ci if (is_zero_pfn(pte_pfn(pteval))) { 69962306a36Sopenharmony_ci /* 70062306a36Sopenharmony_ci * ptl mostly unnecessary. 
70162306a36Sopenharmony_ci */ 70262306a36Sopenharmony_ci spin_lock(ptl); 70362306a36Sopenharmony_ci ptep_clear(vma->vm_mm, address, _pte); 70462306a36Sopenharmony_ci spin_unlock(ptl); 70562306a36Sopenharmony_ci ksm_might_unmap_zero_page(vma->vm_mm, pteval); 70662306a36Sopenharmony_ci } 70762306a36Sopenharmony_ci } else { 70862306a36Sopenharmony_ci src_page = pte_page(pteval); 70962306a36Sopenharmony_ci if (!PageCompound(src_page)) 71062306a36Sopenharmony_ci release_pte_page(src_page); 71162306a36Sopenharmony_ci /* 71262306a36Sopenharmony_ci * ptl mostly unnecessary, but preempt has to 71362306a36Sopenharmony_ci * be disabled to update the per-cpu stats 71462306a36Sopenharmony_ci * inside page_remove_rmap(). 71562306a36Sopenharmony_ci */ 71662306a36Sopenharmony_ci spin_lock(ptl); 71762306a36Sopenharmony_ci ptep_clear(vma->vm_mm, address, _pte); 71862306a36Sopenharmony_ci page_remove_rmap(src_page, vma, false); 71962306a36Sopenharmony_ci spin_unlock(ptl); 72062306a36Sopenharmony_ci free_page_and_swap_cache(src_page); 72162306a36Sopenharmony_ci } 72262306a36Sopenharmony_ci } 72362306a36Sopenharmony_ci 72462306a36Sopenharmony_ci list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { 72562306a36Sopenharmony_ci list_del(&src_page->lru); 72662306a36Sopenharmony_ci mod_node_page_state(page_pgdat(src_page), 72762306a36Sopenharmony_ci NR_ISOLATED_ANON + page_is_file_lru(src_page), 72862306a36Sopenharmony_ci -compound_nr(src_page)); 72962306a36Sopenharmony_ci unlock_page(src_page); 73062306a36Sopenharmony_ci free_swap_cache(src_page); 73162306a36Sopenharmony_ci putback_lru_page(src_page); 73262306a36Sopenharmony_ci } 73362306a36Sopenharmony_ci} 73462306a36Sopenharmony_ci 73562306a36Sopenharmony_cistatic void __collapse_huge_page_copy_failed(pte_t *pte, 73662306a36Sopenharmony_ci pmd_t *pmd, 73762306a36Sopenharmony_ci pmd_t orig_pmd, 73862306a36Sopenharmony_ci struct vm_area_struct *vma, 73962306a36Sopenharmony_ci struct list_head *compound_pagelist) 
74062306a36Sopenharmony_ci{ 74162306a36Sopenharmony_ci spinlock_t *pmd_ptl; 74262306a36Sopenharmony_ci 74362306a36Sopenharmony_ci /* 74462306a36Sopenharmony_ci * Re-establish the PMD to point to the original page table 74562306a36Sopenharmony_ci * entry. Restoring PMD needs to be done prior to releasing 74662306a36Sopenharmony_ci * pages. Since pages are still isolated and locked here, 74762306a36Sopenharmony_ci * acquiring anon_vma_lock_write is unnecessary. 74862306a36Sopenharmony_ci */ 74962306a36Sopenharmony_ci pmd_ptl = pmd_lock(vma->vm_mm, pmd); 75062306a36Sopenharmony_ci pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd)); 75162306a36Sopenharmony_ci spin_unlock(pmd_ptl); 75262306a36Sopenharmony_ci /* 75362306a36Sopenharmony_ci * Release both raw and compound pages isolated 75462306a36Sopenharmony_ci * in __collapse_huge_page_isolate. 75562306a36Sopenharmony_ci */ 75662306a36Sopenharmony_ci release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist); 75762306a36Sopenharmony_ci} 75862306a36Sopenharmony_ci 75962306a36Sopenharmony_ci/* 76062306a36Sopenharmony_ci * __collapse_huge_page_copy - attempts to copy memory contents from raw 76162306a36Sopenharmony_ci * pages to a hugepage. Cleans up the raw pages if copying succeeds; 76262306a36Sopenharmony_ci * otherwise restores the original page table and releases isolated raw pages. 76362306a36Sopenharmony_ci * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. 
76462306a36Sopenharmony_ci * 76562306a36Sopenharmony_ci * @pte: starting of the PTEs to copy from 76662306a36Sopenharmony_ci * @page: the new hugepage to copy contents to 76762306a36Sopenharmony_ci * @pmd: pointer to the new hugepage's PMD 76862306a36Sopenharmony_ci * @orig_pmd: the original raw pages' PMD 76962306a36Sopenharmony_ci * @vma: the original raw pages' virtual memory area 77062306a36Sopenharmony_ci * @address: starting address to copy 77162306a36Sopenharmony_ci * @ptl: lock on raw pages' PTEs 77262306a36Sopenharmony_ci * @compound_pagelist: list that stores compound pages 77362306a36Sopenharmony_ci */ 77462306a36Sopenharmony_cistatic int __collapse_huge_page_copy(pte_t *pte, 77562306a36Sopenharmony_ci struct page *page, 77662306a36Sopenharmony_ci pmd_t *pmd, 77762306a36Sopenharmony_ci pmd_t orig_pmd, 77862306a36Sopenharmony_ci struct vm_area_struct *vma, 77962306a36Sopenharmony_ci unsigned long address, 78062306a36Sopenharmony_ci spinlock_t *ptl, 78162306a36Sopenharmony_ci struct list_head *compound_pagelist) 78262306a36Sopenharmony_ci{ 78362306a36Sopenharmony_ci struct page *src_page; 78462306a36Sopenharmony_ci pte_t *_pte; 78562306a36Sopenharmony_ci pte_t pteval; 78662306a36Sopenharmony_ci unsigned long _address; 78762306a36Sopenharmony_ci int result = SCAN_SUCCEED; 78862306a36Sopenharmony_ci 78962306a36Sopenharmony_ci /* 79062306a36Sopenharmony_ci * Copying pages' contents is subject to memory poison at any iteration. 
79162306a36Sopenharmony_ci */ 79262306a36Sopenharmony_ci for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; 79362306a36Sopenharmony_ci _pte++, page++, _address += PAGE_SIZE) { 79462306a36Sopenharmony_ci pteval = ptep_get(_pte); 79562306a36Sopenharmony_ci if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 79662306a36Sopenharmony_ci clear_user_highpage(page, _address); 79762306a36Sopenharmony_ci continue; 79862306a36Sopenharmony_ci } 79962306a36Sopenharmony_ci src_page = pte_page(pteval); 80062306a36Sopenharmony_ci if (copy_mc_user_highpage(page, src_page, _address, vma) > 0) { 80162306a36Sopenharmony_ci result = SCAN_COPY_MC; 80262306a36Sopenharmony_ci break; 80362306a36Sopenharmony_ci } 80462306a36Sopenharmony_ci } 80562306a36Sopenharmony_ci 80662306a36Sopenharmony_ci if (likely(result == SCAN_SUCCEED)) 80762306a36Sopenharmony_ci __collapse_huge_page_copy_succeeded(pte, vma, address, ptl, 80862306a36Sopenharmony_ci compound_pagelist); 80962306a36Sopenharmony_ci else 81062306a36Sopenharmony_ci __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma, 81162306a36Sopenharmony_ci compound_pagelist); 81262306a36Sopenharmony_ci 81362306a36Sopenharmony_ci return result; 81462306a36Sopenharmony_ci} 81562306a36Sopenharmony_ci 81662306a36Sopenharmony_cistatic void khugepaged_alloc_sleep(void) 81762306a36Sopenharmony_ci{ 81862306a36Sopenharmony_ci DEFINE_WAIT(wait); 81962306a36Sopenharmony_ci 82062306a36Sopenharmony_ci add_wait_queue(&khugepaged_wait, &wait); 82162306a36Sopenharmony_ci __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 82262306a36Sopenharmony_ci schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 82362306a36Sopenharmony_ci remove_wait_queue(&khugepaged_wait, &wait); 82462306a36Sopenharmony_ci} 82562306a36Sopenharmony_ci 82662306a36Sopenharmony_cistruct collapse_control khugepaged_collapse_control = { 82762306a36Sopenharmony_ci .is_khugepaged = true, 82862306a36Sopenharmony_ci}; 82962306a36Sopenharmony_ci 
83062306a36Sopenharmony_cistatic bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) 83162306a36Sopenharmony_ci{ 83262306a36Sopenharmony_ci int i; 83362306a36Sopenharmony_ci 83462306a36Sopenharmony_ci /* 83562306a36Sopenharmony_ci * If node_reclaim_mode is disabled, then no extra effort is made to 83662306a36Sopenharmony_ci * allocate memory locally. 83762306a36Sopenharmony_ci */ 83862306a36Sopenharmony_ci if (!node_reclaim_enabled()) 83962306a36Sopenharmony_ci return false; 84062306a36Sopenharmony_ci 84162306a36Sopenharmony_ci /* If there is a count for this node already, it must be acceptable */ 84262306a36Sopenharmony_ci if (cc->node_load[nid]) 84362306a36Sopenharmony_ci return false; 84462306a36Sopenharmony_ci 84562306a36Sopenharmony_ci for (i = 0; i < MAX_NUMNODES; i++) { 84662306a36Sopenharmony_ci if (!cc->node_load[i]) 84762306a36Sopenharmony_ci continue; 84862306a36Sopenharmony_ci if (node_distance(nid, i) > node_reclaim_distance) 84962306a36Sopenharmony_ci return true; 85062306a36Sopenharmony_ci } 85162306a36Sopenharmony_ci return false; 85262306a36Sopenharmony_ci} 85362306a36Sopenharmony_ci 85462306a36Sopenharmony_ci#define khugepaged_defrag() \ 85562306a36Sopenharmony_ci (transparent_hugepage_flags & \ 85662306a36Sopenharmony_ci (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) 85762306a36Sopenharmony_ci 85862306a36Sopenharmony_ci/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ 85962306a36Sopenharmony_cistatic inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) 86062306a36Sopenharmony_ci{ 86162306a36Sopenharmony_ci return khugepaged_defrag() ? 
GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT; 86262306a36Sopenharmony_ci} 86362306a36Sopenharmony_ci 86462306a36Sopenharmony_ci#ifdef CONFIG_NUMA 86562306a36Sopenharmony_cistatic int hpage_collapse_find_target_node(struct collapse_control *cc) 86662306a36Sopenharmony_ci{ 86762306a36Sopenharmony_ci int nid, target_node = 0, max_value = 0; 86862306a36Sopenharmony_ci 86962306a36Sopenharmony_ci /* find first node with max normal pages hit */ 87062306a36Sopenharmony_ci for (nid = 0; nid < MAX_NUMNODES; nid++) 87162306a36Sopenharmony_ci if (cc->node_load[nid] > max_value) { 87262306a36Sopenharmony_ci max_value = cc->node_load[nid]; 87362306a36Sopenharmony_ci target_node = nid; 87462306a36Sopenharmony_ci } 87562306a36Sopenharmony_ci 87662306a36Sopenharmony_ci for_each_online_node(nid) { 87762306a36Sopenharmony_ci if (max_value == cc->node_load[nid]) 87862306a36Sopenharmony_ci node_set(nid, cc->alloc_nmask); 87962306a36Sopenharmony_ci } 88062306a36Sopenharmony_ci 88162306a36Sopenharmony_ci return target_node; 88262306a36Sopenharmony_ci} 88362306a36Sopenharmony_ci#else 88462306a36Sopenharmony_cistatic int hpage_collapse_find_target_node(struct collapse_control *cc) 88562306a36Sopenharmony_ci{ 88662306a36Sopenharmony_ci return 0; 88762306a36Sopenharmony_ci} 88862306a36Sopenharmony_ci#endif 88962306a36Sopenharmony_ci 89062306a36Sopenharmony_cistatic bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node, 89162306a36Sopenharmony_ci nodemask_t *nmask) 89262306a36Sopenharmony_ci{ 89362306a36Sopenharmony_ci *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask); 89462306a36Sopenharmony_ci if (unlikely(!*hpage)) { 89562306a36Sopenharmony_ci count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 89662306a36Sopenharmony_ci return false; 89762306a36Sopenharmony_ci } 89862306a36Sopenharmony_ci 89962306a36Sopenharmony_ci folio_prep_large_rmappable((struct folio *)*hpage); 90062306a36Sopenharmony_ci count_vm_event(THP_COLLAPSE_ALLOC); 90162306a36Sopenharmony_ci return true; 
90262306a36Sopenharmony_ci} 90362306a36Sopenharmony_ci 90462306a36Sopenharmony_ci/* 90562306a36Sopenharmony_ci * If mmap_lock temporarily dropped, revalidate vma 90662306a36Sopenharmony_ci * before taking mmap_lock. 90762306a36Sopenharmony_ci * Returns enum scan_result value. 90862306a36Sopenharmony_ci */ 90962306a36Sopenharmony_ci 91062306a36Sopenharmony_cistatic int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, 91162306a36Sopenharmony_ci bool expect_anon, 91262306a36Sopenharmony_ci struct vm_area_struct **vmap, 91362306a36Sopenharmony_ci struct collapse_control *cc) 91462306a36Sopenharmony_ci{ 91562306a36Sopenharmony_ci struct vm_area_struct *vma; 91662306a36Sopenharmony_ci 91762306a36Sopenharmony_ci if (unlikely(hpage_collapse_test_exit(mm))) 91862306a36Sopenharmony_ci return SCAN_ANY_PROCESS; 91962306a36Sopenharmony_ci 92062306a36Sopenharmony_ci *vmap = vma = find_vma(mm, address); 92162306a36Sopenharmony_ci if (!vma) 92262306a36Sopenharmony_ci return SCAN_VMA_NULL; 92362306a36Sopenharmony_ci 92462306a36Sopenharmony_ci if (!transhuge_vma_suitable(vma, address)) 92562306a36Sopenharmony_ci return SCAN_ADDRESS_RANGE; 92662306a36Sopenharmony_ci if (!hugepage_vma_check(vma, vma->vm_flags, false, false, 92762306a36Sopenharmony_ci cc->is_khugepaged)) 92862306a36Sopenharmony_ci return SCAN_VMA_CHECK; 92962306a36Sopenharmony_ci /* 93062306a36Sopenharmony_ci * Anon VMA expected, the address may be unmapped then 93162306a36Sopenharmony_ci * remapped to file after khugepaged reaquired the mmap_lock. 93262306a36Sopenharmony_ci * 93362306a36Sopenharmony_ci * hugepage_vma_check may return true for qualified file 93462306a36Sopenharmony_ci * vmas. 
93562306a36Sopenharmony_ci */ 93662306a36Sopenharmony_ci if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) 93762306a36Sopenharmony_ci return SCAN_PAGE_ANON; 93862306a36Sopenharmony_ci return SCAN_SUCCEED; 93962306a36Sopenharmony_ci} 94062306a36Sopenharmony_ci 94162306a36Sopenharmony_cistatic int find_pmd_or_thp_or_none(struct mm_struct *mm, 94262306a36Sopenharmony_ci unsigned long address, 94362306a36Sopenharmony_ci pmd_t **pmd) 94462306a36Sopenharmony_ci{ 94562306a36Sopenharmony_ci pmd_t pmde; 94662306a36Sopenharmony_ci 94762306a36Sopenharmony_ci *pmd = mm_find_pmd(mm, address); 94862306a36Sopenharmony_ci if (!*pmd) 94962306a36Sopenharmony_ci return SCAN_PMD_NULL; 95062306a36Sopenharmony_ci 95162306a36Sopenharmony_ci pmde = pmdp_get_lockless(*pmd); 95262306a36Sopenharmony_ci if (pmd_none(pmde)) 95362306a36Sopenharmony_ci return SCAN_PMD_NONE; 95462306a36Sopenharmony_ci if (!pmd_present(pmde)) 95562306a36Sopenharmony_ci return SCAN_PMD_NULL; 95662306a36Sopenharmony_ci if (pmd_trans_huge(pmde)) 95762306a36Sopenharmony_ci return SCAN_PMD_MAPPED; 95862306a36Sopenharmony_ci if (pmd_devmap(pmde)) 95962306a36Sopenharmony_ci return SCAN_PMD_NULL; 96062306a36Sopenharmony_ci if (pmd_bad(pmde)) 96162306a36Sopenharmony_ci return SCAN_PMD_NULL; 96262306a36Sopenharmony_ci return SCAN_SUCCEED; 96362306a36Sopenharmony_ci} 96462306a36Sopenharmony_ci 96562306a36Sopenharmony_cistatic int check_pmd_still_valid(struct mm_struct *mm, 96662306a36Sopenharmony_ci unsigned long address, 96762306a36Sopenharmony_ci pmd_t *pmd) 96862306a36Sopenharmony_ci{ 96962306a36Sopenharmony_ci pmd_t *new_pmd; 97062306a36Sopenharmony_ci int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); 97162306a36Sopenharmony_ci 97262306a36Sopenharmony_ci if (result != SCAN_SUCCEED) 97362306a36Sopenharmony_ci return result; 97462306a36Sopenharmony_ci if (new_pmd != pmd) 97562306a36Sopenharmony_ci return SCAN_FAIL; 97662306a36Sopenharmony_ci return SCAN_SUCCEED; 97762306a36Sopenharmony_ci} 
97862306a36Sopenharmony_ci 97962306a36Sopenharmony_ci/* 98062306a36Sopenharmony_ci * Bring missing pages in from swap, to complete THP collapse. 98162306a36Sopenharmony_ci * Only done if hpage_collapse_scan_pmd believes it is worthwhile. 98262306a36Sopenharmony_ci * 98362306a36Sopenharmony_ci * Called and returns without pte mapped or spinlocks held. 98462306a36Sopenharmony_ci * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. 98562306a36Sopenharmony_ci */ 98662306a36Sopenharmony_cistatic int __collapse_huge_page_swapin(struct mm_struct *mm, 98762306a36Sopenharmony_ci struct vm_area_struct *vma, 98862306a36Sopenharmony_ci unsigned long haddr, pmd_t *pmd, 98962306a36Sopenharmony_ci int referenced) 99062306a36Sopenharmony_ci{ 99162306a36Sopenharmony_ci int swapped_in = 0; 99262306a36Sopenharmony_ci vm_fault_t ret = 0; 99362306a36Sopenharmony_ci unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE); 99462306a36Sopenharmony_ci int result; 99562306a36Sopenharmony_ci pte_t *pte = NULL; 99662306a36Sopenharmony_ci spinlock_t *ptl; 99762306a36Sopenharmony_ci 99862306a36Sopenharmony_ci for (address = haddr; address < end; address += PAGE_SIZE) { 99962306a36Sopenharmony_ci struct vm_fault vmf = { 100062306a36Sopenharmony_ci .vma = vma, 100162306a36Sopenharmony_ci .address = address, 100262306a36Sopenharmony_ci .pgoff = linear_page_index(vma, address), 100362306a36Sopenharmony_ci .flags = FAULT_FLAG_ALLOW_RETRY, 100462306a36Sopenharmony_ci .pmd = pmd, 100562306a36Sopenharmony_ci }; 100662306a36Sopenharmony_ci 100762306a36Sopenharmony_ci if (!pte++) { 100862306a36Sopenharmony_ci pte = pte_offset_map_nolock(mm, pmd, address, &ptl); 100962306a36Sopenharmony_ci if (!pte) { 101062306a36Sopenharmony_ci mmap_read_unlock(mm); 101162306a36Sopenharmony_ci result = SCAN_PMD_NULL; 101262306a36Sopenharmony_ci goto out; 101362306a36Sopenharmony_ci } 101462306a36Sopenharmony_ci } 101562306a36Sopenharmony_ci 101662306a36Sopenharmony_ci vmf.orig_pte = 
ptep_get_lockless(pte); 101762306a36Sopenharmony_ci if (!is_swap_pte(vmf.orig_pte)) 101862306a36Sopenharmony_ci continue; 101962306a36Sopenharmony_ci 102062306a36Sopenharmony_ci vmf.pte = pte; 102162306a36Sopenharmony_ci vmf.ptl = ptl; 102262306a36Sopenharmony_ci ret = do_swap_page(&vmf); 102362306a36Sopenharmony_ci /* Which unmaps pte (after perhaps re-checking the entry) */ 102462306a36Sopenharmony_ci pte = NULL; 102562306a36Sopenharmony_ci 102662306a36Sopenharmony_ci /* 102762306a36Sopenharmony_ci * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. 102862306a36Sopenharmony_ci * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because 102962306a36Sopenharmony_ci * we do not retry here and swap entry will remain in pagetable 103062306a36Sopenharmony_ci * resulting in later failure. 103162306a36Sopenharmony_ci */ 103262306a36Sopenharmony_ci if (ret & VM_FAULT_RETRY) { 103362306a36Sopenharmony_ci /* Likely, but not guaranteed, that page lock failed */ 103462306a36Sopenharmony_ci result = SCAN_PAGE_LOCK; 103562306a36Sopenharmony_ci goto out; 103662306a36Sopenharmony_ci } 103762306a36Sopenharmony_ci if (ret & VM_FAULT_ERROR) { 103862306a36Sopenharmony_ci mmap_read_unlock(mm); 103962306a36Sopenharmony_ci result = SCAN_FAIL; 104062306a36Sopenharmony_ci goto out; 104162306a36Sopenharmony_ci } 104262306a36Sopenharmony_ci swapped_in++; 104362306a36Sopenharmony_ci } 104462306a36Sopenharmony_ci 104562306a36Sopenharmony_ci if (pte) 104662306a36Sopenharmony_ci pte_unmap(pte); 104762306a36Sopenharmony_ci 104862306a36Sopenharmony_ci /* Drain LRU cache to remove extra pin on the swapped in pages */ 104962306a36Sopenharmony_ci if (swapped_in) 105062306a36Sopenharmony_ci lru_add_drain(); 105162306a36Sopenharmony_ci 105262306a36Sopenharmony_ci result = SCAN_SUCCEED; 105362306a36Sopenharmony_ciout: 105462306a36Sopenharmony_ci trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result); 105562306a36Sopenharmony_ci return result; 105662306a36Sopenharmony_ci} 
105762306a36Sopenharmony_ci 105862306a36Sopenharmony_cistatic int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, 105962306a36Sopenharmony_ci struct collapse_control *cc) 106062306a36Sopenharmony_ci{ 106162306a36Sopenharmony_ci gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : 106262306a36Sopenharmony_ci GFP_TRANSHUGE); 106362306a36Sopenharmony_ci int node = hpage_collapse_find_target_node(cc); 106462306a36Sopenharmony_ci struct folio *folio; 106562306a36Sopenharmony_ci 106662306a36Sopenharmony_ci if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask)) 106762306a36Sopenharmony_ci return SCAN_ALLOC_HUGE_PAGE_FAIL; 106862306a36Sopenharmony_ci 106962306a36Sopenharmony_ci folio = page_folio(*hpage); 107062306a36Sopenharmony_ci if (unlikely(mem_cgroup_charge(folio, mm, gfp))) { 107162306a36Sopenharmony_ci folio_put(folio); 107262306a36Sopenharmony_ci *hpage = NULL; 107362306a36Sopenharmony_ci return SCAN_CGROUP_CHARGE_FAIL; 107462306a36Sopenharmony_ci } 107562306a36Sopenharmony_ci count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC); 107662306a36Sopenharmony_ci 107762306a36Sopenharmony_ci return SCAN_SUCCEED; 107862306a36Sopenharmony_ci} 107962306a36Sopenharmony_ci 108062306a36Sopenharmony_cistatic int collapse_huge_page(struct mm_struct *mm, unsigned long address, 108162306a36Sopenharmony_ci int referenced, int unmapped, 108262306a36Sopenharmony_ci struct collapse_control *cc) 108362306a36Sopenharmony_ci{ 108462306a36Sopenharmony_ci LIST_HEAD(compound_pagelist); 108562306a36Sopenharmony_ci pmd_t *pmd, _pmd; 108662306a36Sopenharmony_ci pte_t *pte; 108762306a36Sopenharmony_ci pgtable_t pgtable; 108862306a36Sopenharmony_ci struct page *hpage; 108962306a36Sopenharmony_ci spinlock_t *pmd_ptl, *pte_ptl; 109062306a36Sopenharmony_ci int result = SCAN_FAIL; 109162306a36Sopenharmony_ci struct vm_area_struct *vma; 109262306a36Sopenharmony_ci struct mmu_notifier_range range; 109362306a36Sopenharmony_ci 109462306a36Sopenharmony_ci 
VM_BUG_ON(address & ~HPAGE_PMD_MASK); 109562306a36Sopenharmony_ci 109662306a36Sopenharmony_ci /* 109762306a36Sopenharmony_ci * Before allocating the hugepage, release the mmap_lock read lock. 109862306a36Sopenharmony_ci * The allocation can take potentially a long time if it involves 109962306a36Sopenharmony_ci * sync compaction, and we do not need to hold the mmap_lock during 110062306a36Sopenharmony_ci * that. We will recheck the vma after taking it again in write mode. 110162306a36Sopenharmony_ci */ 110262306a36Sopenharmony_ci mmap_read_unlock(mm); 110362306a36Sopenharmony_ci 110462306a36Sopenharmony_ci result = alloc_charge_hpage(&hpage, mm, cc); 110562306a36Sopenharmony_ci if (result != SCAN_SUCCEED) 110662306a36Sopenharmony_ci goto out_nolock; 110762306a36Sopenharmony_ci 110862306a36Sopenharmony_ci mmap_read_lock(mm); 110962306a36Sopenharmony_ci result = hugepage_vma_revalidate(mm, address, true, &vma, cc); 111062306a36Sopenharmony_ci if (result != SCAN_SUCCEED) { 111162306a36Sopenharmony_ci mmap_read_unlock(mm); 111262306a36Sopenharmony_ci goto out_nolock; 111362306a36Sopenharmony_ci } 111462306a36Sopenharmony_ci 111562306a36Sopenharmony_ci result = find_pmd_or_thp_or_none(mm, address, &pmd); 111662306a36Sopenharmony_ci if (result != SCAN_SUCCEED) { 111762306a36Sopenharmony_ci mmap_read_unlock(mm); 111862306a36Sopenharmony_ci goto out_nolock; 111962306a36Sopenharmony_ci } 112062306a36Sopenharmony_ci 112162306a36Sopenharmony_ci if (unmapped) { 112262306a36Sopenharmony_ci /* 112362306a36Sopenharmony_ci * __collapse_huge_page_swapin will return with mmap_lock 112462306a36Sopenharmony_ci * released when it fails. So we jump out_nolock directly in 112562306a36Sopenharmony_ci * that case. Continuing to collapse causes inconsistency. 
112662306a36Sopenharmony_ci */ 112762306a36Sopenharmony_ci result = __collapse_huge_page_swapin(mm, vma, address, pmd, 112862306a36Sopenharmony_ci referenced); 112962306a36Sopenharmony_ci if (result != SCAN_SUCCEED) 113062306a36Sopenharmony_ci goto out_nolock; 113162306a36Sopenharmony_ci } 113262306a36Sopenharmony_ci 113362306a36Sopenharmony_ci mmap_read_unlock(mm); 113462306a36Sopenharmony_ci /* 113562306a36Sopenharmony_ci * Prevent all access to pagetables with the exception of 113662306a36Sopenharmony_ci * gup_fast later handled by the ptep_clear_flush and the VM 113762306a36Sopenharmony_ci * handled by the anon_vma lock + PG_lock. 113862306a36Sopenharmony_ci */ 113962306a36Sopenharmony_ci mmap_write_lock(mm); 114062306a36Sopenharmony_ci result = hugepage_vma_revalidate(mm, address, true, &vma, cc); 114162306a36Sopenharmony_ci if (result != SCAN_SUCCEED) 114262306a36Sopenharmony_ci goto out_up_write; 114362306a36Sopenharmony_ci /* check if the pmd is still valid */ 114462306a36Sopenharmony_ci result = check_pmd_still_valid(mm, address, pmd); 114562306a36Sopenharmony_ci if (result != SCAN_SUCCEED) 114662306a36Sopenharmony_ci goto out_up_write; 114762306a36Sopenharmony_ci 114862306a36Sopenharmony_ci vma_start_write(vma); 114962306a36Sopenharmony_ci anon_vma_lock_write(vma->anon_vma); 115062306a36Sopenharmony_ci 115162306a36Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, 115262306a36Sopenharmony_ci address + HPAGE_PMD_SIZE); 115362306a36Sopenharmony_ci mmu_notifier_invalidate_range_start(&range); 115462306a36Sopenharmony_ci 115562306a36Sopenharmony_ci pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ 115662306a36Sopenharmony_ci /* 115762306a36Sopenharmony_ci * This removes any huge TLB entry from the CPU so we won't allow 115862306a36Sopenharmony_ci * huge and small TLB entries for the same virtual address to 115962306a36Sopenharmony_ci * avoid the risk of CPU bugs in that area. 
116062306a36Sopenharmony_ci * 116162306a36Sopenharmony_ci * Parallel fast GUP is fine since fast GUP will back off when 116262306a36Sopenharmony_ci * it detects PMD is changed. 116362306a36Sopenharmony_ci */ 116462306a36Sopenharmony_ci _pmd = pmdp_collapse_flush(vma, address, pmd); 116562306a36Sopenharmony_ci spin_unlock(pmd_ptl); 116662306a36Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 116762306a36Sopenharmony_ci tlb_remove_table_sync_one(); 116862306a36Sopenharmony_ci 116962306a36Sopenharmony_ci pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl); 117062306a36Sopenharmony_ci if (pte) { 117162306a36Sopenharmony_ci result = __collapse_huge_page_isolate(vma, address, pte, cc, 117262306a36Sopenharmony_ci &compound_pagelist); 117362306a36Sopenharmony_ci spin_unlock(pte_ptl); 117462306a36Sopenharmony_ci } else { 117562306a36Sopenharmony_ci result = SCAN_PMD_NULL; 117662306a36Sopenharmony_ci } 117762306a36Sopenharmony_ci 117862306a36Sopenharmony_ci if (unlikely(result != SCAN_SUCCEED)) { 117962306a36Sopenharmony_ci if (pte) 118062306a36Sopenharmony_ci pte_unmap(pte); 118162306a36Sopenharmony_ci spin_lock(pmd_ptl); 118262306a36Sopenharmony_ci BUG_ON(!pmd_none(*pmd)); 118362306a36Sopenharmony_ci /* 118462306a36Sopenharmony_ci * We can only use set_pmd_at when establishing 118562306a36Sopenharmony_ci * hugepmds and never for establishing regular pmds that 118662306a36Sopenharmony_ci * points to regular pagetables. Use pmd_populate for that 118762306a36Sopenharmony_ci */ 118862306a36Sopenharmony_ci pmd_populate(mm, pmd, pmd_pgtable(_pmd)); 118962306a36Sopenharmony_ci spin_unlock(pmd_ptl); 119062306a36Sopenharmony_ci anon_vma_unlock_write(vma->anon_vma); 119162306a36Sopenharmony_ci goto out_up_write; 119262306a36Sopenharmony_ci } 119362306a36Sopenharmony_ci 119462306a36Sopenharmony_ci /* 119562306a36Sopenharmony_ci * All pages are isolated and locked so anon_vma rmap 119662306a36Sopenharmony_ci * can't run anymore. 
119762306a36Sopenharmony_ci */ 119862306a36Sopenharmony_ci anon_vma_unlock_write(vma->anon_vma); 119962306a36Sopenharmony_ci 120062306a36Sopenharmony_ci result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd, 120162306a36Sopenharmony_ci vma, address, pte_ptl, 120262306a36Sopenharmony_ci &compound_pagelist); 120362306a36Sopenharmony_ci pte_unmap(pte); 120462306a36Sopenharmony_ci if (unlikely(result != SCAN_SUCCEED)) 120562306a36Sopenharmony_ci goto out_up_write; 120662306a36Sopenharmony_ci 120762306a36Sopenharmony_ci /* 120862306a36Sopenharmony_ci * spin_lock() below is not the equivalent of smp_wmb(), but 120962306a36Sopenharmony_ci * the smp_wmb() inside __SetPageUptodate() can be reused to 121062306a36Sopenharmony_ci * avoid the copy_huge_page writes to become visible after 121162306a36Sopenharmony_ci * the set_pmd_at() write. 121262306a36Sopenharmony_ci */ 121362306a36Sopenharmony_ci __SetPageUptodate(hpage); 121462306a36Sopenharmony_ci pgtable = pmd_pgtable(_pmd); 121562306a36Sopenharmony_ci 121662306a36Sopenharmony_ci _pmd = mk_huge_pmd(hpage, vma->vm_page_prot); 121762306a36Sopenharmony_ci _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); 121862306a36Sopenharmony_ci 121962306a36Sopenharmony_ci spin_lock(pmd_ptl); 122062306a36Sopenharmony_ci BUG_ON(!pmd_none(*pmd)); 122162306a36Sopenharmony_ci page_add_new_anon_rmap(hpage, vma, address); 122262306a36Sopenharmony_ci lru_cache_add_inactive_or_unevictable(hpage, vma); 122362306a36Sopenharmony_ci pgtable_trans_huge_deposit(mm, pmd, pgtable); 122462306a36Sopenharmony_ci set_pmd_at(mm, address, pmd, _pmd); 122562306a36Sopenharmony_ci update_mmu_cache_pmd(vma, address, pmd); 122662306a36Sopenharmony_ci spin_unlock(pmd_ptl); 122762306a36Sopenharmony_ci 122862306a36Sopenharmony_ci hpage = NULL; 122962306a36Sopenharmony_ci 123062306a36Sopenharmony_ci result = SCAN_SUCCEED; 123162306a36Sopenharmony_ciout_up_write: 123262306a36Sopenharmony_ci mmap_write_unlock(mm); 123362306a36Sopenharmony_ciout_nolock: 
123462306a36Sopenharmony_ci if (hpage) 123562306a36Sopenharmony_ci put_page(hpage); 123662306a36Sopenharmony_ci trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); 123762306a36Sopenharmony_ci return result; 123862306a36Sopenharmony_ci} 123962306a36Sopenharmony_ci 124062306a36Sopenharmony_cistatic int hpage_collapse_scan_pmd(struct mm_struct *mm, 124162306a36Sopenharmony_ci struct vm_area_struct *vma, 124262306a36Sopenharmony_ci unsigned long address, bool *mmap_locked, 124362306a36Sopenharmony_ci struct collapse_control *cc) 124462306a36Sopenharmony_ci{ 124562306a36Sopenharmony_ci pmd_t *pmd; 124662306a36Sopenharmony_ci pte_t *pte, *_pte; 124762306a36Sopenharmony_ci int result = SCAN_FAIL, referenced = 0; 124862306a36Sopenharmony_ci int none_or_zero = 0, shared = 0; 124962306a36Sopenharmony_ci struct page *page = NULL; 125062306a36Sopenharmony_ci unsigned long _address; 125162306a36Sopenharmony_ci spinlock_t *ptl; 125262306a36Sopenharmony_ci int node = NUMA_NO_NODE, unmapped = 0; 125362306a36Sopenharmony_ci bool writable = false; 125462306a36Sopenharmony_ci 125562306a36Sopenharmony_ci VM_BUG_ON(address & ~HPAGE_PMD_MASK); 125662306a36Sopenharmony_ci 125762306a36Sopenharmony_ci result = find_pmd_or_thp_or_none(mm, address, &pmd); 125862306a36Sopenharmony_ci if (result != SCAN_SUCCEED) 125962306a36Sopenharmony_ci goto out; 126062306a36Sopenharmony_ci 126162306a36Sopenharmony_ci memset(cc->node_load, 0, sizeof(cc->node_load)); 126262306a36Sopenharmony_ci nodes_clear(cc->alloc_nmask); 126362306a36Sopenharmony_ci pte = pte_offset_map_lock(mm, pmd, address, &ptl); 126462306a36Sopenharmony_ci if (!pte) { 126562306a36Sopenharmony_ci result = SCAN_PMD_NULL; 126662306a36Sopenharmony_ci goto out; 126762306a36Sopenharmony_ci } 126862306a36Sopenharmony_ci 126962306a36Sopenharmony_ci for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; 127062306a36Sopenharmony_ci _pte++, _address += PAGE_SIZE) { 127162306a36Sopenharmony_ci pte_t pteval = ptep_get(_pte); 
127262306a36Sopenharmony_ci if (is_swap_pte(pteval)) { 127362306a36Sopenharmony_ci ++unmapped; 127462306a36Sopenharmony_ci if (!cc->is_khugepaged || 127562306a36Sopenharmony_ci unmapped <= khugepaged_max_ptes_swap) { 127662306a36Sopenharmony_ci /* 127762306a36Sopenharmony_ci * Always be strict with uffd-wp 127862306a36Sopenharmony_ci * enabled swap entries. Please see 127962306a36Sopenharmony_ci * comment below for pte_uffd_wp(). 128062306a36Sopenharmony_ci */ 128162306a36Sopenharmony_ci if (pte_swp_uffd_wp_any(pteval)) { 128262306a36Sopenharmony_ci result = SCAN_PTE_UFFD_WP; 128362306a36Sopenharmony_ci goto out_unmap; 128462306a36Sopenharmony_ci } 128562306a36Sopenharmony_ci continue; 128662306a36Sopenharmony_ci } else { 128762306a36Sopenharmony_ci result = SCAN_EXCEED_SWAP_PTE; 128862306a36Sopenharmony_ci count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); 128962306a36Sopenharmony_ci goto out_unmap; 129062306a36Sopenharmony_ci } 129162306a36Sopenharmony_ci } 129262306a36Sopenharmony_ci if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 129362306a36Sopenharmony_ci ++none_or_zero; 129462306a36Sopenharmony_ci if (!userfaultfd_armed(vma) && 129562306a36Sopenharmony_ci (!cc->is_khugepaged || 129662306a36Sopenharmony_ci none_or_zero <= khugepaged_max_ptes_none)) { 129762306a36Sopenharmony_ci continue; 129862306a36Sopenharmony_ci } else { 129962306a36Sopenharmony_ci result = SCAN_EXCEED_NONE_PTE; 130062306a36Sopenharmony_ci count_vm_event(THP_SCAN_EXCEED_NONE_PTE); 130162306a36Sopenharmony_ci goto out_unmap; 130262306a36Sopenharmony_ci } 130362306a36Sopenharmony_ci } 130462306a36Sopenharmony_ci if (pte_uffd_wp(pteval)) { 130562306a36Sopenharmony_ci /* 130662306a36Sopenharmony_ci * Don't collapse the page if any of the small 130762306a36Sopenharmony_ci * PTEs are armed with uffd write protection. 
130862306a36Sopenharmony_ci * Here we can also mark the new huge pmd as 130962306a36Sopenharmony_ci * write protected if any of the small ones is 131062306a36Sopenharmony_ci * marked but that could bring unknown 131162306a36Sopenharmony_ci * userfault messages that falls outside of 131262306a36Sopenharmony_ci * the registered range. So, just be simple. 131362306a36Sopenharmony_ci */ 131462306a36Sopenharmony_ci result = SCAN_PTE_UFFD_WP; 131562306a36Sopenharmony_ci goto out_unmap; 131662306a36Sopenharmony_ci } 131762306a36Sopenharmony_ci if (pte_write(pteval)) 131862306a36Sopenharmony_ci writable = true; 131962306a36Sopenharmony_ci 132062306a36Sopenharmony_ci page = vm_normal_page(vma, _address, pteval); 132162306a36Sopenharmony_ci if (unlikely(!page) || unlikely(is_zone_device_page(page))) { 132262306a36Sopenharmony_ci result = SCAN_PAGE_NULL; 132362306a36Sopenharmony_ci goto out_unmap; 132462306a36Sopenharmony_ci } 132562306a36Sopenharmony_ci 132662306a36Sopenharmony_ci if (page_mapcount(page) > 1) { 132762306a36Sopenharmony_ci ++shared; 132862306a36Sopenharmony_ci if (cc->is_khugepaged && 132962306a36Sopenharmony_ci shared > khugepaged_max_ptes_shared) { 133062306a36Sopenharmony_ci result = SCAN_EXCEED_SHARED_PTE; 133162306a36Sopenharmony_ci count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); 133262306a36Sopenharmony_ci goto out_unmap; 133362306a36Sopenharmony_ci } 133462306a36Sopenharmony_ci } 133562306a36Sopenharmony_ci 133662306a36Sopenharmony_ci page = compound_head(page); 133762306a36Sopenharmony_ci 133862306a36Sopenharmony_ci /* 133962306a36Sopenharmony_ci * Record which node the original page is from and save this 134062306a36Sopenharmony_ci * information to cc->node_load[]. 134162306a36Sopenharmony_ci * Khugepaged will allocate hugepage from the node has the max 134262306a36Sopenharmony_ci * hit record. 
134362306a36Sopenharmony_ci */ 134462306a36Sopenharmony_ci node = page_to_nid(page); 134562306a36Sopenharmony_ci if (hpage_collapse_scan_abort(node, cc)) { 134662306a36Sopenharmony_ci result = SCAN_SCAN_ABORT; 134762306a36Sopenharmony_ci goto out_unmap; 134862306a36Sopenharmony_ci } 134962306a36Sopenharmony_ci cc->node_load[node]++; 135062306a36Sopenharmony_ci if (!PageLRU(page)) { 135162306a36Sopenharmony_ci result = SCAN_PAGE_LRU; 135262306a36Sopenharmony_ci goto out_unmap; 135362306a36Sopenharmony_ci } 135462306a36Sopenharmony_ci if (PageLocked(page)) { 135562306a36Sopenharmony_ci result = SCAN_PAGE_LOCK; 135662306a36Sopenharmony_ci goto out_unmap; 135762306a36Sopenharmony_ci } 135862306a36Sopenharmony_ci if (!PageAnon(page)) { 135962306a36Sopenharmony_ci result = SCAN_PAGE_ANON; 136062306a36Sopenharmony_ci goto out_unmap; 136162306a36Sopenharmony_ci } 136262306a36Sopenharmony_ci 136362306a36Sopenharmony_ci /* 136462306a36Sopenharmony_ci * Check if the page has any GUP (or other external) pins. 136562306a36Sopenharmony_ci * 136662306a36Sopenharmony_ci * Here the check may be racy: 136762306a36Sopenharmony_ci * it may see total_mapcount > refcount in some cases? 136862306a36Sopenharmony_ci * But such case is ephemeral we could always retry collapse 136962306a36Sopenharmony_ci * later. However it may report false positive if the page 137062306a36Sopenharmony_ci * has excessive GUP pins (i.e. 512). Anyway the same check 137162306a36Sopenharmony_ci * will be done again later the risk seems low. 
137262306a36Sopenharmony_ci */ 137362306a36Sopenharmony_ci if (!is_refcount_suitable(page)) { 137462306a36Sopenharmony_ci result = SCAN_PAGE_COUNT; 137562306a36Sopenharmony_ci goto out_unmap; 137662306a36Sopenharmony_ci } 137762306a36Sopenharmony_ci 137862306a36Sopenharmony_ci /* 137962306a36Sopenharmony_ci * If collapse was initiated by khugepaged, check that there is 138062306a36Sopenharmony_ci * enough young pte to justify collapsing the page 138162306a36Sopenharmony_ci */ 138262306a36Sopenharmony_ci if (cc->is_khugepaged && 138362306a36Sopenharmony_ci (pte_young(pteval) || page_is_young(page) || 138462306a36Sopenharmony_ci PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, 138562306a36Sopenharmony_ci address))) 138662306a36Sopenharmony_ci referenced++; 138762306a36Sopenharmony_ci } 138862306a36Sopenharmony_ci if (!writable) { 138962306a36Sopenharmony_ci result = SCAN_PAGE_RO; 139062306a36Sopenharmony_ci } else if (cc->is_khugepaged && 139162306a36Sopenharmony_ci (!referenced || 139262306a36Sopenharmony_ci (unmapped && referenced < HPAGE_PMD_NR / 2))) { 139362306a36Sopenharmony_ci result = SCAN_LACK_REFERENCED_PAGE; 139462306a36Sopenharmony_ci } else { 139562306a36Sopenharmony_ci result = SCAN_SUCCEED; 139662306a36Sopenharmony_ci } 139762306a36Sopenharmony_ciout_unmap: 139862306a36Sopenharmony_ci pte_unmap_unlock(pte, ptl); 139962306a36Sopenharmony_ci if (result == SCAN_SUCCEED) { 140062306a36Sopenharmony_ci result = collapse_huge_page(mm, address, referenced, 140162306a36Sopenharmony_ci unmapped, cc); 140262306a36Sopenharmony_ci /* collapse_huge_page will return with the mmap_lock released */ 140362306a36Sopenharmony_ci *mmap_locked = false; 140462306a36Sopenharmony_ci } 140562306a36Sopenharmony_ciout: 140662306a36Sopenharmony_ci trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, 140762306a36Sopenharmony_ci none_or_zero, result, unmapped); 140862306a36Sopenharmony_ci return result; 140962306a36Sopenharmony_ci} 141062306a36Sopenharmony_ci 
141162306a36Sopenharmony_cistatic void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) 141262306a36Sopenharmony_ci{ 141362306a36Sopenharmony_ci struct mm_slot *slot = &mm_slot->slot; 141462306a36Sopenharmony_ci struct mm_struct *mm = slot->mm; 141562306a36Sopenharmony_ci 141662306a36Sopenharmony_ci lockdep_assert_held(&khugepaged_mm_lock); 141762306a36Sopenharmony_ci 141862306a36Sopenharmony_ci if (hpage_collapse_test_exit(mm)) { 141962306a36Sopenharmony_ci /* free mm_slot */ 142062306a36Sopenharmony_ci hash_del(&slot->hash); 142162306a36Sopenharmony_ci list_del(&slot->mm_node); 142262306a36Sopenharmony_ci 142362306a36Sopenharmony_ci /* 142462306a36Sopenharmony_ci * Not strictly needed because the mm exited already. 142562306a36Sopenharmony_ci * 142662306a36Sopenharmony_ci * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 142762306a36Sopenharmony_ci */ 142862306a36Sopenharmony_ci 142962306a36Sopenharmony_ci /* khugepaged_mm_lock actually not necessary for the below */ 143062306a36Sopenharmony_ci mm_slot_free(mm_slot_cache, mm_slot); 143162306a36Sopenharmony_ci mmdrop(mm); 143262306a36Sopenharmony_ci } 143362306a36Sopenharmony_ci} 143462306a36Sopenharmony_ci 143562306a36Sopenharmony_ci#ifdef CONFIG_SHMEM 143662306a36Sopenharmony_ci/* hpage must be locked, and mmap_lock must be held */ 143762306a36Sopenharmony_cistatic int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, 143862306a36Sopenharmony_ci pmd_t *pmdp, struct page *hpage) 143962306a36Sopenharmony_ci{ 144062306a36Sopenharmony_ci struct vm_fault vmf = { 144162306a36Sopenharmony_ci .vma = vma, 144262306a36Sopenharmony_ci .address = addr, 144362306a36Sopenharmony_ci .flags = 0, 144462306a36Sopenharmony_ci .pmd = pmdp, 144562306a36Sopenharmony_ci }; 144662306a36Sopenharmony_ci 144762306a36Sopenharmony_ci VM_BUG_ON(!PageTransHuge(hpage)); 144862306a36Sopenharmony_ci mmap_assert_locked(vma->vm_mm); 144962306a36Sopenharmony_ci 145062306a36Sopenharmony_ci if (do_set_pmd(&vmf, hpage)) 
145162306a36Sopenharmony_ci return SCAN_FAIL; 145262306a36Sopenharmony_ci 145362306a36Sopenharmony_ci get_page(hpage); 145462306a36Sopenharmony_ci return SCAN_SUCCEED; 145562306a36Sopenharmony_ci} 145662306a36Sopenharmony_ci 145762306a36Sopenharmony_ci/** 145862306a36Sopenharmony_ci * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at 145962306a36Sopenharmony_ci * address haddr. 146062306a36Sopenharmony_ci * 146162306a36Sopenharmony_ci * @mm: process address space where collapse happens 146262306a36Sopenharmony_ci * @addr: THP collapse address 146362306a36Sopenharmony_ci * @install_pmd: If a huge PMD should be installed 146462306a36Sopenharmony_ci * 146562306a36Sopenharmony_ci * This function checks whether all the PTEs in the PMD are pointing to the 146662306a36Sopenharmony_ci * right THP. If so, retract the page table so the THP can refault in with 146762306a36Sopenharmony_ci * as pmd-mapped. Possibly install a huge PMD mapping the THP. 146862306a36Sopenharmony_ci */ 146962306a36Sopenharmony_ciint collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, 147062306a36Sopenharmony_ci bool install_pmd) 147162306a36Sopenharmony_ci{ 147262306a36Sopenharmony_ci struct mmu_notifier_range range; 147362306a36Sopenharmony_ci bool notified = false; 147462306a36Sopenharmony_ci unsigned long haddr = addr & HPAGE_PMD_MASK; 147562306a36Sopenharmony_ci struct vm_area_struct *vma = vma_lookup(mm, haddr); 147662306a36Sopenharmony_ci struct page *hpage; 147762306a36Sopenharmony_ci pte_t *start_pte, *pte; 147862306a36Sopenharmony_ci pmd_t *pmd, pgt_pmd; 147962306a36Sopenharmony_ci spinlock_t *pml = NULL, *ptl; 148062306a36Sopenharmony_ci int nr_ptes = 0, result = SCAN_FAIL; 148162306a36Sopenharmony_ci int i; 148262306a36Sopenharmony_ci 148362306a36Sopenharmony_ci mmap_assert_locked(mm); 148462306a36Sopenharmony_ci 148562306a36Sopenharmony_ci /* First check VMA found, in case page tables are being torn down */ 148662306a36Sopenharmony_ci if (!vma || 
!vma->vm_file || 148762306a36Sopenharmony_ci !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) 148862306a36Sopenharmony_ci return SCAN_VMA_CHECK; 148962306a36Sopenharmony_ci 149062306a36Sopenharmony_ci /* Fast check before locking page if already PMD-mapped */ 149162306a36Sopenharmony_ci result = find_pmd_or_thp_or_none(mm, haddr, &pmd); 149262306a36Sopenharmony_ci if (result == SCAN_PMD_MAPPED) 149362306a36Sopenharmony_ci return result; 149462306a36Sopenharmony_ci 149562306a36Sopenharmony_ci /* 149662306a36Sopenharmony_ci * If we are here, we've succeeded in replacing all the native pages 149762306a36Sopenharmony_ci * in the page cache with a single hugepage. If a mm were to fault-in 149862306a36Sopenharmony_ci * this memory (mapped by a suitably aligned VMA), we'd get the hugepage 149962306a36Sopenharmony_ci * and map it by a PMD, regardless of sysfs THP settings. As such, let's 150062306a36Sopenharmony_ci * analogously elide sysfs THP settings here. 150162306a36Sopenharmony_ci */ 150262306a36Sopenharmony_ci if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) 150362306a36Sopenharmony_ci return SCAN_VMA_CHECK; 150462306a36Sopenharmony_ci 150562306a36Sopenharmony_ci /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ 150662306a36Sopenharmony_ci if (userfaultfd_wp(vma)) 150762306a36Sopenharmony_ci return SCAN_PTE_UFFD_WP; 150862306a36Sopenharmony_ci 150962306a36Sopenharmony_ci hpage = find_lock_page(vma->vm_file->f_mapping, 151062306a36Sopenharmony_ci linear_page_index(vma, haddr)); 151162306a36Sopenharmony_ci if (!hpage) 151262306a36Sopenharmony_ci return SCAN_PAGE_NULL; 151362306a36Sopenharmony_ci 151462306a36Sopenharmony_ci if (!PageHead(hpage)) { 151562306a36Sopenharmony_ci result = SCAN_FAIL; 151662306a36Sopenharmony_ci goto drop_hpage; 151762306a36Sopenharmony_ci } 151862306a36Sopenharmony_ci 151962306a36Sopenharmony_ci if (compound_order(hpage) != HPAGE_PMD_ORDER) { 152062306a36Sopenharmony_ci result = SCAN_PAGE_COMPOUND; 
152162306a36Sopenharmony_ci goto drop_hpage; 152262306a36Sopenharmony_ci } 152362306a36Sopenharmony_ci 152462306a36Sopenharmony_ci result = find_pmd_or_thp_or_none(mm, haddr, &pmd); 152562306a36Sopenharmony_ci switch (result) { 152662306a36Sopenharmony_ci case SCAN_SUCCEED: 152762306a36Sopenharmony_ci break; 152862306a36Sopenharmony_ci case SCAN_PMD_NONE: 152962306a36Sopenharmony_ci /* 153062306a36Sopenharmony_ci * All pte entries have been removed and pmd cleared. 153162306a36Sopenharmony_ci * Skip all the pte checks and just update the pmd mapping. 153262306a36Sopenharmony_ci */ 153362306a36Sopenharmony_ci goto maybe_install_pmd; 153462306a36Sopenharmony_ci default: 153562306a36Sopenharmony_ci goto drop_hpage; 153662306a36Sopenharmony_ci } 153762306a36Sopenharmony_ci 153862306a36Sopenharmony_ci result = SCAN_FAIL; 153962306a36Sopenharmony_ci start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); 154062306a36Sopenharmony_ci if (!start_pte) /* mmap_lock + page lock should prevent this */ 154162306a36Sopenharmony_ci goto drop_hpage; 154262306a36Sopenharmony_ci 154362306a36Sopenharmony_ci /* step 1: check all mapped PTEs are to the right huge page */ 154462306a36Sopenharmony_ci for (i = 0, addr = haddr, pte = start_pte; 154562306a36Sopenharmony_ci i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { 154662306a36Sopenharmony_ci struct page *page; 154762306a36Sopenharmony_ci pte_t ptent = ptep_get(pte); 154862306a36Sopenharmony_ci 154962306a36Sopenharmony_ci /* empty pte, skip */ 155062306a36Sopenharmony_ci if (pte_none(ptent)) 155162306a36Sopenharmony_ci continue; 155262306a36Sopenharmony_ci 155362306a36Sopenharmony_ci /* page swapped out, abort */ 155462306a36Sopenharmony_ci if (!pte_present(ptent)) { 155562306a36Sopenharmony_ci result = SCAN_PTE_NON_PRESENT; 155662306a36Sopenharmony_ci goto abort; 155762306a36Sopenharmony_ci } 155862306a36Sopenharmony_ci 155962306a36Sopenharmony_ci page = vm_normal_page(vma, addr, ptent); 156062306a36Sopenharmony_ci if 
(WARN_ON_ONCE(page && is_zone_device_page(page))) 156162306a36Sopenharmony_ci page = NULL; 156262306a36Sopenharmony_ci /* 156362306a36Sopenharmony_ci * Note that uprobe, debugger, or MAP_PRIVATE may change the 156462306a36Sopenharmony_ci * page table, but the new page will not be a subpage of hpage. 156562306a36Sopenharmony_ci */ 156662306a36Sopenharmony_ci if (hpage + i != page) 156762306a36Sopenharmony_ci goto abort; 156862306a36Sopenharmony_ci } 156962306a36Sopenharmony_ci 157062306a36Sopenharmony_ci pte_unmap_unlock(start_pte, ptl); 157162306a36Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 157262306a36Sopenharmony_ci haddr, haddr + HPAGE_PMD_SIZE); 157362306a36Sopenharmony_ci mmu_notifier_invalidate_range_start(&range); 157462306a36Sopenharmony_ci notified = true; 157562306a36Sopenharmony_ci 157662306a36Sopenharmony_ci /* 157762306a36Sopenharmony_ci * pmd_lock covers a wider range than ptl, and (if split from mm's 157862306a36Sopenharmony_ci * page_table_lock) ptl nests inside pml. The less time we hold pml, 157962306a36Sopenharmony_ci * the better; but userfaultfd's mfill_atomic_pte() on a private VMA 158062306a36Sopenharmony_ci * inserts a valid as-if-COWed PTE without even looking up page cache. 158162306a36Sopenharmony_ci * So page lock of hpage does not protect from it, so we must not drop 158262306a36Sopenharmony_ci * ptl before pgt_pmd is removed, so uffd private needs pml taken now. 
158362306a36Sopenharmony_ci */ 158462306a36Sopenharmony_ci if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) 158562306a36Sopenharmony_ci pml = pmd_lock(mm, pmd); 158662306a36Sopenharmony_ci 158762306a36Sopenharmony_ci start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl); 158862306a36Sopenharmony_ci if (!start_pte) /* mmap_lock + page lock should prevent this */ 158962306a36Sopenharmony_ci goto abort; 159062306a36Sopenharmony_ci if (!pml) 159162306a36Sopenharmony_ci spin_lock(ptl); 159262306a36Sopenharmony_ci else if (ptl != pml) 159362306a36Sopenharmony_ci spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); 159462306a36Sopenharmony_ci 159562306a36Sopenharmony_ci /* step 2: clear page table and adjust rmap */ 159662306a36Sopenharmony_ci for (i = 0, addr = haddr, pte = start_pte; 159762306a36Sopenharmony_ci i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { 159862306a36Sopenharmony_ci struct page *page; 159962306a36Sopenharmony_ci pte_t ptent = ptep_get(pte); 160062306a36Sopenharmony_ci 160162306a36Sopenharmony_ci if (pte_none(ptent)) 160262306a36Sopenharmony_ci continue; 160362306a36Sopenharmony_ci /* 160462306a36Sopenharmony_ci * We dropped ptl after the first scan, to do the mmu_notifier: 160562306a36Sopenharmony_ci * page lock stops more PTEs of the hpage being faulted in, but 160662306a36Sopenharmony_ci * does not stop write faults COWing anon copies from existing 160762306a36Sopenharmony_ci * PTEs; and does not stop those being swapped out or migrated. 
160862306a36Sopenharmony_ci */ 160962306a36Sopenharmony_ci if (!pte_present(ptent)) { 161062306a36Sopenharmony_ci result = SCAN_PTE_NON_PRESENT; 161162306a36Sopenharmony_ci goto abort; 161262306a36Sopenharmony_ci } 161362306a36Sopenharmony_ci page = vm_normal_page(vma, addr, ptent); 161462306a36Sopenharmony_ci if (hpage + i != page) 161562306a36Sopenharmony_ci goto abort; 161662306a36Sopenharmony_ci 161762306a36Sopenharmony_ci /* 161862306a36Sopenharmony_ci * Must clear entry, or a racing truncate may re-remove it. 161962306a36Sopenharmony_ci * TLB flush can be left until pmdp_collapse_flush() does it. 162062306a36Sopenharmony_ci * PTE dirty? Shmem page is already dirty; file is read-only. 162162306a36Sopenharmony_ci */ 162262306a36Sopenharmony_ci ptep_clear(mm, addr, pte); 162362306a36Sopenharmony_ci page_remove_rmap(page, vma, false); 162462306a36Sopenharmony_ci nr_ptes++; 162562306a36Sopenharmony_ci } 162662306a36Sopenharmony_ci 162762306a36Sopenharmony_ci pte_unmap(start_pte); 162862306a36Sopenharmony_ci if (!pml) 162962306a36Sopenharmony_ci spin_unlock(ptl); 163062306a36Sopenharmony_ci 163162306a36Sopenharmony_ci /* step 3: set proper refcount and mm_counters. 
*/ 163262306a36Sopenharmony_ci if (nr_ptes) { 163362306a36Sopenharmony_ci page_ref_sub(hpage, nr_ptes); 163462306a36Sopenharmony_ci add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes); 163562306a36Sopenharmony_ci } 163662306a36Sopenharmony_ci 163762306a36Sopenharmony_ci /* step 4: remove empty page table */ 163862306a36Sopenharmony_ci if (!pml) { 163962306a36Sopenharmony_ci pml = pmd_lock(mm, pmd); 164062306a36Sopenharmony_ci if (ptl != pml) 164162306a36Sopenharmony_ci spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); 164262306a36Sopenharmony_ci } 164362306a36Sopenharmony_ci pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); 164462306a36Sopenharmony_ci pmdp_get_lockless_sync(); 164562306a36Sopenharmony_ci if (ptl != pml) 164662306a36Sopenharmony_ci spin_unlock(ptl); 164762306a36Sopenharmony_ci spin_unlock(pml); 164862306a36Sopenharmony_ci 164962306a36Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 165062306a36Sopenharmony_ci 165162306a36Sopenharmony_ci mm_dec_nr_ptes(mm); 165262306a36Sopenharmony_ci page_table_check_pte_clear_range(mm, haddr, pgt_pmd); 165362306a36Sopenharmony_ci pte_free_defer(mm, pmd_pgtable(pgt_pmd)); 165462306a36Sopenharmony_ci 165562306a36Sopenharmony_cimaybe_install_pmd: 165662306a36Sopenharmony_ci /* step 5: install pmd entry */ 165762306a36Sopenharmony_ci result = install_pmd 165862306a36Sopenharmony_ci ? 
set_huge_pmd(vma, haddr, pmd, hpage) 165962306a36Sopenharmony_ci : SCAN_SUCCEED; 166062306a36Sopenharmony_ci goto drop_hpage; 166162306a36Sopenharmony_ciabort: 166262306a36Sopenharmony_ci if (nr_ptes) { 166362306a36Sopenharmony_ci flush_tlb_mm(mm); 166462306a36Sopenharmony_ci page_ref_sub(hpage, nr_ptes); 166562306a36Sopenharmony_ci add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes); 166662306a36Sopenharmony_ci } 166762306a36Sopenharmony_ci if (start_pte) 166862306a36Sopenharmony_ci pte_unmap_unlock(start_pte, ptl); 166962306a36Sopenharmony_ci if (pml && pml != ptl) 167062306a36Sopenharmony_ci spin_unlock(pml); 167162306a36Sopenharmony_ci if (notified) 167262306a36Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 167362306a36Sopenharmony_cidrop_hpage: 167462306a36Sopenharmony_ci unlock_page(hpage); 167562306a36Sopenharmony_ci put_page(hpage); 167662306a36Sopenharmony_ci return result; 167762306a36Sopenharmony_ci} 167862306a36Sopenharmony_ci 167962306a36Sopenharmony_cistatic void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) 168062306a36Sopenharmony_ci{ 168162306a36Sopenharmony_ci struct vm_area_struct *vma; 168262306a36Sopenharmony_ci 168362306a36Sopenharmony_ci i_mmap_lock_read(mapping); 168462306a36Sopenharmony_ci vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 168562306a36Sopenharmony_ci struct mmu_notifier_range range; 168662306a36Sopenharmony_ci struct mm_struct *mm; 168762306a36Sopenharmony_ci unsigned long addr; 168862306a36Sopenharmony_ci pmd_t *pmd, pgt_pmd; 168962306a36Sopenharmony_ci spinlock_t *pml; 169062306a36Sopenharmony_ci spinlock_t *ptl; 169162306a36Sopenharmony_ci bool skipped_uffd = false; 169262306a36Sopenharmony_ci 169362306a36Sopenharmony_ci /* 169462306a36Sopenharmony_ci * Check vma->anon_vma to exclude MAP_PRIVATE mappings that 169562306a36Sopenharmony_ci * got written to. 
These VMAs are likely not worth removing 169662306a36Sopenharmony_ci * page tables from, as PMD-mapping is likely to be split later. 169762306a36Sopenharmony_ci */ 169862306a36Sopenharmony_ci if (READ_ONCE(vma->anon_vma)) 169962306a36Sopenharmony_ci continue; 170062306a36Sopenharmony_ci 170162306a36Sopenharmony_ci addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 170262306a36Sopenharmony_ci if (addr & ~HPAGE_PMD_MASK || 170362306a36Sopenharmony_ci vma->vm_end < addr + HPAGE_PMD_SIZE) 170462306a36Sopenharmony_ci continue; 170562306a36Sopenharmony_ci 170662306a36Sopenharmony_ci mm = vma->vm_mm; 170762306a36Sopenharmony_ci if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) 170862306a36Sopenharmony_ci continue; 170962306a36Sopenharmony_ci 171062306a36Sopenharmony_ci if (hpage_collapse_test_exit(mm)) 171162306a36Sopenharmony_ci continue; 171262306a36Sopenharmony_ci /* 171362306a36Sopenharmony_ci * When a vma is registered with uffd-wp, we cannot recycle 171462306a36Sopenharmony_ci * the page table because there may be pte markers installed. 171562306a36Sopenharmony_ci * Other vmas can still have the same file mapped hugely, but 171662306a36Sopenharmony_ci * skip this one: it will always be mapped in small page size 171762306a36Sopenharmony_ci * for uffd-wp registered ranges. 171862306a36Sopenharmony_ci */ 171962306a36Sopenharmony_ci if (userfaultfd_wp(vma)) 172062306a36Sopenharmony_ci continue; 172162306a36Sopenharmony_ci 172262306a36Sopenharmony_ci /* PTEs were notified when unmapped; but now for the PMD? 
*/ 172362306a36Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 172462306a36Sopenharmony_ci addr, addr + HPAGE_PMD_SIZE); 172562306a36Sopenharmony_ci mmu_notifier_invalidate_range_start(&range); 172662306a36Sopenharmony_ci 172762306a36Sopenharmony_ci pml = pmd_lock(mm, pmd); 172862306a36Sopenharmony_ci ptl = pte_lockptr(mm, pmd); 172962306a36Sopenharmony_ci if (ptl != pml) 173062306a36Sopenharmony_ci spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); 173162306a36Sopenharmony_ci 173262306a36Sopenharmony_ci /* 173362306a36Sopenharmony_ci * Huge page lock is still held, so normally the page table 173462306a36Sopenharmony_ci * must remain empty; and we have already skipped anon_vma 173562306a36Sopenharmony_ci * and userfaultfd_wp() vmas. But since the mmap_lock is not 173662306a36Sopenharmony_ci * held, it is still possible for a racing userfaultfd_ioctl() 173762306a36Sopenharmony_ci * to have inserted ptes or markers. Now that we hold ptlock, 173862306a36Sopenharmony_ci * repeating the anon_vma check protects from one category, 173962306a36Sopenharmony_ci * and repeating the userfaultfd_wp() check from another. 
174062306a36Sopenharmony_ci */ 174162306a36Sopenharmony_ci if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) { 174262306a36Sopenharmony_ci skipped_uffd = true; 174362306a36Sopenharmony_ci } else { 174462306a36Sopenharmony_ci pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); 174562306a36Sopenharmony_ci pmdp_get_lockless_sync(); 174662306a36Sopenharmony_ci } 174762306a36Sopenharmony_ci 174862306a36Sopenharmony_ci if (ptl != pml) 174962306a36Sopenharmony_ci spin_unlock(ptl); 175062306a36Sopenharmony_ci spin_unlock(pml); 175162306a36Sopenharmony_ci 175262306a36Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 175362306a36Sopenharmony_ci 175462306a36Sopenharmony_ci if (!skipped_uffd) { 175562306a36Sopenharmony_ci mm_dec_nr_ptes(mm); 175662306a36Sopenharmony_ci page_table_check_pte_clear_range(mm, addr, pgt_pmd); 175762306a36Sopenharmony_ci pte_free_defer(mm, pmd_pgtable(pgt_pmd)); 175862306a36Sopenharmony_ci } 175962306a36Sopenharmony_ci } 176062306a36Sopenharmony_ci i_mmap_unlock_read(mapping); 176162306a36Sopenharmony_ci} 176262306a36Sopenharmony_ci 176362306a36Sopenharmony_ci/** 176462306a36Sopenharmony_ci * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. 
176562306a36Sopenharmony_ci * 176662306a36Sopenharmony_ci * @mm: process address space where collapse happens 176762306a36Sopenharmony_ci * @addr: virtual collapse start address 176862306a36Sopenharmony_ci * @file: file that collapse on 176962306a36Sopenharmony_ci * @start: collapse start address 177062306a36Sopenharmony_ci * @cc: collapse context and scratchpad 177162306a36Sopenharmony_ci * 177262306a36Sopenharmony_ci * Basic scheme is simple, details are more complex: 177362306a36Sopenharmony_ci * - allocate and lock a new huge page; 177462306a36Sopenharmony_ci * - scan page cache, locking old pages 177562306a36Sopenharmony_ci * + swap/gup in pages if necessary; 177662306a36Sopenharmony_ci * - copy data to new page 177762306a36Sopenharmony_ci * - handle shmem holes 177862306a36Sopenharmony_ci * + re-validate that holes weren't filled by someone else 177962306a36Sopenharmony_ci * + check for userfaultfd 178062306a36Sopenharmony_ci * - finalize updates to the page cache; 178162306a36Sopenharmony_ci * - if replacing succeeds: 178262306a36Sopenharmony_ci * + unlock huge page; 178362306a36Sopenharmony_ci * + free old pages; 178462306a36Sopenharmony_ci * - if replacing failed; 178562306a36Sopenharmony_ci * + unlock old pages 178662306a36Sopenharmony_ci * + unlock and free huge page; 178762306a36Sopenharmony_ci */ 178862306a36Sopenharmony_cistatic int collapse_file(struct mm_struct *mm, unsigned long addr, 178962306a36Sopenharmony_ci struct file *file, pgoff_t start, 179062306a36Sopenharmony_ci struct collapse_control *cc) 179162306a36Sopenharmony_ci{ 179262306a36Sopenharmony_ci struct address_space *mapping = file->f_mapping; 179362306a36Sopenharmony_ci struct page *hpage; 179462306a36Sopenharmony_ci struct page *page; 179562306a36Sopenharmony_ci struct page *tmp; 179662306a36Sopenharmony_ci struct folio *folio; 179762306a36Sopenharmony_ci pgoff_t index = 0, end = start + HPAGE_PMD_NR; 179862306a36Sopenharmony_ci LIST_HEAD(pagelist); 179962306a36Sopenharmony_ci 
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); 180062306a36Sopenharmony_ci int nr_none = 0, result = SCAN_SUCCEED; 180162306a36Sopenharmony_ci bool is_shmem = shmem_file(file); 180262306a36Sopenharmony_ci int nr = 0; 180362306a36Sopenharmony_ci 180462306a36Sopenharmony_ci VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); 180562306a36Sopenharmony_ci VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); 180662306a36Sopenharmony_ci 180762306a36Sopenharmony_ci result = alloc_charge_hpage(&hpage, mm, cc); 180862306a36Sopenharmony_ci if (result != SCAN_SUCCEED) 180962306a36Sopenharmony_ci goto out; 181062306a36Sopenharmony_ci 181162306a36Sopenharmony_ci __SetPageLocked(hpage); 181262306a36Sopenharmony_ci if (is_shmem) 181362306a36Sopenharmony_ci __SetPageSwapBacked(hpage); 181462306a36Sopenharmony_ci hpage->index = start; 181562306a36Sopenharmony_ci hpage->mapping = mapping; 181662306a36Sopenharmony_ci 181762306a36Sopenharmony_ci /* 181862306a36Sopenharmony_ci * Ensure we have slots for all the pages in the range. 
This is 181962306a36Sopenharmony_ci * almost certainly a no-op because most of the pages must be present 182062306a36Sopenharmony_ci */ 182162306a36Sopenharmony_ci do { 182262306a36Sopenharmony_ci xas_lock_irq(&xas); 182362306a36Sopenharmony_ci xas_create_range(&xas); 182462306a36Sopenharmony_ci if (!xas_error(&xas)) 182562306a36Sopenharmony_ci break; 182662306a36Sopenharmony_ci xas_unlock_irq(&xas); 182762306a36Sopenharmony_ci if (!xas_nomem(&xas, GFP_KERNEL)) { 182862306a36Sopenharmony_ci result = SCAN_FAIL; 182962306a36Sopenharmony_ci goto rollback; 183062306a36Sopenharmony_ci } 183162306a36Sopenharmony_ci } while (1); 183262306a36Sopenharmony_ci 183362306a36Sopenharmony_ci for (index = start; index < end; index++) { 183462306a36Sopenharmony_ci xas_set(&xas, index); 183562306a36Sopenharmony_ci page = xas_load(&xas); 183662306a36Sopenharmony_ci 183762306a36Sopenharmony_ci VM_BUG_ON(index != xas.xa_index); 183862306a36Sopenharmony_ci if (is_shmem) { 183962306a36Sopenharmony_ci if (!page) { 184062306a36Sopenharmony_ci /* 184162306a36Sopenharmony_ci * Stop if extent has been truncated or 184262306a36Sopenharmony_ci * hole-punched, and is now completely 184362306a36Sopenharmony_ci * empty. 
184462306a36Sopenharmony_ci */ 184562306a36Sopenharmony_ci if (index == start) { 184662306a36Sopenharmony_ci if (!xas_next_entry(&xas, end - 1)) { 184762306a36Sopenharmony_ci result = SCAN_TRUNCATED; 184862306a36Sopenharmony_ci goto xa_locked; 184962306a36Sopenharmony_ci } 185062306a36Sopenharmony_ci } 185162306a36Sopenharmony_ci nr_none++; 185262306a36Sopenharmony_ci continue; 185362306a36Sopenharmony_ci } 185462306a36Sopenharmony_ci 185562306a36Sopenharmony_ci if (xa_is_value(page) || !PageUptodate(page)) { 185662306a36Sopenharmony_ci xas_unlock_irq(&xas); 185762306a36Sopenharmony_ci /* swap in or instantiate fallocated page */ 185862306a36Sopenharmony_ci if (shmem_get_folio(mapping->host, index, 185962306a36Sopenharmony_ci &folio, SGP_NOALLOC)) { 186062306a36Sopenharmony_ci result = SCAN_FAIL; 186162306a36Sopenharmony_ci goto xa_unlocked; 186262306a36Sopenharmony_ci } 186362306a36Sopenharmony_ci /* drain lru cache to help isolate_lru_page() */ 186462306a36Sopenharmony_ci lru_add_drain(); 186562306a36Sopenharmony_ci page = folio_file_page(folio, index); 186662306a36Sopenharmony_ci } else if (trylock_page(page)) { 186762306a36Sopenharmony_ci get_page(page); 186862306a36Sopenharmony_ci xas_unlock_irq(&xas); 186962306a36Sopenharmony_ci } else { 187062306a36Sopenharmony_ci result = SCAN_PAGE_LOCK; 187162306a36Sopenharmony_ci goto xa_locked; 187262306a36Sopenharmony_ci } 187362306a36Sopenharmony_ci } else { /* !is_shmem */ 187462306a36Sopenharmony_ci if (!page || xa_is_value(page)) { 187562306a36Sopenharmony_ci xas_unlock_irq(&xas); 187662306a36Sopenharmony_ci page_cache_sync_readahead(mapping, &file->f_ra, 187762306a36Sopenharmony_ci file, index, 187862306a36Sopenharmony_ci end - index); 187962306a36Sopenharmony_ci /* drain lru cache to help isolate_lru_page() */ 188062306a36Sopenharmony_ci lru_add_drain(); 188162306a36Sopenharmony_ci page = find_lock_page(mapping, index); 188262306a36Sopenharmony_ci if (unlikely(page == NULL)) { 188362306a36Sopenharmony_ci result = 
SCAN_FAIL; 188462306a36Sopenharmony_ci goto xa_unlocked; 188562306a36Sopenharmony_ci } 188662306a36Sopenharmony_ci } else if (PageDirty(page)) { 188762306a36Sopenharmony_ci /* 188862306a36Sopenharmony_ci * khugepaged only works on read-only fd, 188962306a36Sopenharmony_ci * so this page is dirty because it hasn't 189062306a36Sopenharmony_ci * been flushed since first write. There 189162306a36Sopenharmony_ci * won't be new dirty pages. 189262306a36Sopenharmony_ci * 189362306a36Sopenharmony_ci * Trigger async flush here and hope the 189462306a36Sopenharmony_ci * writeback is done when khugepaged 189562306a36Sopenharmony_ci * revisits this page. 189662306a36Sopenharmony_ci * 189762306a36Sopenharmony_ci * This is a one-off situation. We are not 189862306a36Sopenharmony_ci * forcing writeback in loop. 189962306a36Sopenharmony_ci */ 190062306a36Sopenharmony_ci xas_unlock_irq(&xas); 190162306a36Sopenharmony_ci filemap_flush(mapping); 190262306a36Sopenharmony_ci result = SCAN_FAIL; 190362306a36Sopenharmony_ci goto xa_unlocked; 190462306a36Sopenharmony_ci } else if (PageWriteback(page)) { 190562306a36Sopenharmony_ci xas_unlock_irq(&xas); 190662306a36Sopenharmony_ci result = SCAN_FAIL; 190762306a36Sopenharmony_ci goto xa_unlocked; 190862306a36Sopenharmony_ci } else if (trylock_page(page)) { 190962306a36Sopenharmony_ci get_page(page); 191062306a36Sopenharmony_ci xas_unlock_irq(&xas); 191162306a36Sopenharmony_ci } else { 191262306a36Sopenharmony_ci result = SCAN_PAGE_LOCK; 191362306a36Sopenharmony_ci goto xa_locked; 191462306a36Sopenharmony_ci } 191562306a36Sopenharmony_ci } 191662306a36Sopenharmony_ci 191762306a36Sopenharmony_ci /* 191862306a36Sopenharmony_ci * The page must be locked, so we can drop the i_pages lock 191962306a36Sopenharmony_ci * without racing with truncate. 
192062306a36Sopenharmony_ci */ 192162306a36Sopenharmony_ci VM_BUG_ON_PAGE(!PageLocked(page), page); 192262306a36Sopenharmony_ci 192362306a36Sopenharmony_ci /* make sure the page is up to date */ 192462306a36Sopenharmony_ci if (unlikely(!PageUptodate(page))) { 192562306a36Sopenharmony_ci result = SCAN_FAIL; 192662306a36Sopenharmony_ci goto out_unlock; 192762306a36Sopenharmony_ci } 192862306a36Sopenharmony_ci 192962306a36Sopenharmony_ci /* 193062306a36Sopenharmony_ci * If file was truncated then extended, or hole-punched, before 193162306a36Sopenharmony_ci * we locked the first page, then a THP might be there already. 193262306a36Sopenharmony_ci * This will be discovered on the first iteration. 193362306a36Sopenharmony_ci */ 193462306a36Sopenharmony_ci if (PageTransCompound(page)) { 193562306a36Sopenharmony_ci struct page *head = compound_head(page); 193662306a36Sopenharmony_ci 193762306a36Sopenharmony_ci result = compound_order(head) == HPAGE_PMD_ORDER && 193862306a36Sopenharmony_ci head->index == start 193962306a36Sopenharmony_ci /* Maybe PMD-mapped */ 194062306a36Sopenharmony_ci ? SCAN_PTE_MAPPED_HUGEPAGE 194162306a36Sopenharmony_ci : SCAN_PAGE_COMPOUND; 194262306a36Sopenharmony_ci goto out_unlock; 194362306a36Sopenharmony_ci } 194462306a36Sopenharmony_ci 194562306a36Sopenharmony_ci folio = page_folio(page); 194662306a36Sopenharmony_ci 194762306a36Sopenharmony_ci if (folio_mapping(folio) != mapping) { 194862306a36Sopenharmony_ci result = SCAN_TRUNCATED; 194962306a36Sopenharmony_ci goto out_unlock; 195062306a36Sopenharmony_ci } 195162306a36Sopenharmony_ci 195262306a36Sopenharmony_ci if (!is_shmem && (folio_test_dirty(folio) || 195362306a36Sopenharmony_ci folio_test_writeback(folio))) { 195462306a36Sopenharmony_ci /* 195562306a36Sopenharmony_ci * khugepaged only works on read-only fd, so this 195662306a36Sopenharmony_ci * page is dirty because it hasn't been flushed 195762306a36Sopenharmony_ci * since first write. 
195862306a36Sopenharmony_ci */ 195962306a36Sopenharmony_ci result = SCAN_FAIL; 196062306a36Sopenharmony_ci goto out_unlock; 196162306a36Sopenharmony_ci } 196262306a36Sopenharmony_ci 196362306a36Sopenharmony_ci if (!folio_isolate_lru(folio)) { 196462306a36Sopenharmony_ci result = SCAN_DEL_PAGE_LRU; 196562306a36Sopenharmony_ci goto out_unlock; 196662306a36Sopenharmony_ci } 196762306a36Sopenharmony_ci 196862306a36Sopenharmony_ci if (!filemap_release_folio(folio, GFP_KERNEL)) { 196962306a36Sopenharmony_ci result = SCAN_PAGE_HAS_PRIVATE; 197062306a36Sopenharmony_ci folio_putback_lru(folio); 197162306a36Sopenharmony_ci goto out_unlock; 197262306a36Sopenharmony_ci } 197362306a36Sopenharmony_ci 197462306a36Sopenharmony_ci if (folio_mapped(folio)) 197562306a36Sopenharmony_ci try_to_unmap(folio, 197662306a36Sopenharmony_ci TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); 197762306a36Sopenharmony_ci 197862306a36Sopenharmony_ci xas_lock_irq(&xas); 197962306a36Sopenharmony_ci 198062306a36Sopenharmony_ci VM_BUG_ON_PAGE(page != xa_load(xas.xa, index), page); 198162306a36Sopenharmony_ci 198262306a36Sopenharmony_ci /* 198362306a36Sopenharmony_ci * We control three references to the page: 198462306a36Sopenharmony_ci * - we hold a pin on it; 198562306a36Sopenharmony_ci * - one reference from page cache; 198662306a36Sopenharmony_ci * - one from isolate_lru_page; 198762306a36Sopenharmony_ci * If those are the only references, then any new usage of the 198862306a36Sopenharmony_ci * page will have to fetch it from the page cache. That requires 198962306a36Sopenharmony_ci * locking the page to handle truncate, so any new usage will be 199062306a36Sopenharmony_ci * blocked until we unlock page after collapse/during rollback. 
199162306a36Sopenharmony_ci */ 199262306a36Sopenharmony_ci if (page_count(page) != 3) { 199362306a36Sopenharmony_ci result = SCAN_PAGE_COUNT; 199462306a36Sopenharmony_ci xas_unlock_irq(&xas); 199562306a36Sopenharmony_ci putback_lru_page(page); 199662306a36Sopenharmony_ci goto out_unlock; 199762306a36Sopenharmony_ci } 199862306a36Sopenharmony_ci 199962306a36Sopenharmony_ci /* 200062306a36Sopenharmony_ci * Accumulate the pages that are being collapsed. 200162306a36Sopenharmony_ci */ 200262306a36Sopenharmony_ci list_add_tail(&page->lru, &pagelist); 200362306a36Sopenharmony_ci continue; 200462306a36Sopenharmony_ciout_unlock: 200562306a36Sopenharmony_ci unlock_page(page); 200662306a36Sopenharmony_ci put_page(page); 200762306a36Sopenharmony_ci goto xa_unlocked; 200862306a36Sopenharmony_ci } 200962306a36Sopenharmony_ci 201062306a36Sopenharmony_ci if (!is_shmem) { 201162306a36Sopenharmony_ci filemap_nr_thps_inc(mapping); 201262306a36Sopenharmony_ci /* 201362306a36Sopenharmony_ci * Paired with smp_mb() in do_dentry_open() to ensure 201462306a36Sopenharmony_ci * i_writecount is up to date and the update to nr_thps is 201562306a36Sopenharmony_ci * visible. Ensures the page cache will be truncated if the 201662306a36Sopenharmony_ci * file is opened writable. 201762306a36Sopenharmony_ci */ 201862306a36Sopenharmony_ci smp_mb(); 201962306a36Sopenharmony_ci if (inode_is_open_for_write(mapping->host)) { 202062306a36Sopenharmony_ci result = SCAN_FAIL; 202162306a36Sopenharmony_ci filemap_nr_thps_dec(mapping); 202262306a36Sopenharmony_ci } 202362306a36Sopenharmony_ci } 202462306a36Sopenharmony_ci 202562306a36Sopenharmony_cixa_locked: 202662306a36Sopenharmony_ci xas_unlock_irq(&xas); 202762306a36Sopenharmony_cixa_unlocked: 202862306a36Sopenharmony_ci 202962306a36Sopenharmony_ci /* 203062306a36Sopenharmony_ci * If collapse is successful, flush must be done now before copying. 203162306a36Sopenharmony_ci * If collapse is unsuccessful, does flush actually need to be done? 
203262306a36Sopenharmony_ci * Do it anyway, to clear the state. 203362306a36Sopenharmony_ci */ 203462306a36Sopenharmony_ci try_to_unmap_flush(); 203562306a36Sopenharmony_ci 203662306a36Sopenharmony_ci if (result == SCAN_SUCCEED && nr_none && 203762306a36Sopenharmony_ci !shmem_charge(mapping->host, nr_none)) 203862306a36Sopenharmony_ci result = SCAN_FAIL; 203962306a36Sopenharmony_ci if (result != SCAN_SUCCEED) { 204062306a36Sopenharmony_ci nr_none = 0; 204162306a36Sopenharmony_ci goto rollback; 204262306a36Sopenharmony_ci } 204362306a36Sopenharmony_ci 204462306a36Sopenharmony_ci /* 204562306a36Sopenharmony_ci * The old pages are locked, so they won't change anymore. 204662306a36Sopenharmony_ci */ 204762306a36Sopenharmony_ci index = start; 204862306a36Sopenharmony_ci list_for_each_entry(page, &pagelist, lru) { 204962306a36Sopenharmony_ci while (index < page->index) { 205062306a36Sopenharmony_ci clear_highpage(hpage + (index % HPAGE_PMD_NR)); 205162306a36Sopenharmony_ci index++; 205262306a36Sopenharmony_ci } 205362306a36Sopenharmony_ci if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page) > 0) { 205462306a36Sopenharmony_ci result = SCAN_COPY_MC; 205562306a36Sopenharmony_ci goto rollback; 205662306a36Sopenharmony_ci } 205762306a36Sopenharmony_ci index++; 205862306a36Sopenharmony_ci } 205962306a36Sopenharmony_ci while (index < end) { 206062306a36Sopenharmony_ci clear_highpage(hpage + (index % HPAGE_PMD_NR)); 206162306a36Sopenharmony_ci index++; 206262306a36Sopenharmony_ci } 206362306a36Sopenharmony_ci 206462306a36Sopenharmony_ci if (nr_none) { 206562306a36Sopenharmony_ci struct vm_area_struct *vma; 206662306a36Sopenharmony_ci int nr_none_check = 0; 206762306a36Sopenharmony_ci 206862306a36Sopenharmony_ci i_mmap_lock_read(mapping); 206962306a36Sopenharmony_ci xas_lock_irq(&xas); 207062306a36Sopenharmony_ci 207162306a36Sopenharmony_ci xas_set(&xas, start); 207262306a36Sopenharmony_ci for (index = start; index < end; index++) { 207362306a36Sopenharmony_ci if 
(!xas_next(&xas)) { 207462306a36Sopenharmony_ci xas_store(&xas, XA_RETRY_ENTRY); 207562306a36Sopenharmony_ci if (xas_error(&xas)) { 207662306a36Sopenharmony_ci result = SCAN_STORE_FAILED; 207762306a36Sopenharmony_ci goto immap_locked; 207862306a36Sopenharmony_ci } 207962306a36Sopenharmony_ci nr_none_check++; 208062306a36Sopenharmony_ci } 208162306a36Sopenharmony_ci } 208262306a36Sopenharmony_ci 208362306a36Sopenharmony_ci if (nr_none != nr_none_check) { 208462306a36Sopenharmony_ci result = SCAN_PAGE_FILLED; 208562306a36Sopenharmony_ci goto immap_locked; 208662306a36Sopenharmony_ci } 208762306a36Sopenharmony_ci 208862306a36Sopenharmony_ci /* 208962306a36Sopenharmony_ci * If userspace observed a missing page in a VMA with a MODE_MISSING 209062306a36Sopenharmony_ci * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that 209162306a36Sopenharmony_ci * page. If so, we need to roll back to avoid suppressing such an 209262306a36Sopenharmony_ci * event. Since wp/minor userfaultfds don't give userspace any 209362306a36Sopenharmony_ci * guarantees that the kernel doesn't fill a missing page with a zero 209462306a36Sopenharmony_ci * page, so they don't matter here. 209562306a36Sopenharmony_ci * 209662306a36Sopenharmony_ci * Any userfaultfds registered after this point will not be able to 209762306a36Sopenharmony_ci * observe any missing pages due to the previously inserted retry 209862306a36Sopenharmony_ci * entries. 
209962306a36Sopenharmony_ci */ 210062306a36Sopenharmony_ci vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) { 210162306a36Sopenharmony_ci if (userfaultfd_missing(vma)) { 210262306a36Sopenharmony_ci result = SCAN_EXCEED_NONE_PTE; 210362306a36Sopenharmony_ci goto immap_locked; 210462306a36Sopenharmony_ci } 210562306a36Sopenharmony_ci } 210662306a36Sopenharmony_ci 210762306a36Sopenharmony_ciimmap_locked: 210862306a36Sopenharmony_ci i_mmap_unlock_read(mapping); 210962306a36Sopenharmony_ci if (result != SCAN_SUCCEED) { 211062306a36Sopenharmony_ci xas_set(&xas, start); 211162306a36Sopenharmony_ci for (index = start; index < end; index++) { 211262306a36Sopenharmony_ci if (xas_next(&xas) == XA_RETRY_ENTRY) 211362306a36Sopenharmony_ci xas_store(&xas, NULL); 211462306a36Sopenharmony_ci } 211562306a36Sopenharmony_ci 211662306a36Sopenharmony_ci xas_unlock_irq(&xas); 211762306a36Sopenharmony_ci goto rollback; 211862306a36Sopenharmony_ci } 211962306a36Sopenharmony_ci } else { 212062306a36Sopenharmony_ci xas_lock_irq(&xas); 212162306a36Sopenharmony_ci } 212262306a36Sopenharmony_ci 212362306a36Sopenharmony_ci nr = thp_nr_pages(hpage); 212462306a36Sopenharmony_ci if (is_shmem) 212562306a36Sopenharmony_ci __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); 212662306a36Sopenharmony_ci else 212762306a36Sopenharmony_ci __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); 212862306a36Sopenharmony_ci 212962306a36Sopenharmony_ci if (nr_none) { 213062306a36Sopenharmony_ci __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); 213162306a36Sopenharmony_ci /* nr_none is always 0 for non-shmem. */ 213262306a36Sopenharmony_ci __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); 213362306a36Sopenharmony_ci } 213462306a36Sopenharmony_ci 213562306a36Sopenharmony_ci /* 213662306a36Sopenharmony_ci * Mark hpage as uptodate before inserting it into the page cache so 213762306a36Sopenharmony_ci * that it isn't mistaken for an fallocated but unwritten page. 
213862306a36Sopenharmony_ci */ 213962306a36Sopenharmony_ci folio = page_folio(hpage); 214062306a36Sopenharmony_ci folio_mark_uptodate(folio); 214162306a36Sopenharmony_ci folio_ref_add(folio, HPAGE_PMD_NR - 1); 214262306a36Sopenharmony_ci 214362306a36Sopenharmony_ci if (is_shmem) 214462306a36Sopenharmony_ci folio_mark_dirty(folio); 214562306a36Sopenharmony_ci folio_add_lru(folio); 214662306a36Sopenharmony_ci 214762306a36Sopenharmony_ci /* Join all the small entries into a single multi-index entry. */ 214862306a36Sopenharmony_ci xas_set_order(&xas, start, HPAGE_PMD_ORDER); 214962306a36Sopenharmony_ci xas_store(&xas, hpage); 215062306a36Sopenharmony_ci WARN_ON_ONCE(xas_error(&xas)); 215162306a36Sopenharmony_ci xas_unlock_irq(&xas); 215262306a36Sopenharmony_ci 215362306a36Sopenharmony_ci /* 215462306a36Sopenharmony_ci * Remove pte page tables, so we can re-fault the page as huge. 215562306a36Sopenharmony_ci * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp(). 215662306a36Sopenharmony_ci */ 215762306a36Sopenharmony_ci retract_page_tables(mapping, start); 215862306a36Sopenharmony_ci if (cc && !cc->is_khugepaged) 215962306a36Sopenharmony_ci result = SCAN_PTE_MAPPED_HUGEPAGE; 216062306a36Sopenharmony_ci unlock_page(hpage); 216162306a36Sopenharmony_ci 216262306a36Sopenharmony_ci /* 216362306a36Sopenharmony_ci * The collapse has succeeded, so free the old pages. 
216462306a36Sopenharmony_ci */ 216562306a36Sopenharmony_ci list_for_each_entry_safe(page, tmp, &pagelist, lru) { 216662306a36Sopenharmony_ci list_del(&page->lru); 216762306a36Sopenharmony_ci page->mapping = NULL; 216862306a36Sopenharmony_ci ClearPageActive(page); 216962306a36Sopenharmony_ci ClearPageUnevictable(page); 217062306a36Sopenharmony_ci unlock_page(page); 217162306a36Sopenharmony_ci folio_put_refs(page_folio(page), 3); 217262306a36Sopenharmony_ci } 217362306a36Sopenharmony_ci 217462306a36Sopenharmony_ci goto out; 217562306a36Sopenharmony_ci 217662306a36Sopenharmony_cirollback: 217762306a36Sopenharmony_ci /* Something went wrong: roll back page cache changes */ 217862306a36Sopenharmony_ci if (nr_none) { 217962306a36Sopenharmony_ci xas_lock_irq(&xas); 218062306a36Sopenharmony_ci mapping->nrpages -= nr_none; 218162306a36Sopenharmony_ci xas_unlock_irq(&xas); 218262306a36Sopenharmony_ci shmem_uncharge(mapping->host, nr_none); 218362306a36Sopenharmony_ci } 218462306a36Sopenharmony_ci 218562306a36Sopenharmony_ci list_for_each_entry_safe(page, tmp, &pagelist, lru) { 218662306a36Sopenharmony_ci list_del(&page->lru); 218762306a36Sopenharmony_ci unlock_page(page); 218862306a36Sopenharmony_ci putback_lru_page(page); 218962306a36Sopenharmony_ci put_page(page); 219062306a36Sopenharmony_ci } 219162306a36Sopenharmony_ci /* 219262306a36Sopenharmony_ci * Undo the updates of filemap_nr_thps_inc for non-SHMEM 219362306a36Sopenharmony_ci * file only. This undo is not needed unless failure is 219462306a36Sopenharmony_ci * due to SCAN_COPY_MC. 219562306a36Sopenharmony_ci */ 219662306a36Sopenharmony_ci if (!is_shmem && result == SCAN_COPY_MC) { 219762306a36Sopenharmony_ci filemap_nr_thps_dec(mapping); 219862306a36Sopenharmony_ci /* 219962306a36Sopenharmony_ci * Paired with smp_mb() in do_dentry_open() to 220062306a36Sopenharmony_ci * ensure the update to nr_thps is visible. 
220162306a36Sopenharmony_ci */ 220262306a36Sopenharmony_ci smp_mb(); 220362306a36Sopenharmony_ci } 220462306a36Sopenharmony_ci 220562306a36Sopenharmony_ci hpage->mapping = NULL; 220662306a36Sopenharmony_ci 220762306a36Sopenharmony_ci unlock_page(hpage); 220862306a36Sopenharmony_ci put_page(hpage); 220962306a36Sopenharmony_ciout: 221062306a36Sopenharmony_ci VM_BUG_ON(!list_empty(&pagelist)); 221162306a36Sopenharmony_ci trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result); 221262306a36Sopenharmony_ci return result; 221362306a36Sopenharmony_ci} 221462306a36Sopenharmony_ci 221562306a36Sopenharmony_cistatic int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, 221662306a36Sopenharmony_ci struct file *file, pgoff_t start, 221762306a36Sopenharmony_ci struct collapse_control *cc) 221862306a36Sopenharmony_ci{ 221962306a36Sopenharmony_ci struct page *page = NULL; 222062306a36Sopenharmony_ci struct address_space *mapping = file->f_mapping; 222162306a36Sopenharmony_ci XA_STATE(xas, &mapping->i_pages, start); 222262306a36Sopenharmony_ci int present, swap; 222362306a36Sopenharmony_ci int node = NUMA_NO_NODE; 222462306a36Sopenharmony_ci int result = SCAN_SUCCEED; 222562306a36Sopenharmony_ci 222662306a36Sopenharmony_ci present = 0; 222762306a36Sopenharmony_ci swap = 0; 222862306a36Sopenharmony_ci memset(cc->node_load, 0, sizeof(cc->node_load)); 222962306a36Sopenharmony_ci nodes_clear(cc->alloc_nmask); 223062306a36Sopenharmony_ci rcu_read_lock(); 223162306a36Sopenharmony_ci xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { 223262306a36Sopenharmony_ci if (xas_retry(&xas, page)) 223362306a36Sopenharmony_ci continue; 223462306a36Sopenharmony_ci 223562306a36Sopenharmony_ci if (xa_is_value(page)) { 223662306a36Sopenharmony_ci ++swap; 223762306a36Sopenharmony_ci if (cc->is_khugepaged && 223862306a36Sopenharmony_ci swap > khugepaged_max_ptes_swap) { 223962306a36Sopenharmony_ci result = SCAN_EXCEED_SWAP_PTE; 224062306a36Sopenharmony_ci 
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); 224162306a36Sopenharmony_ci break; 224262306a36Sopenharmony_ci } 224362306a36Sopenharmony_ci continue; 224462306a36Sopenharmony_ci } 224562306a36Sopenharmony_ci 224662306a36Sopenharmony_ci /* 224762306a36Sopenharmony_ci * TODO: khugepaged should compact smaller compound pages 224862306a36Sopenharmony_ci * into a PMD sized page 224962306a36Sopenharmony_ci */ 225062306a36Sopenharmony_ci if (PageTransCompound(page)) { 225162306a36Sopenharmony_ci struct page *head = compound_head(page); 225262306a36Sopenharmony_ci 225362306a36Sopenharmony_ci result = compound_order(head) == HPAGE_PMD_ORDER && 225462306a36Sopenharmony_ci head->index == start 225562306a36Sopenharmony_ci /* Maybe PMD-mapped */ 225662306a36Sopenharmony_ci ? SCAN_PTE_MAPPED_HUGEPAGE 225762306a36Sopenharmony_ci : SCAN_PAGE_COMPOUND; 225862306a36Sopenharmony_ci /* 225962306a36Sopenharmony_ci * For SCAN_PTE_MAPPED_HUGEPAGE, further processing 226062306a36Sopenharmony_ci * by the caller won't touch the page cache, and so 226162306a36Sopenharmony_ci * it's safe to skip LRU and refcount checks before 226262306a36Sopenharmony_ci * returning. 
226362306a36Sopenharmony_ci */ 226462306a36Sopenharmony_ci break; 226562306a36Sopenharmony_ci } 226662306a36Sopenharmony_ci 226762306a36Sopenharmony_ci node = page_to_nid(page); 226862306a36Sopenharmony_ci if (hpage_collapse_scan_abort(node, cc)) { 226962306a36Sopenharmony_ci result = SCAN_SCAN_ABORT; 227062306a36Sopenharmony_ci break; 227162306a36Sopenharmony_ci } 227262306a36Sopenharmony_ci cc->node_load[node]++; 227362306a36Sopenharmony_ci 227462306a36Sopenharmony_ci if (!PageLRU(page)) { 227562306a36Sopenharmony_ci result = SCAN_PAGE_LRU; 227662306a36Sopenharmony_ci break; 227762306a36Sopenharmony_ci } 227862306a36Sopenharmony_ci 227962306a36Sopenharmony_ci if (page_count(page) != 228062306a36Sopenharmony_ci 1 + page_mapcount(page) + page_has_private(page)) { 228162306a36Sopenharmony_ci result = SCAN_PAGE_COUNT; 228262306a36Sopenharmony_ci break; 228362306a36Sopenharmony_ci } 228462306a36Sopenharmony_ci 228562306a36Sopenharmony_ci /* 228662306a36Sopenharmony_ci * We probably should check if the page is referenced here, but 228762306a36Sopenharmony_ci * nobody would transfer pte_young() to PageReferenced() for us. 228862306a36Sopenharmony_ci * And rmap walk here is just too costly... 
228962306a36Sopenharmony_ci */ 229062306a36Sopenharmony_ci 229162306a36Sopenharmony_ci present++; 229262306a36Sopenharmony_ci 229362306a36Sopenharmony_ci if (need_resched()) { 229462306a36Sopenharmony_ci xas_pause(&xas); 229562306a36Sopenharmony_ci cond_resched_rcu(); 229662306a36Sopenharmony_ci } 229762306a36Sopenharmony_ci } 229862306a36Sopenharmony_ci rcu_read_unlock(); 229962306a36Sopenharmony_ci 230062306a36Sopenharmony_ci if (result == SCAN_SUCCEED) { 230162306a36Sopenharmony_ci if (cc->is_khugepaged && 230262306a36Sopenharmony_ci present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { 230362306a36Sopenharmony_ci result = SCAN_EXCEED_NONE_PTE; 230462306a36Sopenharmony_ci count_vm_event(THP_SCAN_EXCEED_NONE_PTE); 230562306a36Sopenharmony_ci } else { 230662306a36Sopenharmony_ci result = collapse_file(mm, addr, file, start, cc); 230762306a36Sopenharmony_ci } 230862306a36Sopenharmony_ci } 230962306a36Sopenharmony_ci 231062306a36Sopenharmony_ci trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result); 231162306a36Sopenharmony_ci return result; 231262306a36Sopenharmony_ci} 231362306a36Sopenharmony_ci#else 231462306a36Sopenharmony_cistatic int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, 231562306a36Sopenharmony_ci struct file *file, pgoff_t start, 231662306a36Sopenharmony_ci struct collapse_control *cc) 231762306a36Sopenharmony_ci{ 231862306a36Sopenharmony_ci BUILD_BUG(); 231962306a36Sopenharmony_ci} 232062306a36Sopenharmony_ci#endif 232162306a36Sopenharmony_ci 232262306a36Sopenharmony_cistatic unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, 232362306a36Sopenharmony_ci struct collapse_control *cc) 232462306a36Sopenharmony_ci __releases(&khugepaged_mm_lock) 232562306a36Sopenharmony_ci __acquires(&khugepaged_mm_lock) 232662306a36Sopenharmony_ci{ 232762306a36Sopenharmony_ci struct vma_iterator vmi; 232862306a36Sopenharmony_ci struct khugepaged_mm_slot *mm_slot; 232962306a36Sopenharmony_ci struct mm_slot *slot; 
233062306a36Sopenharmony_ci struct mm_struct *mm; 233162306a36Sopenharmony_ci struct vm_area_struct *vma; 233262306a36Sopenharmony_ci int progress = 0; 233362306a36Sopenharmony_ci 233462306a36Sopenharmony_ci VM_BUG_ON(!pages); 233562306a36Sopenharmony_ci lockdep_assert_held(&khugepaged_mm_lock); 233662306a36Sopenharmony_ci *result = SCAN_FAIL; 233762306a36Sopenharmony_ci 233862306a36Sopenharmony_ci if (khugepaged_scan.mm_slot) { 233962306a36Sopenharmony_ci mm_slot = khugepaged_scan.mm_slot; 234062306a36Sopenharmony_ci slot = &mm_slot->slot; 234162306a36Sopenharmony_ci } else { 234262306a36Sopenharmony_ci slot = list_entry(khugepaged_scan.mm_head.next, 234362306a36Sopenharmony_ci struct mm_slot, mm_node); 234462306a36Sopenharmony_ci mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); 234562306a36Sopenharmony_ci khugepaged_scan.address = 0; 234662306a36Sopenharmony_ci khugepaged_scan.mm_slot = mm_slot; 234762306a36Sopenharmony_ci } 234862306a36Sopenharmony_ci spin_unlock(&khugepaged_mm_lock); 234962306a36Sopenharmony_ci 235062306a36Sopenharmony_ci mm = slot->mm; 235162306a36Sopenharmony_ci /* 235262306a36Sopenharmony_ci * Don't wait for semaphore (to avoid long wait times). Just move to 235362306a36Sopenharmony_ci * the next mm on the list. 
235462306a36Sopenharmony_ci */ 235562306a36Sopenharmony_ci vma = NULL; 235662306a36Sopenharmony_ci if (unlikely(!mmap_read_trylock(mm))) 235762306a36Sopenharmony_ci goto breakouterloop_mmap_lock; 235862306a36Sopenharmony_ci 235962306a36Sopenharmony_ci progress++; 236062306a36Sopenharmony_ci if (unlikely(hpage_collapse_test_exit(mm))) 236162306a36Sopenharmony_ci goto breakouterloop; 236262306a36Sopenharmony_ci 236362306a36Sopenharmony_ci vma_iter_init(&vmi, mm, khugepaged_scan.address); 236462306a36Sopenharmony_ci for_each_vma(vmi, vma) { 236562306a36Sopenharmony_ci unsigned long hstart, hend; 236662306a36Sopenharmony_ci 236762306a36Sopenharmony_ci cond_resched(); 236862306a36Sopenharmony_ci if (unlikely(hpage_collapse_test_exit(mm))) { 236962306a36Sopenharmony_ci progress++; 237062306a36Sopenharmony_ci break; 237162306a36Sopenharmony_ci } 237262306a36Sopenharmony_ci if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) { 237362306a36Sopenharmony_ciskip: 237462306a36Sopenharmony_ci progress++; 237562306a36Sopenharmony_ci continue; 237662306a36Sopenharmony_ci } 237762306a36Sopenharmony_ci hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); 237862306a36Sopenharmony_ci hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); 237962306a36Sopenharmony_ci if (khugepaged_scan.address > hend) 238062306a36Sopenharmony_ci goto skip; 238162306a36Sopenharmony_ci if (khugepaged_scan.address < hstart) 238262306a36Sopenharmony_ci khugepaged_scan.address = hstart; 238362306a36Sopenharmony_ci VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); 238462306a36Sopenharmony_ci 238562306a36Sopenharmony_ci while (khugepaged_scan.address < hend) { 238662306a36Sopenharmony_ci bool mmap_locked = true; 238762306a36Sopenharmony_ci 238862306a36Sopenharmony_ci cond_resched(); 238962306a36Sopenharmony_ci if (unlikely(hpage_collapse_test_exit(mm))) 239062306a36Sopenharmony_ci goto breakouterloop; 239162306a36Sopenharmony_ci 239262306a36Sopenharmony_ci VM_BUG_ON(khugepaged_scan.address < hstart || 
239362306a36Sopenharmony_ci khugepaged_scan.address + HPAGE_PMD_SIZE > 239462306a36Sopenharmony_ci hend); 239562306a36Sopenharmony_ci if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { 239662306a36Sopenharmony_ci struct file *file = get_file(vma->vm_file); 239762306a36Sopenharmony_ci pgoff_t pgoff = linear_page_index(vma, 239862306a36Sopenharmony_ci khugepaged_scan.address); 239962306a36Sopenharmony_ci 240062306a36Sopenharmony_ci mmap_read_unlock(mm); 240162306a36Sopenharmony_ci mmap_locked = false; 240262306a36Sopenharmony_ci *result = hpage_collapse_scan_file(mm, 240362306a36Sopenharmony_ci khugepaged_scan.address, file, pgoff, cc); 240462306a36Sopenharmony_ci fput(file); 240562306a36Sopenharmony_ci if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { 240662306a36Sopenharmony_ci mmap_read_lock(mm); 240762306a36Sopenharmony_ci if (hpage_collapse_test_exit(mm)) 240862306a36Sopenharmony_ci goto breakouterloop; 240962306a36Sopenharmony_ci *result = collapse_pte_mapped_thp(mm, 241062306a36Sopenharmony_ci khugepaged_scan.address, false); 241162306a36Sopenharmony_ci if (*result == SCAN_PMD_MAPPED) 241262306a36Sopenharmony_ci *result = SCAN_SUCCEED; 241362306a36Sopenharmony_ci mmap_read_unlock(mm); 241462306a36Sopenharmony_ci } 241562306a36Sopenharmony_ci } else { 241662306a36Sopenharmony_ci *result = hpage_collapse_scan_pmd(mm, vma, 241762306a36Sopenharmony_ci khugepaged_scan.address, &mmap_locked, cc); 241862306a36Sopenharmony_ci } 241962306a36Sopenharmony_ci 242062306a36Sopenharmony_ci if (*result == SCAN_SUCCEED) 242162306a36Sopenharmony_ci ++khugepaged_pages_collapsed; 242262306a36Sopenharmony_ci 242362306a36Sopenharmony_ci /* move to next address */ 242462306a36Sopenharmony_ci khugepaged_scan.address += HPAGE_PMD_SIZE; 242562306a36Sopenharmony_ci progress += HPAGE_PMD_NR; 242662306a36Sopenharmony_ci if (!mmap_locked) 242762306a36Sopenharmony_ci /* 242862306a36Sopenharmony_ci * We released mmap_lock so break loop. 
Note 242962306a36Sopenharmony_ci * that we drop mmap_lock before all hugepage 243062306a36Sopenharmony_ci * allocations, so if allocation fails, we are 243162306a36Sopenharmony_ci * guaranteed to break here and report the 243262306a36Sopenharmony_ci * correct result back to caller. 243362306a36Sopenharmony_ci */ 243462306a36Sopenharmony_ci goto breakouterloop_mmap_lock; 243562306a36Sopenharmony_ci if (progress >= pages) 243662306a36Sopenharmony_ci goto breakouterloop; 243762306a36Sopenharmony_ci } 243862306a36Sopenharmony_ci } 243962306a36Sopenharmony_cibreakouterloop: 244062306a36Sopenharmony_ci mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ 244162306a36Sopenharmony_cibreakouterloop_mmap_lock: 244262306a36Sopenharmony_ci 244362306a36Sopenharmony_ci spin_lock(&khugepaged_mm_lock); 244462306a36Sopenharmony_ci VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); 244562306a36Sopenharmony_ci /* 244662306a36Sopenharmony_ci * Release the current mm_slot if this mm is about to die, or 244762306a36Sopenharmony_ci * if we scanned all vmas of this mm. 244862306a36Sopenharmony_ci */ 244962306a36Sopenharmony_ci if (hpage_collapse_test_exit(mm) || !vma) { 245062306a36Sopenharmony_ci /* 245162306a36Sopenharmony_ci * Make sure that if mm_users is reaching zero while 245262306a36Sopenharmony_ci * khugepaged runs here, khugepaged_exit will find 245362306a36Sopenharmony_ci * mm_slot not pointing to the exiting mm. 
245462306a36Sopenharmony_ci */ 245562306a36Sopenharmony_ci if (slot->mm_node.next != &khugepaged_scan.mm_head) { 245662306a36Sopenharmony_ci slot = list_entry(slot->mm_node.next, 245762306a36Sopenharmony_ci struct mm_slot, mm_node); 245862306a36Sopenharmony_ci khugepaged_scan.mm_slot = 245962306a36Sopenharmony_ci mm_slot_entry(slot, struct khugepaged_mm_slot, slot); 246062306a36Sopenharmony_ci khugepaged_scan.address = 0; 246162306a36Sopenharmony_ci } else { 246262306a36Sopenharmony_ci khugepaged_scan.mm_slot = NULL; 246362306a36Sopenharmony_ci khugepaged_full_scans++; 246462306a36Sopenharmony_ci } 246562306a36Sopenharmony_ci 246662306a36Sopenharmony_ci collect_mm_slot(mm_slot); 246762306a36Sopenharmony_ci } 246862306a36Sopenharmony_ci 246962306a36Sopenharmony_ci return progress; 247062306a36Sopenharmony_ci} 247162306a36Sopenharmony_ci 247262306a36Sopenharmony_cistatic int khugepaged_has_work(void) 247362306a36Sopenharmony_ci{ 247462306a36Sopenharmony_ci return !list_empty(&khugepaged_scan.mm_head) && 247562306a36Sopenharmony_ci hugepage_flags_enabled(); 247662306a36Sopenharmony_ci} 247762306a36Sopenharmony_ci 247862306a36Sopenharmony_cistatic int khugepaged_wait_event(void) 247962306a36Sopenharmony_ci{ 248062306a36Sopenharmony_ci return !list_empty(&khugepaged_scan.mm_head) || 248162306a36Sopenharmony_ci kthread_should_stop(); 248262306a36Sopenharmony_ci} 248362306a36Sopenharmony_ci 248462306a36Sopenharmony_cistatic void khugepaged_do_scan(struct collapse_control *cc) 248562306a36Sopenharmony_ci{ 248662306a36Sopenharmony_ci unsigned int progress = 0, pass_through_head = 0; 248762306a36Sopenharmony_ci unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); 248862306a36Sopenharmony_ci bool wait = true; 248962306a36Sopenharmony_ci int result = SCAN_SUCCEED; 249062306a36Sopenharmony_ci 249162306a36Sopenharmony_ci lru_add_drain_all(); 249262306a36Sopenharmony_ci 249362306a36Sopenharmony_ci while (true) { 249462306a36Sopenharmony_ci cond_resched(); 
249562306a36Sopenharmony_ci 249662306a36Sopenharmony_ci if (unlikely(kthread_should_stop() || try_to_freeze())) 249762306a36Sopenharmony_ci break; 249862306a36Sopenharmony_ci 249962306a36Sopenharmony_ci spin_lock(&khugepaged_mm_lock); 250062306a36Sopenharmony_ci if (!khugepaged_scan.mm_slot) 250162306a36Sopenharmony_ci pass_through_head++; 250262306a36Sopenharmony_ci if (khugepaged_has_work() && 250362306a36Sopenharmony_ci pass_through_head < 2) 250462306a36Sopenharmony_ci progress += khugepaged_scan_mm_slot(pages - progress, 250562306a36Sopenharmony_ci &result, cc); 250662306a36Sopenharmony_ci else 250762306a36Sopenharmony_ci progress = pages; 250862306a36Sopenharmony_ci spin_unlock(&khugepaged_mm_lock); 250962306a36Sopenharmony_ci 251062306a36Sopenharmony_ci if (progress >= pages) 251162306a36Sopenharmony_ci break; 251262306a36Sopenharmony_ci 251362306a36Sopenharmony_ci if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { 251462306a36Sopenharmony_ci /* 251562306a36Sopenharmony_ci * If fail to allocate the first time, try to sleep for 251662306a36Sopenharmony_ci * a while. When hit again, cancel the scan. 
		 */
			if (!wait)
				break;
			wait = false;
			khugepaged_alloc_sleep();
		}
	}
}

/*
 * Returns true when the khugepaged thread should stop waiting: either the
 * kthread has been asked to stop, or the scan-interval deadline
 * (khugepaged_sleep_expire) has passed.
 */
static bool khugepaged_should_wakeup(void)
{
	return kthread_should_stop() ||
	       time_after_eq(jiffies, khugepaged_sleep_expire);
}

/*
 * Sleep between scan passes.  If there is pending work, sleep only for the
 * configured scan interval (khugepaged_scan_sleep_millisecs); a zero interval
 * means "no sleep" and we return immediately.  With no pending work, block
 * on khugepaged_wait until khugepaged_wait_event() becomes true — but only
 * while hugepages are enabled.  Both waits are freezable so the thread does
 * not block system suspend.
 */
static void khugepaged_wait_work(void)
{
	if (khugepaged_has_work()) {
		const unsigned long scan_sleep_jiffies =
			msecs_to_jiffies(khugepaged_scan_sleep_millisecs);

		if (!scan_sleep_jiffies)
			return;

		/* Record the deadline consulted by khugepaged_should_wakeup(). */
		khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
		wait_event_freezable_timeout(khugepaged_wait,
					     khugepaged_should_wakeup(),
					     scan_sleep_jiffies);
		return;
	}

	if (hugepage_flags_enabled())
		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}

/*
 * Main loop of the khugepaged kernel thread: alternate between scanning
 * mm slots for collapse candidates and sleeping, until asked to stop.
 * On exit, detach and release the slot the scan cursor was parked on so
 * no mm reference is leaked.
 *
 * @none: unused kthread argument.
 *
 * Returns 0 (kthread exit code).
 */
static int khugepaged(void *none)
{
	struct khugepaged_mm_slot *mm_slot;

	set_freezable();
	/* Background compaction work: run at the lowest priority. */
	set_user_nice(current, MAX_NICE);

	while (!kthread_should_stop()) {
		khugepaged_do_scan(&khugepaged_collapse_control);
		khugepaged_wait_work();
	}

	/* Drop the scan cursor's slot under khugepaged_mm_lock. */
	spin_lock(&khugepaged_mm_lock);
	mm_slot = khugepaged_scan.mm_slot;
	khugepaged_scan.mm_slot = NULL;
	if (mm_slot)
		collect_mm_slot(mm_slot);
	spin_unlock(&khugepaged_mm_lock);
	return 0;
}

/*
 * Raise min_free_kbytes so that enough pageblocks stay free to assist
 * anti-fragmentation while hugepages are enabled; when they are disabled,
 * fall back to the generic calculation.  Either way the zone watermarks
 * are recomputed before returning.
 */
static void set_recommended_min_free_kbytes(void)
{
	struct zone *zone;
	int nr_zones = 0;
	unsigned long recommended_min;

	if (!hugepage_flags_enabled()) {
		calculate_min_free_kbytes();
		goto update_wmarks;
	}

	for_each_populated_zone(zone) {
		/*
		 * We don't need to worry about fragmentation of
		 * ZONE_MOVABLE since it only has movable pages.
		 */
		if (zone_idx(zone) > gfp_zone(GFP_USER))
			continue;

		nr_zones++;
	}

	/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
	recommended_min = pageblock_nr_pages * nr_zones * 2;

	/*
	 * Make sure that on average at least two pageblocks are almost free
	 * of another type, one for a migratetype to fall back to and a
	 * second to avoid subsequent fallbacks of other types There are 3
	 * MIGRATE_TYPES we care about.
	 */
	recommended_min += pageblock_nr_pages * nr_zones *
			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;

	/* don't ever allow to reserve more than 5% of the lowmem */
	recommended_min = min(recommended_min,
			      (unsigned long) nr_free_buffer_pages() / 20);
	/* Convert pages to kilobytes (PAGE_SHIFT - 10 = log2(PAGE_SIZE/1K)). */
	recommended_min <<= (PAGE_SHIFT-10);

	if (recommended_min > min_free_kbytes) {
		if (user_min_free_kbytes >= 0)
			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
				min_free_kbytes, recommended_min);

		min_free_kbytes = recommended_min;
	}

update_wmarks:
	setup_per_zone_wmarks();
}
/*
 * Start or stop the khugepaged thread to match the current hugepage
 * enablement state, then refresh the recommended min_free_kbytes.
 * Serialized against concurrent callers by khugepaged_mutex.
 *
 * Returns 0 on success or the kthread_run() error code on failure.
 */
int start_stop_khugepaged(void)
{
	int err = 0;

	mutex_lock(&khugepaged_mutex);
	if (hugepage_flags_enabled()) {
		if (!khugepaged_thread)
			khugepaged_thread = kthread_run(khugepaged, NULL,
							"khugepaged");
		if (IS_ERR(khugepaged_thread)) {
			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
			err = PTR_ERR(khugepaged_thread);
			khugepaged_thread = NULL;
			goto fail;
		}

		/* Kick the thread immediately if there are mms to scan. */
		if (!list_empty(&khugepaged_scan.mm_head))
			wake_up_interruptible(&khugepaged_wait);
	} else if (khugepaged_thread) {
		kthread_stop(khugepaged_thread);
		khugepaged_thread = NULL;
	}
	set_recommended_min_free_kbytes();
fail:
	mutex_unlock(&khugepaged_mutex);
	return err;
}

/*
 * Re-derive the recommended min_free_kbytes, but only while hugepages are
 * enabled and the khugepaged thread is running.  Takes khugepaged_mutex to
 * serialize against start_stop_khugepaged().
 */
void khugepaged_min_free_kbytes_update(void)
{
	mutex_lock(&khugepaged_mutex);
	if (hugepage_flags_enabled() && khugepaged_thread)
		set_recommended_min_free_kbytes();
	mutex_unlock(&khugepaged_mutex);
}

/* Returns true if the current task is the khugepaged kernel thread. */
bool current_is_khugepaged(void)
{
	return kthread_func(current) == khugepaged;
}

/*
 * Map an internal scan_result onto the errno reported to the
 * madvise(MADV_COLLAPSE) caller.
 *
 * @r: the last failing scan result for the range.
 *
 * Returns -ENOMEM, -EBUSY, -EAGAIN or -EINVAL (see cases below).
 */
static int madvise_collapse_errno(enum scan_result r)
{
	/*
	 * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
	 * actionable feedback to caller, so they may take an appropriate
	 * fallback measure depending on the nature of the failure.
	 */
	switch (r) {
	case SCAN_ALLOC_HUGE_PAGE_FAIL:
		return -ENOMEM;
	case SCAN_CGROUP_CHARGE_FAIL:
	case SCAN_EXCEED_NONE_PTE:
		return -EBUSY;
	/* Resource temporary unavailable - trying again might succeed */
	case SCAN_PAGE_COUNT:
	case SCAN_PAGE_LOCK:
	case SCAN_PAGE_LRU:
	case SCAN_DEL_PAGE_LRU:
	case SCAN_PAGE_FILLED:
		return -EAGAIN;
	/*
	 * Other: Trying again likely not to succeed / error intrinsic to
	 * specified memory range. khugepaged likely won't be able to collapse
	 * either.
	 */
	default:
		return -EINVAL;
	}
}

/*
 * madvise(MADV_COLLAPSE) backend: attempt to collapse each PMD-aligned
 * subrange of [start, end) in @vma into a huge page, synchronously.
 *
 * @vma:   the VMA covering the range (caller guarantees containment,
 *         asserted via BUG_ON below).
 * @prev:  madvise walk cursor; set to NULL if we dropped mmap_lock so the
 *         caller knows to re-find its position.
 * @start: range start (byte address).
 * @end:   range end (byte address, exclusive).
 *
 * Returns 0 if every hugepage-aligned chunk in the range ends up
 * PMD-mapped, otherwise an errno derived from the last failure.
 * The caller expects mmap_lock to be held (read mode) on return.
 */
int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
		     unsigned long start, unsigned long end)
{
	struct collapse_control *cc;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long hstart, hend, addr;
	int thps = 0, last_fail = SCAN_FAIL;
	bool mmap_locked = true;

	BUG_ON(vma->vm_start > start);
	BUG_ON(vma->vm_end < end);

	*prev = vma;

	if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
		return -EINVAL;

	cc = kmalloc(sizeof(*cc), GFP_KERNEL);
	if (!cc)
		return -ENOMEM;
	/* Distinguishes MADV_COLLAPSE tuning/accounting from khugepaged's. */
	cc->is_khugepaged = false;

	/* Pin the mm_struct itself (not the address space) across lock drops. */
	mmgrab(mm);
	lru_add_drain_all();

	/* Round inward to the hugepage-aligned portion of the range. */
	hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	hend = end & HPAGE_PMD_MASK;

	for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
		int result = SCAN_FAIL;

		if (!mmap_locked) {
			/*
			 * The lock was dropped during the previous iteration;
			 * retake it and revalidate the VMA before continuing.
			 */
			cond_resched();
			mmap_read_lock(mm);
			mmap_locked = true;
			result = hugepage_vma_revalidate(mm, addr, false, &vma,
							 cc);
			if (result  != SCAN_SUCCEED) {
				last_fail = result;
				goto out_nolock;
			}

			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
		}
		mmap_assert_locked(mm);
		/* Reset per-iteration NUMA placement state. */
		memset(cc->node_load, 0, sizeof(cc->node_load));
		nodes_clear(cc->alloc_nmask);
		if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
			/* Hold a file ref since we drop mmap_lock to scan it. */
			struct file *file = get_file(vma->vm_file);
			pgoff_t pgoff = linear_page_index(vma, addr);

			mmap_read_unlock(mm);
			mmap_locked = false;
			result = hpage_collapse_scan_file(mm, addr, file, pgoff,
							  cc);
			fput(file);
		} else {
			result = hpage_collapse_scan_pmd(mm, vma, addr,
							 &mmap_locked, cc);
		}
		if (!mmap_locked)
			*prev = NULL;  /* Tell caller we dropped mmap_lock */

handle_result:
		switch (result) {
		case SCAN_SUCCEED:
		case SCAN_PMD_MAPPED:
			++thps;
			break;
		case SCAN_PTE_MAPPED_HUGEPAGE:
			/*
			 * Huge page already present but PTE-mapped; install
			 * the PMD mapping, then reprocess the new result.
			 */
			BUG_ON(mmap_locked);
			BUG_ON(*prev);
			mmap_read_lock(mm);
			result = collapse_pte_mapped_thp(mm, addr, true);
			mmap_read_unlock(mm);
			goto handle_result;
		/* Whitelisted set of results where continuing OK */
		case SCAN_PMD_NULL:
		case SCAN_PTE_NON_PRESENT:
		case SCAN_PTE_UFFD_WP:
		case SCAN_PAGE_RO:
		case SCAN_LACK_REFERENCED_PAGE:
		case SCAN_PAGE_NULL:
		case SCAN_PAGE_COUNT:
		case SCAN_PAGE_LOCK:
		case SCAN_PAGE_COMPOUND:
		case SCAN_PAGE_LRU:
		case SCAN_DEL_PAGE_LRU:
			last_fail = result;
			break;
		default:
			last_fail = result;
			/* Other error, exit */
			goto out_maybelock;
		}
	}

out_maybelock:
	/* Caller expects us to hold mmap_lock on return */
	if (!mmap_locked)
		mmap_read_lock(mm);
out_nolock:
	mmap_assert_locked(mm);
	mmdrop(mm);
	kfree(cc);

	/* Success only if the whole aligned range is now PMD-mapped. */
	return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
			: madvise_collapse_errno(last_fail);
}