// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2009  Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
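 *
 * Note: which of the defaults below actually takes effect depends on
 * CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS / _MADVISE; see the initializer of
 * transparent_hugepage_flags just below.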
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker deferred_split_shrinker;

static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;

bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
			bool smaps, bool in_pf, bool enforce_sysfs)
{
	if (!vma->vm_mm)		/* vdso */
		return false;

	/*
	 * Explicitly disabled through madvise or prctl, or some
	 * architectures may disable THP for some mappings, for
	 * example, s390 kvm.
	 */
	if ((vm_flags & VM_NOHUGEPAGE) ||
	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
		return false;
	/*
	 * If the hardware/firmware has marked hugepage support as disabled.
	 */
	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
		return false;

	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
	if (vma_is_dax(vma))
		return in_pf;

	/*
	 * Special VMA and hugetlb VMA.
	 * Must be checked after dax since some dax mappings may have
	 * VM_MIXEDMAP set.
	 */
	if (vm_flags & VM_NO_KHUGEPAGED)
		return false;

	/*
	 * Check alignment for file vma and size for both file and anon vma.
	 *
	 * Skip the check for page fault. Huge fault does the check in fault
	 * handlers. And this check is not suitable for huge PUD fault.
	 */
	if (!in_pf &&
	    !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
		return false;

	/*
	 * Enabled via shmem mount options or sysfs settings.
	 * Must be done before hugepage flags check since shmem has its
	 * own flags.
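	 * (e.g. a tmpfs mount with huge= set overrides the global sysfs
	 * setting for mappings of that file).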
	 */
	if (!in_pf && shmem_file(vma->vm_file))
		return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
				     !enforce_sysfs, vma->vm_mm, vm_flags);

	/* Enforce sysfs THP requirements as necessary */
	if (enforce_sysfs &&
	    (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
					   !hugepage_flags_always())))
		return false;

	/* Only regular file is valid */
	if (!in_pf && file_thp_enabled(vma))
		return true;

	if (!vma_is_anonymous(vma))
		return false;

	if (vma_is_temporary_stack(vma))
		return false;

	/*
	 * THPeligible bit of smaps should show 1 for proper VMAs even
	 * though anon_vma is not initialized yet.
	 *
	 * Allow page fault since anon_vma may be not initialized until
	 * the first page fault.
	 */
	if (!vma->anon_vma)
		return (smaps || in_pf);

	return true;
}

static bool get_huge_zero_page(void)
{
	struct page *zero_page;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return true;

	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_page) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return false;
	}
	preempt_disable();
	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
		preempt_enable();
		__free_pages(zero_page, compound_order(zero_page));
		goto retry;
	}
	WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	return true;
}

static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
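	 * (get_huge_zero_page() sets the count to 2, so the shrinker always
	 * holds one reference of its own).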
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

struct page *mm_get_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_page);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_page);
}

void mm_put_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct page *zero_page = xchg(&huge_zero_page, NULL);
		BUG_ON(zero_page == NULL);
		WRITE_ONCE(huge_zero_pfn, ~0UL);
		__free_pages(zero_page, compound_order(zero_page));
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker huge_zero_page_shrinker = {
	.count_objects = shrink_huge_zero_page_count,
	.scan_objects = shrink_huge_zero_page_scan,
	.seeks = DEFAULT_SEEKS,
};

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		output = "[always] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always [madvise] never";
	else
		output = "always madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);

ssize_t single_hugepage_flag_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf,
				  enum transparent_hugepage_flag flag)
{
	return sysfs_emit(buf, "%d\n",
			  !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count,
				   enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
		     &transparent_hugepage_flags))
		output = "[always] defer defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
			  &transparent_hugepage_flags))
		output = "always [defer] defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer [defer+madvise] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer defer+madvise [madvise] never";
	else
		output = "always defer defer+madvise madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer+madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);

static ssize_t use_zero_page_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
					  TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
	&hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
	&shmem_enabled_attr.attr,
#endif
	NULL,
};

static const struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	return 0;

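/* Error unwind: undo the sysfs setup in reverse order of creation. */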
remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
		return -EINVAL;
	}

	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_ORDER);
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
	if (err)
		goto err_hzp_shrinker;
	err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split");
	if (err)
		goto err_split_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save. The admin can still enable it through /sys.
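	 * (the cut-off used below is 512MB of total RAM).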
	 */
	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	unregister_shrinker(&deferred_split_shrinker);
err_split_shrinker:
	unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd, vma);
	return pmd;
}

#ifdef CONFIG_MEMCG
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	if (memcg)
		return &memcg->deferred_split_queue;
	else
		return &pgdat->deferred_split_queue;
}
#else
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	return &pgdat->deferred_split_queue;
}
#endif

void folio_prep_large_rmappable(struct folio *folio)
{
	VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
	INIT_LIST_HEAD(&folio->_deferred_list);
	folio_set_large_rmappable(folio);
}

static inline bool is_transparent_hugepage(struct folio *folio)
{
	if (!folio_test_large(folio))
		return false;

	return is_huge_zero_page(&folio->page) ||
	       folio_test_large_rmappable(folio);
}

static unsigned long __thp_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size)
{
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad, ret;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	ret = current->mm->get_unmapped_area(filp, addr, len_pad,
					     off >> PAGE_SHIFT, flags);

	/*
	 * The failure might be due to length padding. The caller will retry
	 * without the padding.
	 */
	if (IS_ERR_VALUE(ret))
		return 0;

	/*
	 * Do not try to align to THP boundary if allocation at the address
	 * hint succeeds.
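	 * (a caller-supplied hint address takes precedence over alignment).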
	 */
	if (ret == addr)
		return addr;

	ret += (off - ret) & (size - 1);
	return ret;
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	unsigned long ret;
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
	if (ret)
		return ret;

	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
			struct page *page, gfp_t gfp)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio = page_folio(page);
	pgtable_t pgtable;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret = 0;

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
		folio_put(folio);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		return VM_FAULT_FALLBACK;
	}
	folio_throttle_swaprate(folio, gfp);

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
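	 * (no one must be able to observe the new PMD before the zeroed
	 * contents are visible).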
	 */
	__folio_mark_uptodate(folio);

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		goto unlock_release;
	} else {
		pmd_t entry;

		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			spin_unlock(vmf->ptl);
			folio_put(folio);
			pte_free(vma->vm_mm, pgtable);
			ret = handle_userfault(vmf, VM_UFFD_MISSING);
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}

		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		folio_add_new_anon_rmap(folio, vma, haddr);
		folio_add_lru_vma(folio, vma);
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(vma->vm_mm);
		spin_unlock(vmf->ptl);
		count_vm_event(THP_FAULT_ALLOC);
		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
	}

	return 0;
unlock_release:
	spin_unlock(vmf->ptl);
release:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	folio_put(folio);
	return ret;

}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
 */
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);

	/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

	return GFP_TRANSHUGE_LIGHT;
}

/* Caller must hold page table lock. */
static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct page *zero_page)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return;
	entry = mk_pmd(zero_page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	mm_inc_nr_ptes(mm);
}

vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp;
	struct folio *folio;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;

	if (!transhuge_vma_suitable(vma, haddr))
		return VM_FAULT_FALLBACK;
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;
	khugepaged_enter_vma(vma, vma->vm_flags);

	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm) &&
			transparent_hugepage_use_zero_page()) {
		pgtable_t pgtable;
		struct page *zero_page;
		vm_fault_t ret;
		pgtable = pte_alloc_one(vma->vm_mm);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_page = mm_get_huge_zero_page(vma->vm_mm);
		if (unlikely(!zero_page)) {
			pte_free(vma->vm_mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
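		/*
		 * Map the huge zero page under the PMD lock; a racing
		 * fault or an armed userfaultfd is handled below.
		 */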
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		ret = 0;
		if (pmd_none(*vmf->pmd)) {
			ret = check_stable_address_space(vma->vm_mm);
			if (ret) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
			} else if (userfaultfd_missing(vma)) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
				ret = handle_userfault(vmf, VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_page(pgtable, vma->vm_mm, vma,
						   haddr, vmf->pmd, zero_page);
				update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
				spin_unlock(vmf->ptl);
			}
		} else {
			spin_unlock(vmf->ptl);
			pte_free(vma->vm_mm, pgtable);
		}
		return ret;
	}
	gfp = vma_thp_gfp_mask(vma);
	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
}

static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
		pgtable_t pgtable)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t entry;
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	if (!pmd_none(*pmd)) {
		if (write) {
			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
				goto out_unlock;
			}
			entry = pmd_mkyoung(*pmd);
			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
				update_mmu_cache_pmd(vma, addr, pmd);
		}

		goto out_unlock;
	}

	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);
	if (write) {
		entry = pmd_mkyoung(pmd_mkdirty(entry));
		entry = maybe_pmd_mkwrite(entry, vma);
	}

	if (pgtable) {
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
		mm_inc_nr_ptes(mm);
		pgtable = NULL;
	}

	set_pmd_at(mm, addr, pmd, entry);
	update_mmu_cache_pmd(vma, addr, pmd);

out_unlock:
	spin_unlock(ptl);
	if (pgtable)
		pte_free(mm, pgtable);
}

/**
 * vmf_insert_pfn_pmd - insert a pmd size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
	unsigned long addr = vmf->address & PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgprot_t pgprot = vma->vm_page_prot;
	pgtable_t pgtable = NULL;

	/*
	 * If we had pmd_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
			(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (arch_needs_pgtable_deposit()) {
		pgtable = pte_alloc_one(vma->vm_mm);
		if (!pgtable)
			return VM_FAULT_OOM;
	}

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pud = pud_mkwrite(pud);
	return pud;
}

static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, pfn_t pfn, bool write)
{
	struct mm_struct *mm = vma->vm_mm;
	pgprot_t prot = vma->vm_page_prot;
	pud_t entry;
	spinlock_t *ptl;

	ptl = pud_lock(mm, pud);
	if (!pud_none(*pud)) {
		if (write) {
			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
				goto out_unlock;
			}
			entry = pud_mkyoung(*pud);
			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
				update_mmu_cache_pud(vma, addr, pud);
		}
		goto out_unlock;
	}

	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pud_mkdevmap(entry);
	if (write) {
		entry = pud_mkyoung(pud_mkdirty(entry));
		entry = maybe_pud_mkwrite(entry, vma);
	}
	set_pud_at(mm, addr, pud, entry);
	update_mmu_cache_pud(vma, addr, pud);

out_unlock:
	spin_unlock(ptl);
}

/**
 * vmf_insert_pfn_pud - insert a pud size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
	unsigned long addr = vmf->address & PUD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgprot_t pgprot = vma->vm_page_prot;

	/*
	 * If we had pud_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
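	 * (same restrictions as in vmf_insert_pfn_pmd() above).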
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
			(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
		      pmd_t *pmd, bool write)
{
	pmd_t _pmd;

	_pmd = pmd_mkyoung(*pmd);
	if (write)
		_pmd = pmd_mkdirty(_pmd);
	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
				  pmd, _pmd, write))
		update_mmu_cache_pmd(vma, addr, pmd);
}

struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
	unsigned long pfn = pmd_pfn(*pmd);
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	int ret;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		return NULL;

	if (pmd_present(*pmd) && pmd_devmap(*pmd))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
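	 * (hence the FOLL_GET | FOLL_PIN requirement checked below).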
	 */
	if (!(flags & (FOLL_GET | FOLL_PIN)))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
	*pgmap = get_dev_pagemap(pfn, *pgmap);
	if (!*pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	ret = try_grab_page(page, flags);
	if (ret)
		page = ERR_PTR(ret);

	return page;
}

int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	struct page *src_page;
	pmd_t pmd;
	pgtable_t pgtable = NULL;
	int ret = -ENOMEM;

	/* Skip if can be re-filled on fault */
	if (!vma_is_anonymous(dst_vma))
		return 0;

	pgtable = pte_alloc_one(dst_mm);
	if (unlikely(!pgtable))
		goto out;

	dst_ptl = pmd_lock(dst_mm, dst_pmd);
	src_ptl = pmd_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
	if (unlikely(is_swap_pmd(pmd))) {
		swp_entry_t entry = pmd_to_swp_entry(pmd);

		VM_BUG_ON(!is_pmd_migration_entry(pmd));
		if (!is_readable_migration_entry(entry)) {
			entry = make_readable_migration_entry(
							swp_offset(entry));
			pmd = swp_entry_to_pmd(entry);
			if (pmd_swp_soft_dirty(*src_pmd))
				pmd = pmd_swp_mksoft_dirty(pmd);
			if (pmd_swp_uffd_wp(*src_pmd))
				pmd = pmd_swp_mkuffd_wp(pmd);
			set_pmd_at(src_mm, addr, src_pmd, pmd);
		}
		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(dst_mm);
		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
		if (!userfaultfd_wp(dst_vma))
			pmd = pmd_swp_clear_uffd_wp(pmd);
		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
		ret = 0;
		goto out_unlock;
	}
#endif

	if (unlikely(!pmd_trans_huge(pmd))) {
	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * When page table lock is held, the huge zero pmd should not be
	 * under splitting since we don't split the page itself, only pmd to
	 * a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		/*
		 * get_huge_zero_page() will never allocate a new page here,
		 * since we already have a zero page to copy. It just takes a
		 * reference.
		 */
		mm_get_huge_zero_page(dst_mm);
		goto out_zero_page;
	}

	src_page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);

	get_page(src_page);
	if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) {
		/* Page may be pinned: split and retry the fault on PTEs. */
		put_page(src_page);
		pte_free(dst_mm, pgtable);
		spin_unlock(src_ptl);
		spin_unlock(dst_ptl);
		__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
		return -EAGAIN;
	}
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
out_zero_page:
	mm_inc_nr_ptes(dst_mm);
	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	if (!userfaultfd_wp(dst_vma))
		pmd = pmd_clear_uffd_wp(pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
out:
	return ret;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
		      pud_t *pud, bool write)
{
	pud_t _pud;

	_pud = pud_mkyoung(*pud);
	if (write)
		_pud = pud_mkdirty(_pud);
	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
				  pud, _pud, write))
		update_mmu_cache_pud(vma, addr, pud);
}

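/*
 * Look up the page mapped at @addr by a devmap pud and take a reference on
 * it, resolving the owning dev_pagemap along the way; the pud-level
 * counterpart of follow_devmap_pmd() above.
 */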
struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
			       pud_t *pud, int flags, struct dev_pagemap **pgmap)
{
	unsigned long pfn = pud_pfn(*pud);
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	int ret;

	assert_spin_locked(pud_lockptr(mm, pud));

	if (flags & FOLL_WRITE && !pud_write(*pud))
		return NULL;

	if (pud_present(*pud) && pud_devmap(*pud))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pud(vma, addr, pud, flags & FOLL_WRITE);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 *
	 * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
	 */
	if (!(flags & (FOLL_GET | FOLL_PIN)))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
	*pgmap = get_dev_pagemap(pfn, *pgmap);
	if (!*pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);

	ret = try_grab_page(page, flags);
	if (ret)
		page = ERR_PTR(ret);

	return page;
}

int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
		  struct vm_area_struct *vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	pud_t pud;
	int ret;

	dst_ptl = pud_lock(dst_mm, dst_pud);
	src_ptl = pud_lockptr(src_mm, src_pud);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pud = *src_pud;
	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
		goto out_unlock;

	/*
	 * When page table lock is held, the huge zero pud should not be
	 * under splitting since we don't split the page itself, only pud to
	 * a page table.
	 */
	if (is_huge_zero_pud(pud)) {
		/* No huge zero pud yet */
	}

	/*
	 * TODO: once we support anonymous pages, use page_try_dup_anon_rmap()
	 * and split if duplicating fails.
	 */
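	/*
	 * COW-share the huge pud: write-protect the source entry, then copy
	 * it into the destination mm read-only and with the accessed bit
	 * cleared.
	 */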
	pudp_set_wrprotect(src_mm, addr, src_pud);
	pud = pud_mkold(pud_wrprotect(pud));
	set_pud_at(dst_mm, addr, dst_pud, pud);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
	return ret;
}

void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
		goto unlock;

	touch_pud(vmf->vma, vmf->address, vmf->pud, write);
unlock:
	spin_unlock(vmf->ptl);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

void huge_pmd_set_accessed(struct vm_fault *vmf)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
		goto unlock;

	touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);

unlock:
	spin_unlock(vmf->ptl);
}

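/*
 * Write (or unshare) fault on a mapped huge pmd: reuse the page in place
 * when it is exclusively ours, otherwise split the pmd and return
 * VM_FAULT_FALLBACK so the fault is retried on the resulting PTEs.
 */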
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	pmd_t orig_pmd = vmf->orig_pmd;

	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);

	if (is_huge_zero_pmd(orig_pmd))
		goto fallback;

	spin_lock(vmf->ptl);

	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
		spin_unlock(vmf->ptl);
		return 0;
	}

	page = pmd_page(orig_pmd);
	folio = page_folio(page);
	VM_BUG_ON_PAGE(!PageHead(page), page);

	/* Early check when only holding the PT lock. */
	if (PageAnonExclusive(page))
		goto reuse;

	if (!folio_trylock(folio)) {
		folio_get(folio);
		spin_unlock(vmf->ptl);
		folio_lock(folio);
		spin_lock(vmf->ptl);
		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
			spin_unlock(vmf->ptl);
			folio_unlock(folio);
			folio_put(folio);
			return 0;
		}
		folio_put(folio);
	}

	/* Recheck after temporarily dropping the PT lock. */
	if (PageAnonExclusive(page)) {
		folio_unlock(folio);
		goto reuse;
	}

	/*
	 * See do_wp_page(): we can only reuse the folio exclusively if
	 * there are no additional references. Note that we always drain
	 * the LRU cache immediately after adding a THP.
	 */
	if (folio_ref_count(folio) >
			1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
		goto unlock_fallback;
	if (folio_test_swapcache(folio))
		folio_free_swap(folio);
	if (folio_ref_count(folio) == 1) {
		pmd_t entry;

		page_move_anon_rmap(page, vma);
		folio_unlock(folio);
reuse:
		if (unlikely(unshare)) {
			spin_unlock(vmf->ptl);
			return 0;
		}
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		spin_unlock(vmf->ptl);
		return 0;
	}

unlock_fallback:
	folio_unlock(folio);
	spin_unlock(vmf->ptl);
fallback:
	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
	return VM_FAULT_FALLBACK;
}

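/*
 * Can we safely mark this pmd writable without going through a write
 * fault? The pmd-level counterpart of can_change_pte_writable().
 */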
static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
					   unsigned long addr, pmd_t pmd)
{
	struct page *page;

	if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
		return false;

	/* Don't touch entries that are not even readable (NUMA hinting). */
	if (pmd_protnone(pmd))
		return false;

	/* Do we need write faults for softdirty tracking? */
	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
		return false;

	/* Do we need write faults for uffd-wp tracking? */
	if (userfaultfd_huge_pmd_wp(vma, pmd))
		return false;

	if (!(vma->vm_flags & VM_SHARED)) {
		/* See can_change_pte_writable(). */
		page = vm_normal_page_pmd(vma, addr, pmd);
		return page && PageAnon(page) && PageAnonExclusive(page);
	}

	/* See can_change_pte_writable(). */
	return pmd_dirty(pmd);
}

/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
					struct vm_area_struct *vma,
					unsigned int flags)
{
	/* If the pmd is writable, we can write to the page. */
	if (pmd_write(pmd))
		return true;

	/* Maybe FOLL_FORCE is set to override it? */
	if (!(flags & FOLL_FORCE))
		return false;

	/* But FOLL_FORCE has no effect on shared mappings */
	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
		return false;

	/* ... or read-only private ones */
	if (!(vma->vm_flags & VM_MAYWRITE))
		return false;

	/* ... or already writable ones that just need to take a write fault */
	if (vma->vm_flags & VM_WRITE)
		return false;

	/*
	 * See can_change_pte_writable(): we broke COW and could map the page
	 * writable if we have an exclusive anonymous page ...
	 */
	if (!page || !PageAnon(page) || !PageAnonExclusive(page))
		return false;

	/* ... and a write-fault isn't required for other reasons. */
	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
		return false;
	return !userfaultfd_huge_pmd_wp(vma, pmd);
}

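/*
 * GUP helper: return the page mapped by a present huge pmd, enforcing the
 * write, unshare and NUMA-hinting policies above before grabbing a
 * reference on it.
 */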
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
				   unsigned long addr,
				   pmd_t *pmd,
				   unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	int ret;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	page = pmd_page(*pmd);
	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);

	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pmd(*pmd, page, vma, flags))
		return NULL;

	/* Avoid dumping huge zero page */
	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
		return ERR_PTR(-EFAULT);

	if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
		return NULL;

	if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page))
		return ERR_PTR(-EMLINK);

	VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
			!PageAnonExclusive(page), page);

	ret = try_grab_page(page, flags);
	if (ret)
		return ERR_PTR(ret);

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);

	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);

	return page;
}

/* NUMA hinting page fault entry point for trans huge pmds */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	pmd_t oldpmd = vmf->orig_pmd;
	pmd_t pmd;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	int page_nid = NUMA_NO_NODE;
	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
	bool migrated = false, writable = false;
	int flags = 0;

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
		spin_unlock(vmf->ptl);
		goto out;
	}

	pmd = pmd_modify(oldpmd, vma->vm_page_prot);

	/*
	 * Detect now whether the PMD could be writable; this information
	 * is only valid while holding the PT lock.
	 */
	writable = pmd_write(pmd);
	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
	    can_change_pmd_writable(vma, vmf->address, pmd))
		writable = true;

	page = vm_normal_page_pmd(vma, haddr, pmd);
	if (!page)
		goto out_map;

	/* See similar comment in do_numa_page for explanation */
	if (!writable)
		flags |= TNF_NO_GROUP;

	page_nid = page_to_nid(page);
	/*
	 * For memory tiering mode, cpupid of slow memory page is used
	 * to record page access time. So use default value.
	 */
	if (node_is_toptier(page_nid))
		last_cpupid = page_cpupid_last(page);
	target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
				       &flags);

	if (target_nid == NUMA_NO_NODE) {
		put_page(page);
		goto out_map;
	}

	spin_unlock(vmf->ptl);
	writable = false;

	migrated = migrate_misplaced_page(page, vma, target_nid);
	if (migrated) {
		flags |= TNF_MIGRATED;
		page_nid = target_nid;
	} else {
		flags |= TNF_MIGRATE_FAIL;
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
			spin_unlock(vmf->ptl);
			goto out;
		}
		goto out_map;
	}

out:
	if (page_nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
				flags);

	return 0;

out_map:
	/* Restore the PMD */
	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
	pmd = pmd_mkyoung(pmd);
	if (writable)
		pmd = pmd_mkwrite(pmd, vma);
	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
	spin_unlock(vmf->ptl);
	goto out;
}

/*
 * Return true if we do MADV_FREE successfully on entire pmd page.
 * Otherwise, return false.
 */
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
			   pmd_t *pmd, unsigned long addr, unsigned long next)
{
	spinlock_t *ptl;
	pmd_t orig_pmd;
	struct folio *folio;
	struct mm_struct *mm = tlb->mm;
	bool ret = false;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		goto out_unlocked;

	orig_pmd = *pmd;
	if (is_huge_zero_pmd(orig_pmd))
		goto out;

	if (unlikely(!pmd_present(orig_pmd))) {
		VM_BUG_ON(thp_migration_supported() &&
				  !is_pmd_migration_entry(orig_pmd));
		goto out;
	}

	folio = pfn_folio(pmd_pfn(orig_pmd));
	/*
	 * If other processes are mapping this folio, we couldn't discard
	 * the folio unless they all do MADV_FREE so let's skip the folio.
	 */
	if (folio_estimated_sharers(folio) != 1)
		goto out;

	if (!folio_trylock(folio))
		goto out;

	/*
	 * If the user wants to discard only part of the THP, split it so
	 * MADV_FREE will deactivate only those pages.
	 */
	if (next - addr != HPAGE_PMD_SIZE) {
		folio_get(folio);
		spin_unlock(ptl);
		split_folio(folio);
		folio_unlock(folio);
		folio_put(folio);
		goto out_unlocked;
	}

	if (folio_test_dirty(folio))
		folio_clear_dirty(folio);
	folio_unlock(folio);

	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
		pmdp_invalidate(vma, addr, pmd);
		orig_pmd = pmd_mkold(orig_pmd);
		orig_pmd = pmd_mkclean(orig_pmd);

		set_pmd_at(mm, addr, pmd, orig_pmd);
		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	}

	folio_mark_lazyfree(folio);
	ret = true;
out:
	spin_unlock(ptl);
out_unlocked:
	return ret;
}

static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
{
	pgtable_t pgtable;

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pte_free(mm, pgtable);
	mm_dec_nr_ptes(mm);
}

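/*
 * Unmap a huge pmd: clear the entry, drop any deposited page table, fix up
 * rmap and counters, and hand any mapped page to the mmu_gather for TLB
 * flush and release.
 */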
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pmd_t *pmd, unsigned long addr)
{
	pmd_t orig_pmd;
	spinlock_t *ptl;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;
	/*
	 * For architectures like ppc64 we look at deposited pgtable
	 * when calling pmdp_huge_get_and_clear. So do the
	 * pgtable_trans_huge_withdraw after finishing pmdp related
	 * operations.
	 */
	orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
						tlb->fullmm);
	arch_check_zapped_pmd(vma, orig_pmd);
	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	if (vma_is_special_huge(vma)) {
		if (arch_needs_pgtable_deposit())
			zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
	} else if (is_huge_zero_pmd(orig_pmd)) {
		zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
	} else {
		struct page *page = NULL;
		int flush_needed = 1;

		if (pmd_present(orig_pmd)) {
			page = pmd_page(orig_pmd);
			page_remove_rmap(page, vma, true);
			VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
			VM_BUG_ON_PAGE(!PageHead(page), page);
		} else if (thp_migration_supported()) {
			swp_entry_t entry;

			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
			entry = pmd_to_swp_entry(orig_pmd);
			page = pfn_swap_entry_to_page(entry);
			flush_needed = 0;
		} else
			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");

		if (PageAnon(page)) {
			zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
		} else {
			if (arch_needs_pgtable_deposit())
				zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
		}

		spin_unlock(ptl);
		if (flush_needed)
			tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
	}
	return 1;
}

#ifndef pmd_move_must_withdraw
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
					 spinlock_t *old_pmd_ptl,
					 struct vm_area_struct *vma)
{
	/*
	 * With split pmd lock we also need to move preallocated
	 * PTE page table if new_pmd is on different PMD page table.
	 *
	 * We also don't deposit and withdraw tables for file pages.
	 */
	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
}
#endif

static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
	if (unlikely(is_pmd_migration_entry(pmd)))
		pmd = pmd_swp_mksoft_dirty(pmd);
	else if (pmd_present(pmd))
		pmd = pmd_mksoft_dirty(pmd);
#endif
	return pmd;
}

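/*
 * Relocate a huge pmd for mremap(): transfer the entry from @old_pmd to
 * @new_pmd, moving the deposited page table along with it when the
 * architecture or split ptlocks require that.
 */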
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	pmd_t pmd;
	struct mm_struct *mm = vma->vm_mm;
	bool force_flush = false;

	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it; but move_page_tables() might have already
	 * inserted a page table, if racing against shmem/file collapse.
	 */
	if (!pmd_none(*new_pmd)) {
		VM_BUG_ON(pmd_trans_huge(*new_pmd));
		return false;
	}

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
	if (old_ptl) {
		new_ptl = pmd_lockptr(mm, new_pmd);
		if (new_ptl != old_ptl)
			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
		if (pmd_present(pmd))
			force_flush = true;
		VM_BUG_ON(!pmd_none(*new_pmd));

		if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
			pgtable_t pgtable;
			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
		}
		pmd = move_soft_dirty_pmd(pmd);
		set_pmd_at(mm, new_addr, new_pmd, pmd);
		if (force_flush)
			flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
		if (new_ptl != old_ptl)
			spin_unlock(new_ptl);
		spin_unlock(old_ptl);
		return true;
	}
	return false;
}

/*
 * Returns
 *  - 0 if PMD could not be locked
 *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
 *      or if prot_numa but THP migration is not supported
 *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
 */
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
		    unsigned long cp_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *ptl;
	pmd_t oldpmd, entry;
	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
	int ret = 1;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	if (prot_numa && !thp_migration_supported())
		return 1;

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;

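	/*
	 * A pmd migration entry carries no hardware protection bits to
	 * change; rewrite the software entry instead, demoting writable
	 * entries to read-only and carrying any soft-dirty and uffd-wp
	 * state across.
	 */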
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
	if (is_swap_pmd(*pmd)) {
		swp_entry_t entry = pmd_to_swp_entry(*pmd);
		struct page *page = pfn_swap_entry_to_page(entry);
		pmd_t newpmd;

		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
		if (is_writable_migration_entry(entry)) {
			/*
			 * A protection check is difficult so
			 * just be safe and disable write
			 */
			if (PageAnon(page))
				entry = make_readable_exclusive_migration_entry(swp_offset(entry));
			else
				entry = make_readable_migration_entry(swp_offset(entry));
			newpmd = swp_entry_to_pmd(entry);
			if (pmd_swp_soft_dirty(*pmd))
				newpmd = pmd_swp_mksoft_dirty(newpmd);
		} else {
			newpmd = *pmd;
		}

		if (uffd_wp)
			newpmd = pmd_swp_mkuffd_wp(newpmd);
		else if (uffd_wp_resolve)
			newpmd = pmd_swp_clear_uffd_wp(newpmd);
		if (!pmd_same(*pmd, newpmd))
			set_pmd_at(mm, addr, pmd, newpmd);
		goto unlock;
	}
#endif

	if (prot_numa) {
		struct page *page;
		bool toptier;
		/*
		 * Avoid trapping faults against the zero page. The read-only
		 * data is likely to be read-cached on the local CPU and
		 * local/remote hits to the zero page are not interesting.
		 */
		if (is_huge_zero_pmd(*pmd))
			goto unlock;

		if (pmd_protnone(*pmd))
			goto unlock;

		page = pmd_page(*pmd);
		toptier = node_is_toptier(page_to_nid(page));
		/*
		 * Skip scanning top tier node if normal numa
		 * balancing is disabled
		 */
		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
		    toptier)
			goto unlock;

		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
		    !toptier)
			xchg_page_access_time(page, jiffies_to_msecs(jiffies));
	}
	/*
	 * In case prot_numa, we are under mmap_read_lock(mm). It's critical
	 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
	 * which is also under mmap_read_lock(mm):
	 *
	 *	CPU0:				CPU1:
	 *				change_huge_pmd(prot_numa=1)
	 *				 pmdp_huge_get_and_clear_notify()
	 * madvise_dontneed()
	 *  zap_pmd_range()
	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
	 *   // skip the pmd
	 *				 set_pmd_at();
	 *				 // pmd is re-established
	 *
	 * The race makes MADV_DONTNEED miss the huge pmd and fail to clear
	 * it, which may break userspace.
	 *
	 * pmdp_invalidate_ad() is required to make sure we don't miss
	 * dirty/young flags set by hardware.
	 */
	oldpmd = pmdp_invalidate_ad(vma, addr, pmd);

	entry = pmd_modify(oldpmd, newprot);
	if (uffd_wp)
		entry = pmd_mkuffd_wp(entry);
	else if (uffd_wp_resolve)
		/*
		 * Leave the write bit to be handled by PF interrupt
		 * handler, then things like COW could be properly
		 * handled.
		 */
		entry = pmd_clear_uffd_wp(entry);

	/* See change_pte_range(). */
	if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
	    can_change_pmd_writable(vma, addr, entry))
		entry = pmd_mkwrite(entry, vma);

	ret = HPAGE_PMD_NR;
	set_pmd_at(mm, addr, pmd, entry);

	if (huge_pmd_needs_flush(oldpmd, entry))
		tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
unlock:
	spin_unlock(ptl);
	return ret;
}

/*
 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
 *
 * Note that if it returns page table lock pointer, this routine returns without
 * unlocking page table lock. So callers must unlock it.
 */
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
	spinlock_t *ptl;

	ptl = pmd_lock(vma->vm_mm, pmd);
	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
			pmd_devmap(*pmd)))
		return ptl;
	spin_unlock(ptl);
	return NULL;
}

/*
 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
 *
 * Note that if it returns page table lock pointer, this routine returns without
 * unlocking page table lock. So callers must unlock it.
 */
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
	spinlock_t *ptl;

	ptl = pud_lock(vma->vm_mm, pud);
	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
		return ptl;
	spin_unlock(ptl);
	return NULL;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pud_t *pud, unsigned long addr)
{
	spinlock_t *ptl;

	ptl = __pud_trans_huge_lock(pud, vma);
	if (!ptl)
		return 0;

	pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
	tlb_remove_pud_tlb_entry(tlb, pud, addr);
	if (vma_is_special_huge(vma)) {
		spin_unlock(ptl);
		/* No zero page support yet */
	} else {
		/* No support for anonymous PUD pages yet */
		BUG();
	}
	return 1;
}

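/*
 * There is no pud-level zero page and no anonymous pud support yet, so
 * "splitting" a huge pud amounts to clearing and flushing the devmap
 * entry; subsequent faults can re-establish smaller mappings.
 */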
static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
		unsigned long haddr)
{
	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));

	count_vm_event(THP_SPLIT_PUD);

	pudp_huge_clear_flush(vma, haddr, pud);
}

void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
		unsigned long address)
{
	spinlock_t *ptl;
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				address & HPAGE_PUD_MASK,
				(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	ptl = pud_lock(vma->vm_mm, pud);
	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
		goto out;
	__split_huge_pud_locked(vma, pud, range.start);

out:
	spin_unlock(ptl);
	mmu_notifier_invalidate_range_end(&range);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

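/*
 * Replace a huge zero pmd with a page table full of zero-page ptes,
 * preserving uffd-wp state. Called with the pmd lock held.
 */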
static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
		unsigned long haddr, pmd_t *pmd)
{
	struct mm_struct *mm = vma->vm_mm;
	pgtable_t pgtable;
	pmd_t _pmd, old_pmd;
	unsigned long addr;
	pte_t *pte;
	int i;

	/*
	 * Leave pmd empty until pte is filled. Note that it is fine to delay
	 * notification until mmu_notifier_invalidate_range_end() as we are
	 * replacing a zero pmd write protected page with a zero pte write
	 * protected page.
	 *
	 * See Documentation/mm/mmu_notifier.rst
	 */
	old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pmd_populate(mm, &_pmd, pgtable);

	pte = pte_offset_map(&_pmd, haddr);
	VM_BUG_ON(!pte);
	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
		pte_t entry;

		entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
		entry = pte_mkspecial(entry);
		if (pmd_uffd_wp(old_pmd))
			entry = pte_mkuffd_wp(entry);
		VM_BUG_ON(!pte_none(ptep_get(pte)));
		set_pte_at(mm, addr, pte, entry);
		pte++;
	}
	pte_unmap(pte - 1);
	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
}

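/*
 * Demote a huge pmd in place to a table of ordinary ptes (or, with
 * "freeze", to migration entries), carrying the dirty, young, soft-dirty
 * and uffd-wp state over pte by pte. Called with the pmd lock held.
 */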
static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long haddr, bool freeze)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	pgtable_t pgtable;
	pmd_t old_pmd, _pmd;
	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
	bool anon_exclusive = false, dirty = false;
	unsigned long addr;
	pte_t *pte;
	int i;

	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
				&& !pmd_devmap(*pmd));

	count_vm_event(THP_SPLIT_PMD);

	if (!vma_is_anonymous(vma)) {
		old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
		/*
		 * We are going to unmap this huge page. So
		 * just go ahead and zap it
		 */
		if (arch_needs_pgtable_deposit())
			zap_deposited_table(mm, pmd);
		if (vma_is_special_huge(vma))
			return;
		if (unlikely(is_pmd_migration_entry(old_pmd))) {
			swp_entry_t entry;

			entry = pmd_to_swp_entry(old_pmd);
			page = pfn_swap_entry_to_page(entry);
		} else {
			page = pmd_page(old_pmd);
			if (!PageDirty(page) && pmd_dirty(old_pmd))
				set_page_dirty(page);
			if (!PageReferenced(page) && pmd_young(old_pmd))
				SetPageReferenced(page);
			page_remove_rmap(page, vma, true);
			put_page(page);
		}
		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
		return;
	}

	if (is_huge_zero_pmd(*pmd)) {
		/*
		 * FIXME: Do we want to invalidate secondary mmu by calling
		 * mmu_notifier_arch_invalidate_secondary_tlbs()? See the
		 * comments below inside __split_huge_pmd().
		 *
		 * We are going from a zero huge page write protected to zero
		 * small pages also write protected, so it does not seem useful
		 * to invalidate secondary mmu at this time.
		 */
		return __split_huge_zero_page_pmd(vma, haddr, pmd);
	}

	/*
	 * Up to this point the pmd is present and huge and userland has the
	 * whole access to the hugepage during the split (which happens in
	 * place). If we overwrite the pmd with the not-huge version pointing
	 * to the pte here (which of course we could if all CPUs were bug
	 * free), userland could trigger a small page size TLB miss on the
	 * small sized TLB while the hugepage TLB entry is still established in
	 * the huge TLB. Some CPUs don't like that.
	 * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
	 * 383 on page 105. Intel should be safe, but it also warns that it's
	 * only safe if the permission and cache attributes of the two entries
	 * loaded in the two TLBs are identical (which should be the case
	 * here). But it is generally safer to never allow small and huge TLB
	 * entries for the same virtual address to be loaded simultaneously.
	 * So instead of doing "pmd_populate(); flush_pmd_tlb_range();" we
	 * first mark the current pmd notpresent (atomically because here the
	 * pmd_trans_huge must remain set at all times on the pmd until the
	 * split is complete for this pmd), then we flush the SMP TLB and
	 * finally we write the non-huge version of the pmd entry with
	 * pmd_populate.
	 */
	old_pmd = pmdp_invalidate(vma, haddr, pmd);

	pmd_migration = is_pmd_migration_entry(old_pmd);
	if (unlikely(pmd_migration)) {
		swp_entry_t entry;

		entry = pmd_to_swp_entry(old_pmd);
		page = pfn_swap_entry_to_page(entry);
		write = is_writable_migration_entry(entry);
		if (PageAnon(page))
			anon_exclusive = is_readable_exclusive_migration_entry(entry);
		young = is_migration_entry_young(entry);
		dirty = is_migration_entry_dirty(entry);
		soft_dirty = pmd_swp_soft_dirty(old_pmd);
		uffd_wp = pmd_swp_uffd_wp(old_pmd);
	} else {
		page = pmd_page(old_pmd);
		if (pmd_dirty(old_pmd)) {
			dirty = true;
			SetPageDirty(page);
		}
		write = pmd_write(old_pmd);
		young = pmd_young(old_pmd);
		soft_dirty = pmd_soft_dirty(old_pmd);
		uffd_wp = pmd_uffd_wp(old_pmd);

		VM_BUG_ON_PAGE(!page_count(page), page);

		/*
		 * Without "freeze", we'll simply split the PMD, propagating the
		 * PageAnonExclusive() flag for each PTE by setting it for
		 * each subpage -- no need to (temporarily) clear.
		 *
		 * With "freeze" we want to replace mapped pages by
		 * migration entries right away. This is only possible if we
		 * managed to clear PageAnonExclusive() -- see
		 * set_pmd_migration_entry().
		 *
		 * In case we cannot clear PageAnonExclusive(), split the PMD
		 * only and let try_to_migrate_one() fail later.
		 *
		 * See page_try_share_anon_rmap(): invalidate PMD first.
		 */
		anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
		if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
			freeze = false;
		if (!freeze)
			page_ref_add(page, HPAGE_PMD_NR - 1);
	}

	/*
	 * Withdraw the table only after we mark the pmd entry invalid.
	 * This is critical for some architectures (Power).
	 */
	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pmd_populate(mm, &_pmd, pgtable);

	pte = pte_offset_map(&_pmd, haddr);
	VM_BUG_ON(!pte);
	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
		pte_t entry;
		/*
		 * Note that NUMA hinting access restrictions are not
		 * transferred to avoid any possibility of altering
		 * permissions across VMAs.
		 */
		if (freeze || pmd_migration) {
			swp_entry_t swp_entry;
			if (write)
				swp_entry = make_writable_migration_entry(
							page_to_pfn(page + i));
			else if (anon_exclusive)
				swp_entry = make_readable_exclusive_migration_entry(
							page_to_pfn(page + i));
			else
				swp_entry = make_readable_migration_entry(
							page_to_pfn(page + i));
			if (young)
				swp_entry = make_migration_entry_young(swp_entry);
			if (dirty)
				swp_entry = make_migration_entry_dirty(swp_entry);
			entry = swp_entry_to_pte(swp_entry);
			if (soft_dirty)
				entry = pte_swp_mksoft_dirty(entry);
			if (uffd_wp)
				entry = pte_swp_mkuffd_wp(entry);
		} else {
			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
			if (write)
				entry = pte_mkwrite(entry, vma);
			if (anon_exclusive)
				SetPageAnonExclusive(page + i);
			if (!young)
				entry = pte_mkold(entry);
			/* NOTE: this may set soft-dirty too on some archs */
			if (dirty)
				entry = pte_mkdirty(entry);
			if (soft_dirty)
				entry = pte_mksoft_dirty(entry);
			if (uffd_wp)
				entry = pte_mkuffd_wp(entry);
			page_add_anon_rmap(page + i, vma, addr, RMAP_NONE);
		}
		VM_BUG_ON(!pte_none(ptep_get(pte)));
		set_pte_at(mm, addr, pte, entry);
		pte++;
	}
	pte_unmap(pte - 1);

	if (!pmd_migration)
		page_remove_rmap(page, vma, true);
	if (freeze)
		put_page(page);

	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
}

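/*
 * Split a huge PMD mapping at @address into PTE mappings. This takes the
 * PMD lock inside an MMU-notifier invalidation range and, if the PMD is
 * still huge (present, devmap, or a PMD migration entry), splits it in
 * place via __split_huge_pmd_locked(). With @freeze, the caller must
 * supply the locked @folio so a concurrently faulted-in folio is not
 * frozen by mistake.
 */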
void
__split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long address, bool freeze, struct folio *folio)
{
	spinlock_t *ptl;
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				address & HPAGE_PMD_MASK,
				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	ptl = pmd_lock(vma->vm_mm, pmd);

	/*
	 * If the caller asks to set up a migration entry, we need a folio
	 * to check the pmd against. Otherwise we can end up replacing the
	 * wrong folio.
	 */
	VM_BUG_ON(freeze && !folio);
	VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));

	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
	    is_pmd_migration_entry(*pmd)) {
		/*
		 * It's safe to call pmd_page() when folio is set because
		 * it's guaranteed that the pmd is present.
		 */
		if (folio && folio != page_folio(pmd_page(*pmd)))
			goto out;
		__split_huge_pmd_locked(vma, pmd, range.start, freeze);
	}

out:
	spin_unlock(ptl);
	mmu_notifier_invalidate_range_end(&range);
}

void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
		bool freeze, struct folio *folio)
{
	pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);

	if (!pmd)
		return;

	__split_huge_pmd(vma, pmd, address, freeze, folio);
}

static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
{
	/*
	 * If the new address isn't hpage aligned and it could previously
	 * contain a hugepage: check if we need to split a huge pmd.
	 */
	if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
	    range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
			 ALIGN(address, HPAGE_PMD_SIZE)))
		split_huge_pmd_address(vma, address, false, NULL);
}

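/*
 * Called when VMA boundaries are adjusted: any huge PMD that would
 * straddle the new @start, @end, or the shifted start of the next VMA
 * is split back to PTEs first.
 */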
void vma_adjust_trans_huge(struct vm_area_struct *vma,
			     unsigned long start,
			     unsigned long end,
			     long adjust_next)
{
	/* Check if we need to split start first. */
	split_huge_pmd_if_needed(vma, start);

	/* Check if we need to split end next. */
	split_huge_pmd_if_needed(vma, end);

	/*
	 * If we're also updating the next vma vm_start,
	 * check if we need to split it.
	 */
	if (adjust_next > 0) {
		struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
		unsigned long nstart = next->vm_start;
		nstart += adjust_next;
		split_huge_pmd_if_needed(next, nstart);
	}
}

static void unmap_folio(struct folio *folio)
{
	enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
		TTU_SYNC;

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

	/*
	 * Anon pages need migration entries to preserve them, but file
	 * pages can simply be left unmapped, then faulted back on demand.
	 * If that is ever changed (perhaps for mlock), update remap_page().
	 */
	if (folio_test_anon(folio))
		try_to_migrate(folio, ttu_flags);
	else
		try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
}

static void remap_page(struct folio *folio, unsigned long nr)
{
	int i = 0;

	/* If unmap_folio() uses try_to_migrate() on file, remove this check */
	if (!folio_test_anon(folio))
		return;
	for (;;) {
		remove_migration_ptes(folio, folio, true);
		i += folio_nr_pages(folio);
		if (i >= nr)
			break;
		folio = folio_next(folio);
	}
}

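/*
 * Queue a freshly split tail page on the LRU; lruvec->lru_lock must be
 * held. With a private @list the tail is pinned and queued there (the
 * page reclaim case); otherwise it is placed next to the still-frozen
 * head on the LRU, except that an unevictable tail only gets its
 * mlock_count reset.
 */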
static void lru_add_page_tail(struct page *head, struct page *tail,
		struct lruvec *lruvec, struct list_head *list)
{
	VM_BUG_ON_PAGE(!PageHead(head), head);
	VM_BUG_ON_PAGE(PageCompound(tail), head);
	VM_BUG_ON_PAGE(PageLRU(tail), head);
	lockdep_assert_held(&lruvec->lru_lock);

	if (list) {
		/* page reclaim is reclaiming a huge page */
		VM_WARN_ON(PageLRU(head));
		get_page(tail);
		list_add_tail(&tail->lru, list);
	} else {
		/* head is still on lru (and we have it frozen) */
		VM_WARN_ON(!PageLRU(head));
		if (PageUnevictable(tail))
			tail->mlock_count = 0;
		else
			list_add_tail(&tail->lru, &head->lru);
		SetPageLRU(tail);
	}
}

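/*
 * Set up one tail page as a standalone folio while the head's refcount
 * is frozen: clone the relevant page flags, inherit ->mapping and
 * ->index (or the swap entry), clear PageTail, and only then unfreeze
 * the tail's own refcount.
 */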
static void __split_huge_page_tail(struct folio *folio, int tail,
		struct lruvec *lruvec, struct list_head *list)
{
	struct page *head = &folio->page;
	struct page *page_tail = head + tail;
	/*
	 * Careful: new_folio is not a "real" folio until we have cleared
	 * PageTail. Don't pass it around before clear_compound_head().
	 */
	struct folio *new_folio = (struct folio *)page_tail;

	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);

	/*
	 * Clone page flags before unfreezing refcount.
	 *
	 * After a successful get_page_unless_zero(), flag changes might
	 * follow, for example lock_page() setting PG_waiters.
	 *
	 * Note that for mapped sub-pages of an anonymous THP,
	 * PG_anon_exclusive has been cleared in unmap_folio() and is stored
	 * in the migration entry instead, from where remap_page() will
	 * restore it. We can still have PG_anon_exclusive set on effectively
	 * unmapped and unreferenced sub-pages of an anonymous THP: we can
	 * simply drop PG_anon_exclusive (-> PG_mappedtodisk) for these here.
	 */
	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
	page_tail->flags |= (head->flags &
			((1L << PG_referenced) |
			 (1L << PG_swapbacked) |
			 (1L << PG_swapcache) |
			 (1L << PG_mlocked) |
			 (1L << PG_uptodate) |
			 (1L << PG_active) |
			 (1L << PG_workingset) |
			 (1L << PG_locked) |
			 (1L << PG_unevictable) |
#ifdef CONFIG_ARCH_USES_PG_ARCH_X
			 (1L << PG_arch_2) |
			 (1L << PG_arch_3) |
#endif
			 (1L << PG_dirty) |
			 LRU_GEN_MASK | LRU_REFS_MASK));

	/* ->mapping in first and second tail page is replaced by other uses */
	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
			page_tail);
	page_tail->mapping = head->mapping;
	page_tail->index = head->index + tail;

	/*
	 * page->private should not be set in tail pages. Fix up and warn once
	 * if private is unexpectedly set.
	 */
	if (unlikely(page_tail->private)) {
		VM_WARN_ON_ONCE_PAGE(true, page_tail);
		page_tail->private = 0;
	}
	if (folio_test_swapcache(folio))
		new_folio->swap.val = folio->swap.val + tail;

	/* Page flags must be visible before we make the page non-compound. */
	smp_wmb();

	/*
	 * Clear PageTail before unfreezing page refcount.
	 *
	 * After a successful get_page_unless_zero(), put_page() might
	 * follow, which needs a correct compound_head().
	 */
	clear_compound_head(page_tail);

	/* Finally unfreeze refcount. Additional reference from page cache. */
	page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
					  PageSwapCache(head)));

	if (page_is_young(head))
		set_page_young(page_tail);
	if (page_is_idle(head))
		set_page_idle(page_tail);

	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));

	/*
	 * Always add to the tail, because some iterators expect new pages
	 * to show up after the currently processed elements, e.g.
	 * migrate_pages().
	 */
	lru_add_page_tail(head, page_tail, lruvec, list);
}

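/*
 * Core of the split: with the head's refcount frozen and interrupts off,
 * carve each tail out via __split_huge_page_tail(), fix up the page
 * cache or swap cache slots (dropping tails beyond @end), clear
 * PageCompound on the head, and finally remap or free the pieces.
 */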
static void __split_huge_page(struct page *page, struct list_head *list,
		pgoff_t end)
{
	struct folio *folio = page_folio(page);
	struct page *head = &folio->page;
	struct lruvec *lruvec;
	struct address_space *swap_cache = NULL;
	unsigned long offset = 0;
	unsigned int nr = thp_nr_pages(head);
	int i, nr_dropped = 0;

	/* Complete the memcg work before adding pages to the LRU. */
	split_page_memcg(head, nr);

	if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
		offset = swp_offset(folio->swap);
		swap_cache = swap_address_space(folio->swap);
		xa_lock(&swap_cache->i_pages);
	}

	/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
	lruvec = folio_lruvec_lock(folio);

	ClearPageHasHWPoisoned(head);

	for (i = nr - 1; i >= 1; i--) {
		__split_huge_page_tail(folio, i, lruvec, list);
		/* Some pages can be beyond EOF: drop them from page cache */
		if (head[i].index >= end) {
			struct folio *tail = page_folio(head + i);

			if (shmem_mapping(head->mapping))
				nr_dropped++;
			else if (folio_test_clear_dirty(tail))
				folio_account_cleaned(tail,
					inode_to_wb(folio->mapping->host));
			__filemap_remove_folio(tail, NULL);
			folio_put(tail);
		} else if (!PageAnon(page)) {
			__xa_store(&head->mapping->i_pages, head[i].index,
					head + i, 0);
		} else if (swap_cache) {
			__xa_store(&swap_cache->i_pages, offset + i,
					head + i, 0);
		}
	}

	ClearPageCompound(head);
	unlock_page_lruvec(lruvec);
	/* Caller disabled irqs, so they are still disabled here */

	split_page_owner(head, nr);

	/* See comment in __split_huge_page_tail() */
	if (PageAnon(head)) {
		/* Additional pin to swap cache */
		if (PageSwapCache(head)) {
			page_ref_add(head, 2);
			xa_unlock(&swap_cache->i_pages);
		} else {
			page_ref_inc(head);
		}
	} else {
		/* Additional pin to page cache */
		page_ref_add(head, 2);
		xa_unlock(&head->mapping->i_pages);
	}
	local_irq_enable();

	if (nr_dropped)
		shmem_uncharge(head->mapping->host, nr_dropped);
	remap_page(folio, nr);

	if (folio_test_swapcache(folio))
		split_swap_cluster(folio->swap);

	for (i = 0; i < nr; i++) {
		struct page *subpage = head + i;
		if (subpage == page)
			continue;
		unlock_page(subpage);

		/*
		 * Subpages may be freed if there wasn't any mapping,
		 * e.g. if add_to_swap() is running on an LRU page that
		 * had its mapping zapped. And freeing these pages
		 * requires taking the lru_lock, so we do the put_page
		 * of the tail pages after the split is complete.
		 */
		free_page_and_swap_cache(subpage);
	}
}

/* Racy check whether the huge page can be split */
bool can_split_folio(struct folio *folio, int *pextra_pins)
{
	int extra_pins;

	/* Additional pins from page cache */
	if (folio_test_anon(folio))
		extra_pins = folio_test_swapcache(folio) ?
				folio_nr_pages(folio) : 0;
	else
		extra_pins = folio_nr_pages(folio);
	if (pextra_pins)
		*pextra_pins = extra_pins;
	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
}

/*
 * This function splits a huge page into normal pages. @page can point to
 * any subpage of the huge page to split. The split doesn't change the
 * position of @page.
 *
 * The caller must hold a pin on @page, otherwise the split fails with
 * -EBUSY. The huge page must be locked.
 *
 * If @list is null, tail pages will be added to the LRU list, otherwise
 * to @list.
 *
 * Both head page and tail pages will inherit mapping, flags, and so on
 * from the hugepage.
 *
 * The GUP pin and PG_locked are transferred to @page. The remaining
 * subpages can be freed if they are not mapped.
 *
 * Returns 0 if the hugepage was split successfully.
 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from
 * under us.
 */
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
	struct folio *folio = page_folio(page);
	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
	XA_STATE(xas, &folio->mapping->i_pages, folio->index);
	struct anon_vma *anon_vma = NULL;
	struct address_space *mapping = NULL;
	int extra_pins, ret;
	pgoff_t end;
	bool is_hzp;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

	is_hzp = is_huge_zero_page(&folio->page);
	if (is_hzp) {
		pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
		return -EBUSY;
	}

	if (folio_test_writeback(folio))
		return -EBUSY;

	if (folio_test_anon(folio)) {
		/*
		 * The caller does not necessarily hold an mmap_lock that
		 * would prevent the anon_vma from disappearing, so we first
		 * take a reference to it and then lock the anon_vma for
		 * write. This is similar to folio_lock_anon_vma_read(),
		 * except the write lock is taken to serialise against
		 * parallel split or collapse operations.
		 */
		anon_vma = folio_get_anon_vma(folio);
		if (!anon_vma) {
			ret = -EBUSY;
			goto out;
		}
		end = -1;
		mapping = NULL;
		anon_vma_lock_write(anon_vma);
	} else {
		gfp_t gfp;

		mapping = folio->mapping;

		/* Truncated? */
		if (!mapping) {
			ret = -EBUSY;
			goto out;
		}

		gfp = current_gfp_context(mapping_gfp_mask(mapping) &
							GFP_RECLAIM_MASK);

		if (!filemap_release_folio(folio, gfp)) {
			ret = -EBUSY;
			goto out;
		}

		xas_split_alloc(&xas, folio, folio_order(folio), gfp);
		if (xas_error(&xas)) {
			ret = xas_error(&xas);
			goto out;
		}

		anon_vma = NULL;
		i_mmap_lock_read(mapping);

		/*
		 * __split_huge_page() may need to trim off pages beyond EOF:
		 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
		 * which cannot be nested inside the page tree lock. So note
		 * end now: i_size itself may be changed at any moment, but
		 * the folio lock is good enough to serialize the trimming.
		 */
		end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
		if (shmem_mapping(mapping))
			end = shmem_fallocend(mapping->host, end);
	}

	/*
	 * Racy check if we can split the page, before unmap_folio() will
	 * split the PMDs.
	 */
	if (!can_split_folio(folio, &extra_pins)) {
		ret = -EAGAIN;
		goto out_unlock;
	}

	unmap_folio(folio);

	/* block interrupt reentry in xa_lock and spinlock */
	local_irq_disable();
	if (mapping) {
		/*
		 * Check if the folio is present in the page cache.
		 * We assume all tails are present too, if the folio is there.
		 */
		xas_lock(&xas);
		xas_reset(&xas);
		if (xas_load(&xas) != folio)
			goto fail;
	}

	/* Prevent deferred_split_scan() touching ->_refcount */
	spin_lock(&ds_queue->split_queue_lock);
	if (folio_ref_freeze(folio, 1 + extra_pins)) {
		if (!list_empty(&folio->_deferred_list)) {
			ds_queue->split_queue_len--;
			list_del(&folio->_deferred_list);
		}
		spin_unlock(&ds_queue->split_queue_lock);
		if (mapping) {
			int nr = folio_nr_pages(folio);

			xas_split(&xas, folio, folio_order(folio));
			if (folio_test_pmd_mappable(folio)) {
				if (folio_test_swapbacked(folio)) {
					__lruvec_stat_mod_folio(folio,
							NR_SHMEM_THPS, -nr);
				} else {
					__lruvec_stat_mod_folio(folio,
							NR_FILE_THPS, -nr);
					filemap_nr_thps_dec(mapping);
				}
			}
		}

		__split_huge_page(page, list, end);
		ret = 0;
	} else {
		spin_unlock(&ds_queue->split_queue_lock);
fail:
		if (mapping)
			xas_unlock(&xas);
		local_irq_enable();
		remap_page(folio, folio_nr_pages(folio));
		ret = -EAGAIN;
	}

out_unlock:
	if (anon_vma) {
		anon_vma_unlock_write(anon_vma);
		put_anon_vma(anon_vma);
	}
	if (mapping)
		i_mmap_unlock_read(mapping);
out:
	xas_destroy(&xas);
	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
	return ret;
}

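/*
 * Undo the deferred-split bookkeeping for a large rmappable folio that
 * is about to be freed: make sure it is no longer on a deferred split
 * queue before its memory is reused.
 */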
void folio_undo_large_rmappable(struct folio *folio)
{
	struct deferred_split *ds_queue;
	unsigned long flags;

	/*
	 * At this point, there is no one trying to add the folio to
	 * deferred_list. If the folio is not in deferred_list, it's safe
	 * to check without acquiring the split_queue_lock.
	 */
	if (data_race(list_empty(&folio->_deferred_list)))
		return;

	ds_queue = get_deferred_split_queue(folio);
	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	if (!list_empty(&folio->_deferred_list)) {
		ds_queue->split_queue_len--;
		list_del(&folio->_deferred_list);
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}

void deferred_split_folio(struct folio *folio)
{
	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
#ifdef CONFIG_MEMCG
	struct mem_cgroup *memcg = folio_memcg(folio);
#endif
	unsigned long flags;

	VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);

	/*
	 * The try_to_unmap() in the page reclaim path might reach here too;
	 * this may cause a race condition corrupting the deferred split
	 * queue. And, if page reclaim is already handling the same folio,
	 * it is unnecessary to handle it again in the shrinker.
	 *
	 * Check the swapcache flag to determine if the folio is being
	 * handled by page reclaim, since THP swap would add the folio into
	 * swap cache before calling try_to_unmap().
	 */
	if (folio_test_swapcache(folio))
		return;

	if (!list_empty(&folio->_deferred_list))
		return;

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	if (list_empty(&folio->_deferred_list)) {
		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
		ds_queue->split_queue_len++;
#ifdef CONFIG_MEMCG
		if (memcg)
			set_shrinker_bit(memcg, folio_nid(folio),
					 deferred_split_shrinker.id);
#endif
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}

static unsigned long deferred_split_count(struct shrinker *shrink,
		struct shrink_control *sc)
{
	struct pglist_data *pgdata = NODE_DATA(sc->nid);
	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;

#ifdef CONFIG_MEMCG
	if (sc->memcg)
		ds_queue = &sc->memcg->deferred_split_queue;
#endif
	return READ_ONCE(ds_queue->split_queue_len);
}

static unsigned long deferred_split_scan(struct shrinker *shrink,
		struct shrink_control *sc)
{
	struct pglist_data *pgdata = NODE_DATA(sc->nid);
	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
	unsigned long flags;
	LIST_HEAD(list);
	struct folio *folio, *next;
	int split = 0;

#ifdef CONFIG_MEMCG
	if (sc->memcg)
		ds_queue = &sc->memcg->deferred_split_queue;
#endif

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	/* Take pin on all head pages to avoid freeing them under us */
	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
							_deferred_list) {
		if (folio_try_get(folio)) {
			list_move(&folio->_deferred_list, &list);
		} else {
			/* We lost race with folio_put() */
			list_del_init(&folio->_deferred_list);
			ds_queue->split_queue_len--;
		}
		if (!--sc->nr_to_scan)
			break;
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

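	/* Queue lock dropped: now try to split each folio pinned above. */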
	list_for_each_entry_safe(folio, next, &list, _deferred_list) {
		if (!folio_trylock(folio))
			goto next;
		/* split_folio() removes the folio from the list on success */
		if (!split_folio(folio))
			split++;
		folio_unlock(folio);
next:
		folio_put(folio);
	}

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	list_splice_tail(&list, &ds_queue->split_queue);
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

	/*
	 * Stop the shrinker if we didn't split any page and the queue is
	 * empty. This can happen if the pages were freed under us.
	 */
	if (!split && list_empty(&ds_queue->split_queue))
		return SHRINK_STOP;
	return split;
}

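/*
 * The deferred-split queues are drained by this shrinker under memory
 * pressure; it is NUMA- and memcg-aware, so each node's and each memory
 * cgroup's queue can be shrunk independently.
 */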
static struct shrinker deferred_split_shrinker = {
	.count_objects = deferred_split_count,
	.scan_objects = deferred_split_scan,
	.seeks = DEFAULT_SEEKS,
	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
		 SHRINKER_NONSLAB,
};

#ifdef CONFIG_DEBUG_FS
static void split_huge_pages_all(void)
{
	struct zone *zone;
	struct page *page;
	struct folio *folio;
	unsigned long pfn, max_zone_pfn;
	unsigned long total = 0, split = 0;

	pr_debug("Split all THPs\n");
	for_each_zone(zone) {
		if (!managed_zone(zone))
			continue;
		max_zone_pfn = zone_end_pfn(zone);
		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
			int nr_pages;

			page = pfn_to_online_page(pfn);
			if (!page || PageTail(page))
				continue;
			folio = page_folio(page);
			if (!folio_try_get(folio))
				continue;

			if (unlikely(page_folio(page) != folio))
				goto next;

			if (zone != folio_zone(folio))
				goto next;

			if (!folio_test_large(folio)
				|| folio_test_hugetlb(folio)
				|| !folio_test_lru(folio))
				goto next;

			total++;
			folio_lock(folio);
			nr_pages = folio_nr_pages(folio);
			if (!split_folio(folio))
				split++;
			pfn += nr_pages - 1;
			folio_unlock(folio);
next:
			folio_put(folio);
			cond_resched();
		}
	}

	pr_debug("%lu of %lu THP split\n", split, total);
}

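/*
 * VMAs the debugfs split interface must skip: special huge mappings,
 * VM_IO mappings, and hugetlb VMAs.
 */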
static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
{
	return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
	       is_vm_hugetlb_page(vma);
}

static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
				unsigned long vaddr_end)
{
	int ret = 0;
	struct task_struct *task;
	struct mm_struct *mm;
	unsigned long total = 0, split = 0;
	unsigned long addr;

	vaddr_start &= PAGE_MASK;
	vaddr_end &= PAGE_MASK;

	/* Find the task_struct from pid */
	rcu_read_lock();
	task = find_task_by_vpid(pid);
	if (!task) {
		rcu_read_unlock();
		ret = -ESRCH;
		goto out;
	}
	get_task_struct(task);
	rcu_read_unlock();

	/* Find the mm_struct */
	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		ret = -EINVAL;
		goto out;
	}

	pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
		 pid, vaddr_start, vaddr_end);

	mmap_read_lock(mm);
	/*
	 * Always increase addr by PAGE_SIZE, since we could have a PTE page
	 * table filled with PTE-mapped THPs, each of which is distinct.
	 */
	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
		struct vm_area_struct *vma = vma_lookup(mm, addr);
		struct page *page;
		struct folio *folio;

		if (!vma)
			break;

		/* Skip special VMAs and hugetlb VMAs */
		if (vma_not_suitable_for_thp_split(vma)) {
			addr = vma->vm_end;
			continue;
		}

		/* FOLL_DUMP to ignore special (like zero) pages */
		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);

		if (IS_ERR_OR_NULL(page))
			continue;

		folio = page_folio(page);
		if (!is_transparent_hugepage(folio))
			goto next;

		total++;
		if (!can_split_folio(folio, NULL))
			goto next;

		if (!folio_trylock(folio))
			goto next;

		if (!split_folio(folio))
			split++;

		folio_unlock(folio);
next:
		folio_put(folio);
		cond_resched();
	}
	mmap_read_unlock(mm);
	mmput(mm);

	pr_debug("%lu of %lu THP split\n", split, total);

out:
	return ret;
}

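/*
 * Split file-backed THPs of @file_path within the page cache offset
 * range [@off_start, @off_end). The file is opened read-only; folios
 * that are not large, or that cannot be locked, are skipped.
 */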
static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
				pgoff_t off_end)
{
	struct filename *file;
	struct file *candidate;
	struct address_space *mapping;
	int ret = -EINVAL;
	pgoff_t index;
	int nr_pages = 1;
	unsigned long total = 0, split = 0;

	file = getname_kernel(file_path);
	if (IS_ERR(file))
		return ret;

	candidate = file_open_name(file, O_RDONLY, 0);
	if (IS_ERR(candidate))
		goto out;

	pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
		 file_path, off_start, off_end);

	mapping = candidate->f_mapping;

	for (index = off_start; index < off_end; index += nr_pages) {
		struct folio *folio = filemap_get_folio(mapping, index);

		nr_pages = 1;
		if (IS_ERR(folio))
			continue;

		if (!folio_test_large(folio))
			goto next;

		total++;
		nr_pages = folio_nr_pages(folio);

		if (!folio_trylock(folio))
			goto next;

		if (!split_folio(folio))
			split++;

		folio_unlock(folio);
next:
		folio_put(folio);
		cond_resched();
	}

	filp_close(candidate, NULL);
	ret = 0;

	pr_debug("%lu of %lu file-backed THP split\n", split, total);
out:
	putname(file);
	return ret;
}

#define MAX_INPUT_BUF_SZ 255

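/*
 * Two input formats are accepted by the debugfs file, matching the
 * sscanf() formats below (addresses and offsets in "0x..." hex):
 *   "<pid>,<vaddr_start>,<vaddr_end>"     - split THPs in a process range
 *   "<file_path>,<off_start>,<off_end>"   - split THPs in a file range
 */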
static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppops)
{
	static DEFINE_MUTEX(split_debug_mutex);
	ssize_t ret;
	/* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
	char input_buf[MAX_INPUT_BUF_SZ];
	int pid;
	unsigned long vaddr_start, vaddr_end;

	ret = mutex_lock_interruptible(&split_debug_mutex);
	if (ret)
		return ret;

	ret = -EFAULT;

	memset(input_buf, 0, MAX_INPUT_BUF_SZ);
	if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
		goto out;

	input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';

	if (input_buf[0] == '/') {
		char *tok;
		char *buf = input_buf;
		char file_path[MAX_INPUT_BUF_SZ];
		pgoff_t off_start = 0, off_end = 0;
		size_t input_len = strlen(input_buf);

		tok = strsep(&buf, ",");
		if (tok) {
			strcpy(file_path, tok);
		} else {
			ret = -EINVAL;
			goto out;
		}

		ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
		if (ret != 2) {
			ret = -EINVAL;
			goto out;
		}
		ret = split_huge_pages_in_file(file_path, off_start, off_end);
		if (!ret)
			ret = input_len;

		goto out;
	}

	ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
	if (ret == 1 && pid == 1) {
		split_huge_pages_all();
		ret = strlen(input_buf);
		goto out;
	} else if (ret != 3) {
		ret = -EINVAL;
		goto out;
	}

	ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
	if (!ret)
		ret = strlen(input_buf);
out:
	mutex_unlock(&split_debug_mutex);
	return ret;
}

static const struct file_operations split_huge_pages_fops = {
	.owner	 = THIS_MODULE,
	.write	 = split_huge_pages_write,
	.llseek  = no_llseek,
};

static int __init split_huge_pages_debugfs(void)
{
	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
			    &split_huge_pages_fops);
	return 0;
}
late_initcall(split_huge_pages_debugfs);
#endif

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
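/*
 * Replace a present huge PMD with a PMD migration entry, transferring
 * the dirty/young/soft-dirty/uffd-wp state into the entry. Fails with
 * -EBUSY (restoring the old PMD) if the anon page is exclusive but
 * cannot be unshared.
 */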
int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
		struct page *page)
{
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	bool anon_exclusive;
	pmd_t pmdval;
	swp_entry_t entry;
	pmd_t pmdswp;

	if (!(pvmw->pmd && !pvmw->pte))
		return 0;

	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);

	/* See page_try_share_anon_rmap(): invalidate PMD first. */
	anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
	if (anon_exclusive && page_try_share_anon_rmap(page)) {
		set_pmd_at(mm, address, pvmw->pmd, pmdval);
		return -EBUSY;
	}

	if (pmd_dirty(pmdval))
		set_page_dirty(page);
	if (pmd_write(pmdval))
		entry = make_writable_migration_entry(page_to_pfn(page));
	else if (anon_exclusive)
		entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
	else
		entry = make_readable_migration_entry(page_to_pfn(page));
	if (pmd_young(pmdval))
		entry = make_migration_entry_young(entry);
	if (pmd_dirty(pmdval))
		entry = make_migration_entry_dirty(entry);
	pmdswp = swp_entry_to_pmd(entry);
	if (pmd_soft_dirty(pmdval))
		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
	if (pmd_uffd_wp(pmdval))
		pmdswp = pmd_swp_mkuffd_wp(pmdswp);
	set_pmd_at(mm, address, pvmw->pmd, pmdswp);
	page_remove_rmap(page, vma, true);
	put_page(page);
	trace_set_migration_pmd(address, pmd_val(pmdswp));

	return 0;
}

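/*
 * Restore a huge PMD from its migration entry once migration completes:
 * rebuild the PMD for @new, re-add the rmap, and install it. No TLB
 * invalidation is needed since the entry was non-present.
 */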
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
{
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	unsigned long haddr = address & HPAGE_PMD_MASK;
	pmd_t pmde;
	swp_entry_t entry;

	if (!(pvmw->pmd && !pvmw->pte))
		return;

	entry = pmd_to_swp_entry(*pvmw->pmd);
	get_page(new);
	pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
	if (pmd_swp_soft_dirty(*pvmw->pmd))
		pmde = pmd_mksoft_dirty(pmde);
	if (is_writable_migration_entry(entry))
		pmde = pmd_mkwrite(pmde, vma);
	if (pmd_swp_uffd_wp(*pvmw->pmd))
		pmde = pmd_mkuffd_wp(pmde);
	if (!is_migration_entry_young(entry))
		pmde = pmd_mkold(pmde);
	/* NOTE: this may set soft-dirty too on some archs */
	if (PageDirty(new) && is_migration_entry_dirty(entry))
		pmde = pmd_mkdirty(pmde);

	if (PageAnon(new)) {
		rmap_t rmap_flags = RMAP_COMPOUND;

		if (!is_readable_migration_entry(entry))
			rmap_flags |= RMAP_EXCLUSIVE;

		page_add_anon_rmap(new, vma, haddr, rmap_flags);
	} else {
		page_add_file_rmap(new, vma, true);
	}
	VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new));
	set_pmd_at(mm, haddr, pvmw->pmd, pmde);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache_pmd(vma, address, pvmw->pmd);
	trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif