// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>
#include <linux/migrate.h>
#include <linux/nospec.h>
#include <linux/delayacct.h>
#include <linux/memory.h>
#include <linux/mm_inline.h>

#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/page_owner.h>
#include "internal.h"
#include "hugetlb_vmemmap.h"

int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];

#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
{
	return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page,
				1 << order);
}
#else
static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
{
	return false;
}
#endif
static unsigned long hugetlb_cma_size __initdata;

__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;
static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;

/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
		unsigned long start, unsigned long end);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);

static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
	if (spool->count)
		return false;
	if (spool->max_hpages != -1)
		return spool->used_hpages == 0;
	if (spool->min_hpages != -1)
		return spool->rsv_hpages == spool->min_hpages;

	return true;
}

static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
						unsigned long irq_flags)
{
	spin_unlock_irqrestore(&spool->lock, irq_flags);

	/* If no pages are used, and no other handles to the subpool
	 * remain, give up any reservations based on minimum size and
	 * free the subpool */
	if (subpool_is_free(spool)) {
		if (spool->min_hpages != -1)
			hugetlb_acct_memory(spool->hstate,
						-spool->min_hpages);
		kfree(spool);
	}
}

struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
						long min_hpages)
{
	struct hugepage_subpool *spool;

	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
	if (!spool)
		return NULL;

	spin_lock_init(&spool->lock);
	spool->count = 1;
	spool->max_hpages = max_hpages;
	spool->hstate = h;
	spool->min_hpages = min_hpages;

	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
		kfree(spool);
		return NULL;
	}
	spool->rsv_hpages = min_hpages;

	return spool;
}

void hugepage_put_subpool(struct hugepage_subpool *spool)
{
	unsigned long flags;

	spin_lock_irqsave(&spool->lock, flags);
	BUG_ON(!spool->count);
	spool->count--;
	unlock_or_release_subpool(spool, flags);
}

/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy the
 * request.  Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward).  The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;

	if (!spool)
		return ret;

	spin_lock_irq(&spool->lock);

	if (spool->max_hpages != -1) {		/* maximum size accounting */
		if ((spool->used_hpages + delta) <= spool->max_hpages)
			spool->used_hpages += delta;
		else {
			ret = -ENOMEM;
			goto unlock_ret;
		}
	}

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->rsv_hpages) {
		if (delta > spool->rsv_hpages) {
			/*
			 * Asking for more reserves than those already taken on
			 * behalf of subpool.  Return difference.
			 */
			ret = delta - spool->rsv_hpages;
			spool->rsv_hpages = 0;
		} else {
			ret = 0;	/* reserves already accounted for */
			spool->rsv_hpages -= delta;
		}
	}

unlock_ret:
	spin_unlock_irq(&spool->lock);
	return ret;
}

/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only be different than the passed value (delta)
 * in the case where a subpool minimum size must be maintained.
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;
	unsigned long flags;

	if (!spool)
		return delta;

	spin_lock_irqsave(&spool->lock, flags);

	if (spool->max_hpages != -1)		/* maximum size accounting */
		spool->used_hpages -= delta;

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
		if (spool->rsv_hpages + delta <= spool->min_hpages)
			ret = 0;
		else
			ret = spool->rsv_hpages + delta - spool->min_hpages;

		spool->rsv_hpages += delta;
		if (spool->rsv_hpages > spool->min_hpages)
			spool->rsv_hpages = spool->min_hpages;
	}

	/*
	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
	 * quota reference, free it now.
	 */
	unlock_or_release_subpool(spool, flags);

	return ret;
}

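/*
 * Worked example of the subpool min/max accounting above (hypothetical
 * numbers, illustrative only): assume a subpool with no maximum and a
 * minimum of 4 huge pages, so 4 pages were reserved globally at creation.
 *
 *	hugepage_subpool_get_pages(spool, 3);	// returns 0: fully covered by
 *						// the reserve, rsv_hpages 4->1
 *	hugepage_subpool_get_pages(spool, 3);	// returns 2: only 1 reserved
 *						// page left, the global pool
 *						// must be adjusted up by 2
 *	hugepage_subpool_put_pages(spool, 6);	// returns 2: 4 of the freed
 *						// pages restore the minimum,
 *						// 2 global reservations drop
 */
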
static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
	return HUGETLBFS_SB(inode->i_sb)->spool;
}

static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
	return subpool_inode(file_inode(vma->vm_file));
}

/*
 * hugetlb vma_lock helper routines
 */
void hugetlb_vma_lock_read(struct vm_area_struct *vma)
{
	if (__vma_shareable_lock(vma)) {
		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

		down_read(&vma_lock->rw_sema);
	} else if (__vma_private_lock(vma)) {
		struct resv_map *resv_map = vma_resv_map(vma);

		down_read(&resv_map->rw_sema);
	}
}

void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
{
	if (__vma_shareable_lock(vma)) {
		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

		up_read(&vma_lock->rw_sema);
	} else if (__vma_private_lock(vma)) {
		struct resv_map *resv_map = vma_resv_map(vma);

		up_read(&resv_map->rw_sema);
	}
}

void hugetlb_vma_lock_write(struct vm_area_struct *vma)
{
	if (__vma_shareable_lock(vma)) {
		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

		down_write(&vma_lock->rw_sema);
	} else if (__vma_private_lock(vma)) {
		struct resv_map *resv_map = vma_resv_map(vma);

		down_write(&resv_map->rw_sema);
	}
}

void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
{
	if (__vma_shareable_lock(vma)) {
		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

		up_write(&vma_lock->rw_sema);
	} else if (__vma_private_lock(vma)) {
		struct resv_map *resv_map = vma_resv_map(vma);

		up_write(&resv_map->rw_sema);
	}
}

int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
{

	if (__vma_shareable_lock(vma)) {
		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

		return down_write_trylock(&vma_lock->rw_sema);
	} else if (__vma_private_lock(vma)) {
		struct resv_map *resv_map = vma_resv_map(vma);

		return down_write_trylock(&resv_map->rw_sema);
	}

	return 1;
}

void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
{
	if (__vma_shareable_lock(vma)) {
		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

		lockdep_assert_held(&vma_lock->rw_sema);
	} else if (__vma_private_lock(vma)) {
		struct resv_map *resv_map = vma_resv_map(vma);

		lockdep_assert_held(&resv_map->rw_sema);
	}
}

void hugetlb_vma_lock_release(struct kref *kref)
{
	struct hugetlb_vma_lock *vma_lock = container_of(kref,
			struct hugetlb_vma_lock, refs);

	kfree(vma_lock);
}

static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
{
	struct vm_area_struct *vma = vma_lock->vma;

	/*
	 * vma_lock structure may or may not be released as a result of put,
	 * it certainly will no longer be attached to vma so clear pointer.
	 * Semaphore synchronizes access to vma_lock->vma field.
	 */
	vma_lock->vma = NULL;
	vma->vm_private_data = NULL;
	up_write(&vma_lock->rw_sema);
	kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
}

static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
{
	if (__vma_shareable_lock(vma)) {
		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

		__hugetlb_vma_unlock_write_put(vma_lock);
	} else if (__vma_private_lock(vma)) {
		struct resv_map *resv_map = vma_resv_map(vma);

		/* no free for anon vmas, but still need to unlock */
		up_write(&resv_map->rw_sema);
	}
}

static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
{
	/*
	 * Only present in sharable vmas.
	 */
	if (!vma || !__vma_shareable_lock(vma))
		return;

	if (vma->vm_private_data) {
		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

		down_write(&vma_lock->rw_sema);
		__hugetlb_vma_unlock_write_put(vma_lock);
	}
}

static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
{
	struct hugetlb_vma_lock *vma_lock;

	/* Only establish in (flags) sharable vmas */
	if (!vma || !(vma->vm_flags & VM_MAYSHARE))
		return;

	/* Should never get here with non-NULL vm_private_data */
	if (vma->vm_private_data)
		return;

	vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
	if (!vma_lock) {
		/*
		 * If we can not allocate structure, then vma can not
		 * participate in pmd sharing.  This is only a possible
		 * performance enhancement and memory saving issue.
		 * However, the lock is also used to synchronize page
		 * faults with truncation.  If the lock is not present,
		 * unlikely races could leave pages in a file past i_size
		 * until the file is removed.  Warn in the unlikely case of
		 * allocation failure.
		 */
		pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
		return;
	}

	kref_init(&vma_lock->refs);
	init_rwsem(&vma_lock->rw_sema);
	vma_lock->vma = vma;
	vma->vm_private_data = vma_lock;
}

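/*
 * Rough usage sketch for the vma_lock helpers above (illustrative only, not
 * a real call site; error handling omitted):
 *
 *	hugetlb_vma_lock_read(vma);	// e.g. around a fault that may walk
 *					// or instantiate a shared PMD
 *	...handle fault...
 *	hugetlb_vma_unlock_read(vma);
 *
 *	hugetlb_vma_lock_write(vma);	// e.g. truncation/unmap paths that
 *					// must exclude concurrent faults
 *	...unmap range / unshare PMDs...
 *	hugetlb_vma_unlock_write(vma);
 */
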
/* Helper that removes a struct file_region from the resv_map cache and returns
 * it for use.
 */
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
	struct file_region *nrg;

	VM_BUG_ON(resv->region_cache_count <= 0);

	resv->region_cache_count--;
	nrg = list_first_entry(&resv->region_cache, struct file_region, link);
	list_del(&nrg->link);

	nrg->from = from;
	nrg->to = to;

	return nrg;
}

static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
					      struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	nrg->reservation_counter = rg->reservation_counter;
	nrg->css = rg->css;
	if (rg->css)
		css_get(rg->css);
#endif
}

/* Helper that records hugetlb_cgroup uncharge info. */
static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
						struct hstate *h,
						struct resv_map *resv,
						struct file_region *nrg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (h_cg) {
		nrg->reservation_counter =
			&h_cg->rsvd_hugepage[hstate_index(h)];
		nrg->css = &h_cg->css;
		/*
		 * The caller will hold exactly one h_cg->css reference for the
		 * whole contiguous reservation region. But this area might be
		 * scattered when there are already some file_regions residing
		 * in it. As a result, many file_regions may share only one css
		 * reference. In order to ensure that one file_region must hold
		 * exactly one h_cg->css reference, we should do css_get for
		 * each file_region and leave the reference held by caller
		 * untouched.
		 */
		css_get(&h_cg->css);
		if (!resv->pages_per_hpage)
			resv->pages_per_hpage = pages_per_huge_page(h);
		/* pages_per_hpage should be the same for all entries in
		 * a resv_map.
		 */
		VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
	} else {
		nrg->reservation_counter = NULL;
		nrg->css = NULL;
	}
#endif
}

static void put_uncharge_info(struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (rg->css)
		css_put(rg->css);
#endif
}

static bool has_same_uncharge_info(struct file_region *rg,
				   struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
	return rg->reservation_counter == org->reservation_counter &&
	       rg->css == org->css;

#else
	return true;
#endif
}

static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
{
	struct file_region *nrg, *prg;

	prg = list_prev_entry(rg, link);
	if (&prg->link != &resv->regions && prg->to == rg->from &&
	    has_same_uncharge_info(prg, rg)) {
		prg->to = rg->to;

		list_del(&rg->link);
		put_uncharge_info(rg);
		kfree(rg);

		rg = prg;
	}

	nrg = list_next_entry(rg, link);
	if (&nrg->link != &resv->regions && nrg->from == rg->to &&
	    has_same_uncharge_info(nrg, rg)) {
		nrg->from = rg->from;

		list_del(&rg->link);
		put_uncharge_info(rg);
		kfree(rg);
	}
}

static inline long
hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
		     long to, struct hstate *h, struct hugetlb_cgroup *cg,
		     long *regions_needed)
{
	struct file_region *nrg;

	if (!regions_needed) {
		nrg = get_file_region_entry_from_cache(map, from, to);
		record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
		list_add(&nrg->link, rg);
		coalesce_file_region(map, nrg);
	} else
		*regions_needed += 1;

	return to - from;
}

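/*
 * Small worked example for coalesce_file_region() above (hypothetical
 * offsets): if the map already holds [0, 2) and [5, 8) and a new region
 * [2, 5) with the same uncharge info is inserted between them, the previous
 * entry is first extended to [0, 5) and then merged with the following one,
 * leaving a single [0, 8) entry.  Regions with different cgroup uncharge
 * info are never merged.
 */
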
/*
 * Must be called with resv->lock held.
 *
 * Calling this with regions_needed != NULL will count the number of pages
 * to be added but will not modify the linked list. And regions_needed will
 * indicate the number of file_regions needed in the cache to carry out the
 * addition of the regions for this range.
 */
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
				     struct hugetlb_cgroup *h_cg,
				     struct hstate *h, long *regions_needed)
{
	long add = 0;
	struct list_head *head = &resv->regions;
	long last_accounted_offset = f;
	struct file_region *iter, *trg = NULL;
	struct list_head *rg = NULL;

	if (regions_needed)
		*regions_needed = 0;

	/* In this loop, we essentially handle an entry for the range
	 * [last_accounted_offset, iter->from), at every iteration, with some
	 * bounds checking.
	 */
	list_for_each_entry_safe(iter, trg, head, link) {
		/* Skip irrelevant regions that start before our range. */
		if (iter->from < f) {
			/* If this region ends after the last accounted offset,
			 * then we need to update last_accounted_offset.
			 */
			if (iter->to > last_accounted_offset)
				last_accounted_offset = iter->to;
			continue;
		}

		/* When we find a region that starts beyond our range, we've
		 * finished.
		 */
		if (iter->from >= t) {
			rg = iter->link.prev;
			break;
		}

		/* Add an entry for last_accounted_offset -> iter->from, and
		 * update last_accounted_offset.
		 */
		if (iter->from > last_accounted_offset)
			add += hugetlb_resv_map_add(resv, iter->link.prev,
						    last_accounted_offset,
						    iter->from, h, h_cg,
						    regions_needed);

		last_accounted_offset = iter->to;
	}

	/* Handle the case where our range extends beyond
	 * last_accounted_offset.
	 */
	if (!rg)
		rg = head->prev;
	if (last_accounted_offset < t)
		add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
					    t, h, h_cg, regions_needed);

	return add;
}

/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
 */
static int allocate_file_region_entries(struct resv_map *resv,
					int regions_needed)
	__must_hold(&resv->lock)
{
	LIST_HEAD(allocated_regions);
	int to_allocate = 0, i = 0;
	struct file_region *trg = NULL, *rg = NULL;

	VM_BUG_ON(regions_needed < 0);

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * the number of in progress add operations plus regions_needed.
	 *
	 * This is a while loop because when we drop the lock, some other call
	 * to region_add or region_del may have consumed some region_entries,
	 * so we keep looping here until we finally have enough entries for
	 * (adds_in_progress + regions_needed).
	 */
	while (resv->region_cache_count <
	       (resv->adds_in_progress + regions_needed)) {
		to_allocate = resv->adds_in_progress + regions_needed -
			      resv->region_cache_count;

		/* At this point, we should have enough entries in the cache
		 * for all the existing adds_in_progress. We should only be
		 * needing to allocate for regions_needed.
		 */
		VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);

		spin_unlock(&resv->lock);
		for (i = 0; i < to_allocate; i++) {
			trg = kmalloc(sizeof(*trg), GFP_KERNEL);
			if (!trg)
				goto out_of_memory;
			list_add(&trg->link, &allocated_regions);
		}

		spin_lock(&resv->lock);

		list_splice(&allocated_regions, &resv->region_cache);
		resv->region_cache_count += to_allocate;
	}

	return 0;

out_of_memory:
	list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
		list_del(&rg->link);
		kfree(rg);
	}
	return -ENOMEM;
}

/*
 * Add the huge page range represented by [f, t) to the reserve
 * map.  Regions will be taken from the cache to fill in this range.
 * Sufficient regions should exist in the cache due to the previous
 * call to region_chg with the same range, but in some cases the cache will not
 * have sufficient entries due to races with other code doing region_add or
 * region_del.  The extra needed entries will be allocated.
 *
 * regions_needed is the out value provided by a previous call to region_chg.
 *
 * Return the number of new huge pages added to the map.  This number is greater
 * than or equal to zero.  If file_region entries needed to be allocated for
 * this operation and we were not able to allocate, it returns -ENOMEM.
 * region_add of regions of length 1 never allocate file_regions and cannot
 * fail; region_chg will always allocate at least 1 entry and a region_add for
 * 1 page will only require at most 1 entry.
 */
static long region_add(struct resv_map *resv, long f, long t,
		       long in_regions_needed, struct hstate *h,
		       struct hugetlb_cgroup *h_cg)
{
	long add = 0, actual_regions_needed = 0;

	spin_lock(&resv->lock);
retry:

	/* Count how many regions are actually needed to execute this add. */
	add_reservation_in_range(resv, f, t, NULL, NULL,
				 &actual_regions_needed);

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * this add operation. Note that actual_regions_needed may be greater
	 * than in_regions_needed, as the resv_map may have been modified since
	 * the region_chg call. In this case, we need to make sure that we
	 * allocate extra entries, such that we have enough for all the
	 * existing adds_in_progress, plus the excess needed for this
	 * operation.
	 */
	if (actual_regions_needed > in_regions_needed &&
	    resv->region_cache_count <
		    resv->adds_in_progress +
			    (actual_regions_needed - in_regions_needed)) {
		/* region_add operation of range 1 should never need to
		 * allocate file_region entries.
		 */
		VM_BUG_ON(t - f <= 1);

		if (allocate_file_region_entries(
			    resv, actual_regions_needed - in_regions_needed)) {
			return -ENOMEM;
		}

		goto retry;
	}

	add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);

	resv->adds_in_progress -= in_regions_needed;

	spin_unlock(&resv->lock);
	return add;
}

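/*
 * Illustrative calling sequence for the region_* API (not a real call site;
 * the surrounding reservation/quota logic and condition are hypothetical):
 *
 *	long needed;
 *	long chg = region_chg(resv, f, t, &needed);	// count + pre-allocate
 *	if (chg < 0)
 *		return chg;
 *	if (some_charge_or_quota_check_fails) {		// hypothetical
 *		region_abort(resv, f, t, needed);	// undo adds_in_progress
 *		return -ENOSPC;
 *	}
 *	region_add(resv, f, t, needed, h, h_cg);	// commit the range
 *
 * region_chg() and region_abort() are defined below; hugetlb reservation
 * callers pair region_chg() with exactly one of region_add() or
 * region_abort().
 */
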
/*
 * Examine the existing reserve map and determine how many
 * huge pages in the specified range [f, t) are NOT currently
 * represented.  This routine is called before a subsequent
 * call to region_add that will actually modify the reserve
 * map to add the specified range [f, t).  region_chg does
 * not change the number of huge pages represented by the
 * map.  A number of new file_region structures is added to the cache as a
 * placeholder, for the subsequent region_add call to use. At least 1
 * file_region structure is added.
 *
 * out_regions_needed is the number of regions added to the
 * resv->adds_in_progress.  This value needs to be provided to a follow up call
 * to region_add or region_abort for proper accounting.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t).  This number is greater than or equal
 * to zero.  -ENOMEM is returned if a new file_region structure or cache entry
 * is needed and can not be allocated.
 */
static long region_chg(struct resv_map *resv, long f, long t,
		       long *out_regions_needed)
{
	long chg = 0;

	spin_lock(&resv->lock);

	/* Count how many hugepages in this range are NOT represented. */
	chg = add_reservation_in_range(resv, f, t, NULL, NULL,
				       out_regions_needed);

	if (*out_regions_needed == 0)
		*out_regions_needed = 1;

	if (allocate_file_region_entries(resv, *out_regions_needed))
		return -ENOMEM;

	resv->adds_in_progress += *out_regions_needed;

	spin_unlock(&resv->lock);
	return chg;
}

/*
 * Abort the in progress add operation.  The adds_in_progress field
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add.  Operations are sometimes
 * aborted after the call to region_chg.  In such cases, region_abort
 * is called to decrement the adds_in_progress counter. regions_needed
 * is the value returned by the region_chg call; it is used to decrement
 * the adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine.  They are kept to make reading the calling code easier as
 * arguments will match the associated region_chg call.
 */
static void region_abort(struct resv_map *resv, long f, long t,
			 long regions_needed)
{
	spin_lock(&resv->lock);
	VM_BUG_ON(!resv->region_cache_count);
	resv->adds_in_progress -= regions_needed;
	spin_unlock(&resv->lock);
}

/*
 * Delete the specified range [f, t) from the reserve map.  If the
 * t parameter is LONG_MAX, this indicates that ALL regions after f
 * should be deleted.  Locate the regions which intersect [f, t)
 * and either trim, delete or split the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.
 * In the normal case, the return value is zero or more.  In the
 * case where a region must be split, a new region descriptor must
 * be allocated.  If the allocation fails, -ENOMEM will be returned.
 * NOTE: If the parameter t == LONG_MAX, then we will never split
 * a region and possibly return -ENOMEM.  Callers specifying
 * t == LONG_MAX do not need to check for -ENOMEM error.
 */
static long region_del(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *trg;
	struct file_region *nrg = NULL;
	long del = 0;

retry:
	spin_lock(&resv->lock);
	list_for_each_entry_safe(rg, trg, head, link) {
		/*
		 * Skip regions before the range to be deleted.  file_region
		 * ranges are normally of the form [from, to).  However, there
		 * may be a "placeholder" entry in the map which is of the form
		 * (from, to) with from == to.  Check for placeholder entries
		 * at the beginning of the range to be deleted.
		 */
		if (rg->to <= f && (rg->to != rg->from || rg->to != f))
			continue;

		if (rg->from >= t)
			break;

		if (f > rg->from && t < rg->to) { /* Must split region */
			/*
			 * Check for an entry in the cache before dropping
			 * lock and attempting allocation.
			 */
			if (!nrg &&
			    resv->region_cache_count > resv->adds_in_progress) {
				nrg = list_first_entry(&resv->region_cache,
						       struct file_region,
						       link);
				list_del(&nrg->link);
				resv->region_cache_count--;
			}

			if (!nrg) {
				spin_unlock(&resv->lock);
				nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
				if (!nrg)
					return -ENOMEM;
				goto retry;
			}

			del += t - f;
			hugetlb_cgroup_uncharge_file_region(
				resv, rg, t - f, false);

			/* New entry for end of split region */
			nrg->from = t;
			nrg->to = rg->to;

			copy_hugetlb_cgroup_uncharge_info(nrg, rg);

			INIT_LIST_HEAD(&nrg->link);

			/* Original entry is trimmed */
			rg->to = f;

			list_add(&nrg->link, &rg->link);
			nrg = NULL;
			break;
		}

		if (f <= rg->from && t >= rg->to) { /* Remove entire region */
			del += rg->to - rg->from;
			hugetlb_cgroup_uncharge_file_region(resv, rg,
							    rg->to - rg->from, true);
			list_del(&rg->link);
			kfree(rg);
			continue;
		}

		if (f <= rg->from) {	/* Trim beginning of region */
			hugetlb_cgroup_uncharge_file_region(resv, rg,
							    t - rg->from, false);

			del += t - rg->from;
			rg->from = t;
		} else {		/* Trim end of region */
			hugetlb_cgroup_uncharge_file_region(resv, rg,
							    rg->to - f, false);

			del += rg->to - f;
			rg->to = f;
		}
	}

	spin_unlock(&resv->lock);
	kfree(nrg);
	return del;
}

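/*
 * Worked example for region_del() above (hypothetical offsets): with a
 * single reserve map entry [0, 8),
 *
 *	region_del(resv, 2, 4)	 removes 2 pages and splits the entry into
 *				 [0, 2) and [4, 8); the split case is the only
 *				 one that may need a new descriptor and fail;
 *	region_del(resv, 0, 4)	 would instead trim the entry to [4, 8);
 *	region_del(resv, 0, LONG_MAX)  deletes everything and never splits,
 *				 so it cannot return -ENOMEM.
 */
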
/*
 * A rare out of memory error was encountered which prevented removal of
 * the reserve map region for a page.  The huge page itself was freed
 * and removed from the page cache.  This routine will adjust the subpool
 * usage count, and the global reserve count if needed.  By incrementing
 * these counts, the reserve map entry which could not be deleted will
 * appear as a "reserved" entry instead of simply dangling with incorrect
 * counts.
 */
void hugetlb_fix_reserve_counts(struct inode *inode)
{
	struct hugepage_subpool *spool = subpool_inode(inode);
	long rsv_adjust;
	bool reserved = false;

	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
	if (rsv_adjust > 0) {
		struct hstate *h = hstate_inode(inode);

		if (!hugetlb_acct_memory(h, 1))
			reserved = true;
	} else if (!rsv_adjust) {
		reserved = true;
	}

	if (!reserved)
		pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
}

/*
 * Count and return the number of huge pages in the reserve map
 * that intersect with the range [f, t).
 */
static long region_count(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg;
	long chg = 0;

	spin_lock(&resv->lock);
	/* Locate each segment we overlap with, and count that overlap. */
	list_for_each_entry(rg, head, link) {
		long seg_from;
		long seg_to;

		if (rg->to <= f)
			continue;
		if (rg->from >= t)
			break;

		seg_from = max(rg->from, f);
		seg_to = min(rg->to, t);

		chg += seg_to - seg_from;
	}
	spin_unlock(&resv->lock);

	return chg;
}

/*
 * Convert the address within this vma to the page offset within
 * the mapping, in pagecache page units; huge pages here.
 */
static pgoff_t vma_hugecache_offset(struct hstate *h,
			struct vm_area_struct *vma, unsigned long address)
{
	return ((address - vma->vm_start) >> huge_page_shift(h)) +
			(vma->vm_pgoff >> huge_page_order(h));
}

pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
				     unsigned long address)
{
	return vma_hugecache_offset(hstate_vma(vma), vma, address);
}
EXPORT_SYMBOL_GPL(linear_hugepage_index);

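/*
 * Worked example for vma_hugecache_offset() (hypothetical values, assuming
 * 2 MB huge pages and 4 KB base pages, so huge_page_shift(h) == 21 and
 * huge_page_order(h) == 9): with vma->vm_start == 0x40000000,
 * vma->vm_pgoff == 0x400 (file offset of 4 MB in base-page units, i.e. two
 * huge pages) and address == 0x40600000:
 *
 *	((0x40600000 - 0x40000000) >> 21) + (0x400 >> 9) = 3 + 2 = 5
 *
 * so the address maps to huge-page index 5 within the file.
 */
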
/**
 * vma_kernel_pagesize - Page size granularity for this VMA.
 * @vma: The user mapping.
 *
 * Folios in this VMA will be aligned to, and at least the size of the
 * number of bytes returned by this function.
 *
 * Return: The default size of the folios allocated when backing a VMA.
 */
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
	if (vma->vm_ops && vma->vm_ops->pagesize)
		return vma->vm_ops->pagesize(vma);
	return PAGE_SIZE;
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);

/*
 * Return the page size being used by the MMU to back a VMA. In the majority
 * of cases, the page size used by the kernel matches the MMU size. On
 * architectures where it differs, an architecture-specific 'strong'
 * version of this symbol is required.
 */
__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	return vma_kernel_pagesize(vma);
}

/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
#define HPAGE_RESV_OWNER    (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have its future faults succeed.
 *
 * With the exception of hugetlb_dup_vma_private() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held.  It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file, this region map represents the backing file
 * pages which have ever had a reservation assigned, and this persists even
 * after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it, this region map represents those offsets which have consumed
 * reservation, i.e. where pages have been instantiated.
 */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
	return (unsigned long)vma->vm_private_data;
}

static void set_vma_private_data(struct vm_area_struct *vma,
					unsigned long value)
{
	vma->vm_private_data = (void *)value;
}

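/*
 * Illustrative sketch (not actual helpers from this file): because a
 * struct resv_map * is at least pointer-aligned, its low bits are free to
 * carry the HPAGE_RESV_* flags, roughly:
 *
 *	unsigned long v = get_vma_private_data(vma);
 *	struct resv_map *map = (struct resv_map *)(v & ~HPAGE_RESV_MASK);
 *	bool owner = v & HPAGE_RESV_OWNER;
 *
 * For private mappings, the vma_resv_map() helper forward-declared above
 * applies this masking when recovering the resv_map pointer.
 */
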
105862306a36Sopenharmony_ci */ 105962306a36Sopenharmony_cistatic unsigned long get_vma_private_data(struct vm_area_struct *vma) 106062306a36Sopenharmony_ci{ 106162306a36Sopenharmony_ci return (unsigned long)vma->vm_private_data; 106262306a36Sopenharmony_ci} 106362306a36Sopenharmony_ci 106462306a36Sopenharmony_cistatic void set_vma_private_data(struct vm_area_struct *vma, 106562306a36Sopenharmony_ci unsigned long value) 106662306a36Sopenharmony_ci{ 106762306a36Sopenharmony_ci vma->vm_private_data = (void *)value; 106862306a36Sopenharmony_ci} 106962306a36Sopenharmony_ci 107062306a36Sopenharmony_cistatic void 107162306a36Sopenharmony_ciresv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map, 107262306a36Sopenharmony_ci struct hugetlb_cgroup *h_cg, 107362306a36Sopenharmony_ci struct hstate *h) 107462306a36Sopenharmony_ci{ 107562306a36Sopenharmony_ci#ifdef CONFIG_CGROUP_HUGETLB 107662306a36Sopenharmony_ci if (!h_cg || !h) { 107762306a36Sopenharmony_ci resv_map->reservation_counter = NULL; 107862306a36Sopenharmony_ci resv_map->pages_per_hpage = 0; 107962306a36Sopenharmony_ci resv_map->css = NULL; 108062306a36Sopenharmony_ci } else { 108162306a36Sopenharmony_ci resv_map->reservation_counter = 108262306a36Sopenharmony_ci &h_cg->rsvd_hugepage[hstate_index(h)]; 108362306a36Sopenharmony_ci resv_map->pages_per_hpage = pages_per_huge_page(h); 108462306a36Sopenharmony_ci resv_map->css = &h_cg->css; 108562306a36Sopenharmony_ci } 108662306a36Sopenharmony_ci#endif 108762306a36Sopenharmony_ci} 108862306a36Sopenharmony_ci 108962306a36Sopenharmony_cistruct resv_map *resv_map_alloc(void) 109062306a36Sopenharmony_ci{ 109162306a36Sopenharmony_ci struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); 109262306a36Sopenharmony_ci struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); 109362306a36Sopenharmony_ci 109462306a36Sopenharmony_ci if (!resv_map || !rg) { 109562306a36Sopenharmony_ci kfree(resv_map); 109662306a36Sopenharmony_ci kfree(rg); 109762306a36Sopenharmony_ci return NULL; 109862306a36Sopenharmony_ci } 109962306a36Sopenharmony_ci 110062306a36Sopenharmony_ci kref_init(&resv_map->refs); 110162306a36Sopenharmony_ci spin_lock_init(&resv_map->lock); 110262306a36Sopenharmony_ci INIT_LIST_HEAD(&resv_map->regions); 110362306a36Sopenharmony_ci init_rwsem(&resv_map->rw_sema); 110462306a36Sopenharmony_ci 110562306a36Sopenharmony_ci resv_map->adds_in_progress = 0; 110662306a36Sopenharmony_ci /* 110762306a36Sopenharmony_ci * Initialize these to 0. On shared mappings, 0's here indicate these 110862306a36Sopenharmony_ci * fields don't do cgroup accounting. On private mappings, these will be 110962306a36Sopenharmony_ci * re-initialized to the proper values, to indicate that hugetlb cgroup 111062306a36Sopenharmony_ci * reservations are to be un-charged from here. 
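 *
 * For instance (hypothetical values, assuming 4 KiB base pages and a
 * 2 MB hstate): a private-mapping caller that does charge a cgroup would
 * follow up with resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h),
 * after which pages_per_hpage == 512 and css points at the charged cgroup.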
111162306a36Sopenharmony_ci */ 111262306a36Sopenharmony_ci resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL); 111362306a36Sopenharmony_ci 111462306a36Sopenharmony_ci INIT_LIST_HEAD(&resv_map->region_cache); 111562306a36Sopenharmony_ci list_add(&rg->link, &resv_map->region_cache); 111662306a36Sopenharmony_ci resv_map->region_cache_count = 1; 111762306a36Sopenharmony_ci 111862306a36Sopenharmony_ci return resv_map; 111962306a36Sopenharmony_ci} 112062306a36Sopenharmony_ci 112162306a36Sopenharmony_civoid resv_map_release(struct kref *ref) 112262306a36Sopenharmony_ci{ 112362306a36Sopenharmony_ci struct resv_map *resv_map = container_of(ref, struct resv_map, refs); 112462306a36Sopenharmony_ci struct list_head *head = &resv_map->region_cache; 112562306a36Sopenharmony_ci struct file_region *rg, *trg; 112662306a36Sopenharmony_ci 112762306a36Sopenharmony_ci /* Clear out any active regions before we release the map. */ 112862306a36Sopenharmony_ci region_del(resv_map, 0, LONG_MAX); 112962306a36Sopenharmony_ci 113062306a36Sopenharmony_ci /* ... and any entries left in the cache */ 113162306a36Sopenharmony_ci list_for_each_entry_safe(rg, trg, head, link) { 113262306a36Sopenharmony_ci list_del(&rg->link); 113362306a36Sopenharmony_ci kfree(rg); 113462306a36Sopenharmony_ci } 113562306a36Sopenharmony_ci 113662306a36Sopenharmony_ci VM_BUG_ON(resv_map->adds_in_progress); 113762306a36Sopenharmony_ci 113862306a36Sopenharmony_ci kfree(resv_map); 113962306a36Sopenharmony_ci} 114062306a36Sopenharmony_ci 114162306a36Sopenharmony_cistatic inline struct resv_map *inode_resv_map(struct inode *inode) 114262306a36Sopenharmony_ci{ 114362306a36Sopenharmony_ci /* 114462306a36Sopenharmony_ci * At inode evict time, i_mapping may not point to the original 114562306a36Sopenharmony_ci * address space within the inode. This original address space 114662306a36Sopenharmony_ci * contains the pointer to the resv_map. So, always use the 114762306a36Sopenharmony_ci * address space embedded within the inode. 114862306a36Sopenharmony_ci * The VERY common case is inode->mapping == &inode->i_data but, 114962306a36Sopenharmony_ci * this may not be true for device special inodes. 
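 * Hence the lookup below reads (&inode->i_data)->private_data directly
 * rather than going through inode->i_mapping.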
115062306a36Sopenharmony_ci */ 115162306a36Sopenharmony_ci return (struct resv_map *)(&inode->i_data)->private_data; 115262306a36Sopenharmony_ci} 115362306a36Sopenharmony_ci 115462306a36Sopenharmony_cistatic struct resv_map *vma_resv_map(struct vm_area_struct *vma) 115562306a36Sopenharmony_ci{ 115662306a36Sopenharmony_ci VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 115762306a36Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE) { 115862306a36Sopenharmony_ci struct address_space *mapping = vma->vm_file->f_mapping; 115962306a36Sopenharmony_ci struct inode *inode = mapping->host; 116062306a36Sopenharmony_ci 116162306a36Sopenharmony_ci return inode_resv_map(inode); 116262306a36Sopenharmony_ci 116362306a36Sopenharmony_ci } else { 116462306a36Sopenharmony_ci return (struct resv_map *)(get_vma_private_data(vma) & 116562306a36Sopenharmony_ci ~HPAGE_RESV_MASK); 116662306a36Sopenharmony_ci } 116762306a36Sopenharmony_ci} 116862306a36Sopenharmony_ci 116962306a36Sopenharmony_cistatic void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 117062306a36Sopenharmony_ci{ 117162306a36Sopenharmony_ci VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 117262306a36Sopenharmony_ci VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 117362306a36Sopenharmony_ci 117462306a36Sopenharmony_ci set_vma_private_data(vma, (unsigned long)map); 117562306a36Sopenharmony_ci} 117662306a36Sopenharmony_ci 117762306a36Sopenharmony_cistatic void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 117862306a36Sopenharmony_ci{ 117962306a36Sopenharmony_ci VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 118062306a36Sopenharmony_ci VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 118162306a36Sopenharmony_ci 118262306a36Sopenharmony_ci set_vma_private_data(vma, get_vma_private_data(vma) | flags); 118362306a36Sopenharmony_ci} 118462306a36Sopenharmony_ci 118562306a36Sopenharmony_cistatic int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) 118662306a36Sopenharmony_ci{ 118762306a36Sopenharmony_ci VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 118862306a36Sopenharmony_ci 118962306a36Sopenharmony_ci return (get_vma_private_data(vma) & flag) != 0; 119062306a36Sopenharmony_ci} 119162306a36Sopenharmony_ci 119262306a36Sopenharmony_cibool __vma_private_lock(struct vm_area_struct *vma) 119362306a36Sopenharmony_ci{ 119462306a36Sopenharmony_ci return !(vma->vm_flags & VM_MAYSHARE) && 119562306a36Sopenharmony_ci get_vma_private_data(vma) & ~HPAGE_RESV_MASK && 119662306a36Sopenharmony_ci is_vma_resv_set(vma, HPAGE_RESV_OWNER); 119762306a36Sopenharmony_ci} 119862306a36Sopenharmony_ci 119962306a36Sopenharmony_civoid hugetlb_dup_vma_private(struct vm_area_struct *vma) 120062306a36Sopenharmony_ci{ 120162306a36Sopenharmony_ci VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 120262306a36Sopenharmony_ci /* 120362306a36Sopenharmony_ci * Clear vm_private_data 120462306a36Sopenharmony_ci * - For shared mappings this is a per-vma semaphore that may be 120562306a36Sopenharmony_ci * allocated in a subsequent call to hugetlb_vm_op_open. 120662306a36Sopenharmony_ci * Before clearing, make sure pointer is not associated with vma 120762306a36Sopenharmony_ci * as this will leak the structure. This is the case when called 120862306a36Sopenharmony_ci * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already 120962306a36Sopenharmony_ci * been called to allocate a new structure. 121062306a36Sopenharmony_ci * - For MAP_PRIVATE mappings, this is the reserve map which does 121162306a36Sopenharmony_ci * not apply to children. 
Faults generated by the children are 121262306a36Sopenharmony_ci * not guaranteed to succeed, even if read-only. 121362306a36Sopenharmony_ci */ 121462306a36Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE) { 121562306a36Sopenharmony_ci struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; 121662306a36Sopenharmony_ci 121762306a36Sopenharmony_ci if (vma_lock && vma_lock->vma != vma) 121862306a36Sopenharmony_ci vma->vm_private_data = NULL; 121962306a36Sopenharmony_ci } else 122062306a36Sopenharmony_ci vma->vm_private_data = NULL; 122162306a36Sopenharmony_ci} 122262306a36Sopenharmony_ci 122362306a36Sopenharmony_ci/* 122462306a36Sopenharmony_ci * Reset and decrement one ref on hugepage private reservation. 122562306a36Sopenharmony_ci * Called with mm->mmap_lock writer semaphore held. 122662306a36Sopenharmony_ci * This function should be only used by move_vma() and operate on 122762306a36Sopenharmony_ci * same sized vma. It should never come here with last ref on the 122862306a36Sopenharmony_ci * reservation. 122962306a36Sopenharmony_ci */ 123062306a36Sopenharmony_civoid clear_vma_resv_huge_pages(struct vm_area_struct *vma) 123162306a36Sopenharmony_ci{ 123262306a36Sopenharmony_ci /* 123362306a36Sopenharmony_ci * Clear the old hugetlb private page reservation. 123462306a36Sopenharmony_ci * It has already been transferred to new_vma. 123562306a36Sopenharmony_ci * 123662306a36Sopenharmony_ci * During a mremap() operation of a hugetlb vma we call move_vma() 123762306a36Sopenharmony_ci * which copies vma into new_vma and unmaps vma. After the copy 123862306a36Sopenharmony_ci * operation both new_vma and vma share a reference to the resv_map 123962306a36Sopenharmony_ci * struct, and at that point vma is about to be unmapped. We don't 124062306a36Sopenharmony_ci * want to return the reservation to the pool at unmap of vma because 124162306a36Sopenharmony_ci * the reservation still lives on in new_vma, so simply decrement the 124262306a36Sopenharmony_ci * ref here and remove the resv_map reference from this vma. 124362306a36Sopenharmony_ci */ 124462306a36Sopenharmony_ci struct resv_map *reservations = vma_resv_map(vma); 124562306a36Sopenharmony_ci 124662306a36Sopenharmony_ci if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 124762306a36Sopenharmony_ci resv_map_put_hugetlb_cgroup_uncharge_info(reservations); 124862306a36Sopenharmony_ci kref_put(&reservations->refs, resv_map_release); 124962306a36Sopenharmony_ci } 125062306a36Sopenharmony_ci 125162306a36Sopenharmony_ci hugetlb_dup_vma_private(vma); 125262306a36Sopenharmony_ci} 125362306a36Sopenharmony_ci 125462306a36Sopenharmony_ci/* Returns true if the VMA has associated reserve pages */ 125562306a36Sopenharmony_cistatic bool vma_has_reserves(struct vm_area_struct *vma, long chg) 125662306a36Sopenharmony_ci{ 125762306a36Sopenharmony_ci if (vma->vm_flags & VM_NORESERVE) { 125862306a36Sopenharmony_ci /* 125962306a36Sopenharmony_ci * This address is already reserved by other process(chg == 0), 126062306a36Sopenharmony_ci * so, we should decrement reserved count. Without decrementing, 126162306a36Sopenharmony_ci * reserve count remains after releasing inode, because this 126262306a36Sopenharmony_ci * allocated page will go into page cache and is regarded as 126362306a36Sopenharmony_ci * coming from reserved pool in releasing step. Currently, we 126462306a36Sopenharmony_ci * don't have any other solution to deal with this situation 126562306a36Sopenharmony_ci * properly, so add work-around here. 
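 *
 * In short: under VM_NORESERVE only a shared mapping whose offset was
 * already reserved by another process (chg == 0) reports reserves here;
 * every other VM_NORESERVE case returns false.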
126662306a36Sopenharmony_ci */ 126762306a36Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE && chg == 0) 126862306a36Sopenharmony_ci return true; 126962306a36Sopenharmony_ci else 127062306a36Sopenharmony_ci return false; 127162306a36Sopenharmony_ci } 127262306a36Sopenharmony_ci 127362306a36Sopenharmony_ci /* Shared mappings always use reserves */ 127462306a36Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE) { 127562306a36Sopenharmony_ci /* 127662306a36Sopenharmony_ci * We know VM_NORESERVE is not set. Therefore, there SHOULD 127762306a36Sopenharmony_ci * be a region map for all pages. The only situation where 127862306a36Sopenharmony_ci * there is no region map is if a hole was punched via 127962306a36Sopenharmony_ci * fallocate. In this case, there really are no reserves to 128062306a36Sopenharmony_ci * use. This situation is indicated if chg != 0. 128162306a36Sopenharmony_ci */ 128262306a36Sopenharmony_ci if (chg) 128362306a36Sopenharmony_ci return false; 128462306a36Sopenharmony_ci else 128562306a36Sopenharmony_ci return true; 128662306a36Sopenharmony_ci } 128762306a36Sopenharmony_ci 128862306a36Sopenharmony_ci /* 128962306a36Sopenharmony_ci * Only the process that called mmap() has reserves for 129062306a36Sopenharmony_ci * private mappings. 129162306a36Sopenharmony_ci */ 129262306a36Sopenharmony_ci if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 129362306a36Sopenharmony_ci /* 129462306a36Sopenharmony_ci * Like the shared case above, a hole punch or truncate 129562306a36Sopenharmony_ci * could have been performed on the private mapping. 129662306a36Sopenharmony_ci * Examine the value of chg to determine if reserves 129762306a36Sopenharmony_ci * actually exist or were previously consumed. 129862306a36Sopenharmony_ci * Very Subtle - The value of chg comes from a previous 129962306a36Sopenharmony_ci * call to vma_needs_reserves(). The reserve map for 130062306a36Sopenharmony_ci * private mappings has different (opposite) semantics 130162306a36Sopenharmony_ci * than that of shared mappings. vma_needs_reserves() 130262306a36Sopenharmony_ci * has already taken this difference in semantics into 130362306a36Sopenharmony_ci * account. Therefore, the meaning of chg is the same 130462306a36Sopenharmony_ci * as in the shared case above. Code could easily be 130562306a36Sopenharmony_ci * combined, but keeping it separate draws attention to 130662306a36Sopenharmony_ci * subtle differences. 
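 *
 * Either way the rule reduces to: chg == 0 means a reservation exists
 * and may be consumed; chg != 0 means no usable reservation remains
 * (it was never made, was already consumed, or was removed by a hole
 * punch or truncate).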
130762306a36Sopenharmony_ci */ 130862306a36Sopenharmony_ci if (chg) 130962306a36Sopenharmony_ci return false; 131062306a36Sopenharmony_ci else 131162306a36Sopenharmony_ci return true; 131262306a36Sopenharmony_ci } 131362306a36Sopenharmony_ci 131462306a36Sopenharmony_ci return false; 131562306a36Sopenharmony_ci} 131662306a36Sopenharmony_ci 131762306a36Sopenharmony_cistatic void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) 131862306a36Sopenharmony_ci{ 131962306a36Sopenharmony_ci int nid = folio_nid(folio); 132062306a36Sopenharmony_ci 132162306a36Sopenharmony_ci lockdep_assert_held(&hugetlb_lock); 132262306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 132362306a36Sopenharmony_ci 132462306a36Sopenharmony_ci list_move(&folio->lru, &h->hugepage_freelists[nid]); 132562306a36Sopenharmony_ci h->free_huge_pages++; 132662306a36Sopenharmony_ci h->free_huge_pages_node[nid]++; 132762306a36Sopenharmony_ci folio_set_hugetlb_freed(folio); 132862306a36Sopenharmony_ci} 132962306a36Sopenharmony_ci 133062306a36Sopenharmony_cistatic struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, 133162306a36Sopenharmony_ci int nid) 133262306a36Sopenharmony_ci{ 133362306a36Sopenharmony_ci struct folio *folio; 133462306a36Sopenharmony_ci bool pin = !!(current->flags & PF_MEMALLOC_PIN); 133562306a36Sopenharmony_ci 133662306a36Sopenharmony_ci lockdep_assert_held(&hugetlb_lock); 133762306a36Sopenharmony_ci list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) { 133862306a36Sopenharmony_ci if (pin && !folio_is_longterm_pinnable(folio)) 133962306a36Sopenharmony_ci continue; 134062306a36Sopenharmony_ci 134162306a36Sopenharmony_ci if (folio_test_hwpoison(folio)) 134262306a36Sopenharmony_ci continue; 134362306a36Sopenharmony_ci 134462306a36Sopenharmony_ci list_move(&folio->lru, &h->hugepage_activelist); 134562306a36Sopenharmony_ci folio_ref_unfreeze(folio, 1); 134662306a36Sopenharmony_ci folio_clear_hugetlb_freed(folio); 134762306a36Sopenharmony_ci h->free_huge_pages--; 134862306a36Sopenharmony_ci h->free_huge_pages_node[nid]--; 134962306a36Sopenharmony_ci return folio; 135062306a36Sopenharmony_ci } 135162306a36Sopenharmony_ci 135262306a36Sopenharmony_ci return NULL; 135362306a36Sopenharmony_ci} 135462306a36Sopenharmony_ci 135562306a36Sopenharmony_cistatic struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask, 135662306a36Sopenharmony_ci int nid, nodemask_t *nmask) 135762306a36Sopenharmony_ci{ 135862306a36Sopenharmony_ci unsigned int cpuset_mems_cookie; 135962306a36Sopenharmony_ci struct zonelist *zonelist; 136062306a36Sopenharmony_ci struct zone *zone; 136162306a36Sopenharmony_ci struct zoneref *z; 136262306a36Sopenharmony_ci int node = NUMA_NO_NODE; 136362306a36Sopenharmony_ci 136462306a36Sopenharmony_ci zonelist = node_zonelist(nid, gfp_mask); 136562306a36Sopenharmony_ci 136662306a36Sopenharmony_ciretry_cpuset: 136762306a36Sopenharmony_ci cpuset_mems_cookie = read_mems_allowed_begin(); 136862306a36Sopenharmony_ci for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { 136962306a36Sopenharmony_ci struct folio *folio; 137062306a36Sopenharmony_ci 137162306a36Sopenharmony_ci if (!cpuset_zone_allowed(zone, gfp_mask)) 137262306a36Sopenharmony_ci continue; 137362306a36Sopenharmony_ci /* 137462306a36Sopenharmony_ci * no need to ask again on the same node. 
Pool is node rather than 137562306a36Sopenharmony_ci * zone aware 137662306a36Sopenharmony_ci */ 137762306a36Sopenharmony_ci if (zone_to_nid(zone) == node) 137862306a36Sopenharmony_ci continue; 137962306a36Sopenharmony_ci node = zone_to_nid(zone); 138062306a36Sopenharmony_ci 138162306a36Sopenharmony_ci folio = dequeue_hugetlb_folio_node_exact(h, node); 138262306a36Sopenharmony_ci if (folio) 138362306a36Sopenharmony_ci return folio; 138462306a36Sopenharmony_ci } 138562306a36Sopenharmony_ci if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) 138662306a36Sopenharmony_ci goto retry_cpuset; 138762306a36Sopenharmony_ci 138862306a36Sopenharmony_ci return NULL; 138962306a36Sopenharmony_ci} 139062306a36Sopenharmony_ci 139162306a36Sopenharmony_cistatic unsigned long available_huge_pages(struct hstate *h) 139262306a36Sopenharmony_ci{ 139362306a36Sopenharmony_ci return h->free_huge_pages - h->resv_huge_pages; 139462306a36Sopenharmony_ci} 139562306a36Sopenharmony_ci 139662306a36Sopenharmony_cistatic struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, 139762306a36Sopenharmony_ci struct vm_area_struct *vma, 139862306a36Sopenharmony_ci unsigned long address, int avoid_reserve, 139962306a36Sopenharmony_ci long chg) 140062306a36Sopenharmony_ci{ 140162306a36Sopenharmony_ci struct folio *folio = NULL; 140262306a36Sopenharmony_ci struct mempolicy *mpol; 140362306a36Sopenharmony_ci gfp_t gfp_mask; 140462306a36Sopenharmony_ci nodemask_t *nodemask; 140562306a36Sopenharmony_ci int nid; 140662306a36Sopenharmony_ci 140762306a36Sopenharmony_ci /* 140862306a36Sopenharmony_ci * A child process with MAP_PRIVATE mappings created by their parent 140962306a36Sopenharmony_ci * have no page reserves. This check ensures that reservations are 141062306a36Sopenharmony_ci * not "stolen". 
The child may still get SIGKILLed 141162306a36Sopenharmony_ci */ 141262306a36Sopenharmony_ci if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) 141362306a36Sopenharmony_ci goto err; 141462306a36Sopenharmony_ci 141562306a36Sopenharmony_ci /* If reserves cannot be used, ensure enough pages are in the pool */ 141662306a36Sopenharmony_ci if (avoid_reserve && !available_huge_pages(h)) 141762306a36Sopenharmony_ci goto err; 141862306a36Sopenharmony_ci 141962306a36Sopenharmony_ci gfp_mask = htlb_alloc_mask(h); 142062306a36Sopenharmony_ci nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); 142162306a36Sopenharmony_ci 142262306a36Sopenharmony_ci if (mpol_is_preferred_many(mpol)) { 142362306a36Sopenharmony_ci folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, 142462306a36Sopenharmony_ci nid, nodemask); 142562306a36Sopenharmony_ci 142662306a36Sopenharmony_ci /* Fallback to all nodes if page==NULL */ 142762306a36Sopenharmony_ci nodemask = NULL; 142862306a36Sopenharmony_ci } 142962306a36Sopenharmony_ci 143062306a36Sopenharmony_ci if (!folio) 143162306a36Sopenharmony_ci folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, 143262306a36Sopenharmony_ci nid, nodemask); 143362306a36Sopenharmony_ci 143462306a36Sopenharmony_ci if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) { 143562306a36Sopenharmony_ci folio_set_hugetlb_restore_reserve(folio); 143662306a36Sopenharmony_ci h->resv_huge_pages--; 143762306a36Sopenharmony_ci } 143862306a36Sopenharmony_ci 143962306a36Sopenharmony_ci mpol_cond_put(mpol); 144062306a36Sopenharmony_ci return folio; 144162306a36Sopenharmony_ci 144262306a36Sopenharmony_cierr: 144362306a36Sopenharmony_ci return NULL; 144462306a36Sopenharmony_ci} 144562306a36Sopenharmony_ci 144662306a36Sopenharmony_ci/* 144762306a36Sopenharmony_ci * common helper functions for hstate_next_node_to_{alloc|free}. 144862306a36Sopenharmony_ci * We may have allocated or freed a huge page based on a different 144962306a36Sopenharmony_ci * nodes_allowed previously, so h->next_node_to_{alloc|free} might 145062306a36Sopenharmony_ci * be outside of *nodes_allowed. Ensure that we use an allowed 145162306a36Sopenharmony_ci * node for alloc or free. 145262306a36Sopenharmony_ci */ 145362306a36Sopenharmony_cistatic int next_node_allowed(int nid, nodemask_t *nodes_allowed) 145462306a36Sopenharmony_ci{ 145562306a36Sopenharmony_ci nid = next_node_in(nid, *nodes_allowed); 145662306a36Sopenharmony_ci VM_BUG_ON(nid >= MAX_NUMNODES); 145762306a36Sopenharmony_ci 145862306a36Sopenharmony_ci return nid; 145962306a36Sopenharmony_ci} 146062306a36Sopenharmony_ci 146162306a36Sopenharmony_cistatic int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) 146262306a36Sopenharmony_ci{ 146362306a36Sopenharmony_ci if (!node_isset(nid, *nodes_allowed)) 146462306a36Sopenharmony_ci nid = next_node_allowed(nid, nodes_allowed); 146562306a36Sopenharmony_ci return nid; 146662306a36Sopenharmony_ci} 146762306a36Sopenharmony_ci 146862306a36Sopenharmony_ci/* 146962306a36Sopenharmony_ci * returns the previously saved node ["this node"] from which to 147062306a36Sopenharmony_ci * allocate a persistent huge page for the pool and advance the 147162306a36Sopenharmony_ci * next node from which to allocate, handling wrap at end of node 147262306a36Sopenharmony_ci * mask. 
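 *
 * Example (hypothetical state): with *nodes_allowed = {0,2} and
 * h->next_nid_to_alloc == 1, this returns 2 and advances
 * h->next_nid_to_alloc to 0 for the following call.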
147362306a36Sopenharmony_ci */ 147462306a36Sopenharmony_cistatic int hstate_next_node_to_alloc(struct hstate *h, 147562306a36Sopenharmony_ci nodemask_t *nodes_allowed) 147662306a36Sopenharmony_ci{ 147762306a36Sopenharmony_ci int nid; 147862306a36Sopenharmony_ci 147962306a36Sopenharmony_ci VM_BUG_ON(!nodes_allowed); 148062306a36Sopenharmony_ci 148162306a36Sopenharmony_ci nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); 148262306a36Sopenharmony_ci h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); 148362306a36Sopenharmony_ci 148462306a36Sopenharmony_ci return nid; 148562306a36Sopenharmony_ci} 148662306a36Sopenharmony_ci 148762306a36Sopenharmony_ci/* 148862306a36Sopenharmony_ci * helper for remove_pool_huge_page() - return the previously saved 148962306a36Sopenharmony_ci * node ["this node"] from which to free a huge page. Advance the 149062306a36Sopenharmony_ci * next node id whether or not we find a free huge page to free so 149162306a36Sopenharmony_ci * that the next attempt to free addresses the next node. 149262306a36Sopenharmony_ci */ 149362306a36Sopenharmony_cistatic int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) 149462306a36Sopenharmony_ci{ 149562306a36Sopenharmony_ci int nid; 149662306a36Sopenharmony_ci 149762306a36Sopenharmony_ci VM_BUG_ON(!nodes_allowed); 149862306a36Sopenharmony_ci 149962306a36Sopenharmony_ci nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); 150062306a36Sopenharmony_ci h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); 150162306a36Sopenharmony_ci 150262306a36Sopenharmony_ci return nid; 150362306a36Sopenharmony_ci} 150462306a36Sopenharmony_ci 150562306a36Sopenharmony_ci#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ 150662306a36Sopenharmony_ci for (nr_nodes = nodes_weight(*mask); \ 150762306a36Sopenharmony_ci nr_nodes > 0 && \ 150862306a36Sopenharmony_ci ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ 150962306a36Sopenharmony_ci nr_nodes--) 151062306a36Sopenharmony_ci 151162306a36Sopenharmony_ci#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ 151262306a36Sopenharmony_ci for (nr_nodes = nodes_weight(*mask); \ 151362306a36Sopenharmony_ci nr_nodes > 0 && \ 151462306a36Sopenharmony_ci ((node = hstate_next_node_to_free(hs, mask)) || 1); \ 151562306a36Sopenharmony_ci nr_nodes--) 151662306a36Sopenharmony_ci 151762306a36Sopenharmony_ci/* used to demote non-gigantic_huge pages as well */ 151862306a36Sopenharmony_cistatic void __destroy_compound_gigantic_folio(struct folio *folio, 151962306a36Sopenharmony_ci unsigned int order, bool demote) 152062306a36Sopenharmony_ci{ 152162306a36Sopenharmony_ci int i; 152262306a36Sopenharmony_ci int nr_pages = 1 << order; 152362306a36Sopenharmony_ci struct page *p; 152462306a36Sopenharmony_ci 152562306a36Sopenharmony_ci atomic_set(&folio->_entire_mapcount, 0); 152662306a36Sopenharmony_ci atomic_set(&folio->_nr_pages_mapped, 0); 152762306a36Sopenharmony_ci atomic_set(&folio->_pincount, 0); 152862306a36Sopenharmony_ci 152962306a36Sopenharmony_ci for (i = 1; i < nr_pages; i++) { 153062306a36Sopenharmony_ci p = folio_page(folio, i); 153162306a36Sopenharmony_ci p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE; 153262306a36Sopenharmony_ci p->mapping = NULL; 153362306a36Sopenharmony_ci clear_compound_head(p); 153462306a36Sopenharmony_ci if (!demote) 153562306a36Sopenharmony_ci set_page_refcounted(p); 153662306a36Sopenharmony_ci } 153762306a36Sopenharmony_ci 153862306a36Sopenharmony_ci __folio_clear_head(folio); 153962306a36Sopenharmony_ci} 
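/*
 * Illustrative use of the for_each_node_mask_to_alloc() helper above.
 * This is only a sketch modelled on alloc_pool_huge_page() further below,
 * not additional kernel code; it assumes h, gfp_mask and nodes_allowed are
 * in scope as they are there. Each pass visits every allowed node at most
 * once, starting from the saved next_nid_to_alloc, until one allocation
 * succeeds:
 *
 *	int nr_nodes, node;
 *	struct folio *folio = NULL;
 *
 *	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
 *		// NULL: no per-node "noretry" tracking in this sketch
 *		folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node,
 *						  nodes_allowed, NULL);
 *		if (folio)
 *			break;
 *	}
 */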
154062306a36Sopenharmony_ci 154162306a36Sopenharmony_cistatic void destroy_compound_hugetlb_folio_for_demote(struct folio *folio, 154262306a36Sopenharmony_ci unsigned int order) 154362306a36Sopenharmony_ci{ 154462306a36Sopenharmony_ci __destroy_compound_gigantic_folio(folio, order, true); 154562306a36Sopenharmony_ci} 154662306a36Sopenharmony_ci 154762306a36Sopenharmony_ci#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE 154862306a36Sopenharmony_cistatic void destroy_compound_gigantic_folio(struct folio *folio, 154962306a36Sopenharmony_ci unsigned int order) 155062306a36Sopenharmony_ci{ 155162306a36Sopenharmony_ci __destroy_compound_gigantic_folio(folio, order, false); 155262306a36Sopenharmony_ci} 155362306a36Sopenharmony_ci 155462306a36Sopenharmony_cistatic void free_gigantic_folio(struct folio *folio, unsigned int order) 155562306a36Sopenharmony_ci{ 155662306a36Sopenharmony_ci /* 155762306a36Sopenharmony_ci * If the page isn't allocated using the cma allocator, 155862306a36Sopenharmony_ci * cma_release() returns false. 155962306a36Sopenharmony_ci */ 156062306a36Sopenharmony_ci#ifdef CONFIG_CMA 156162306a36Sopenharmony_ci int nid = folio_nid(folio); 156262306a36Sopenharmony_ci 156362306a36Sopenharmony_ci if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order)) 156462306a36Sopenharmony_ci return; 156562306a36Sopenharmony_ci#endif 156662306a36Sopenharmony_ci 156762306a36Sopenharmony_ci free_contig_range(folio_pfn(folio), 1 << order); 156862306a36Sopenharmony_ci} 156962306a36Sopenharmony_ci 157062306a36Sopenharmony_ci#ifdef CONFIG_CONTIG_ALLOC 157162306a36Sopenharmony_cistatic struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, 157262306a36Sopenharmony_ci int nid, nodemask_t *nodemask) 157362306a36Sopenharmony_ci{ 157462306a36Sopenharmony_ci struct page *page; 157562306a36Sopenharmony_ci unsigned long nr_pages = pages_per_huge_page(h); 157662306a36Sopenharmony_ci if (nid == NUMA_NO_NODE) 157762306a36Sopenharmony_ci nid = numa_mem_id(); 157862306a36Sopenharmony_ci 157962306a36Sopenharmony_ci#ifdef CONFIG_CMA 158062306a36Sopenharmony_ci { 158162306a36Sopenharmony_ci int node; 158262306a36Sopenharmony_ci 158362306a36Sopenharmony_ci if (hugetlb_cma[nid]) { 158462306a36Sopenharmony_ci page = cma_alloc(hugetlb_cma[nid], nr_pages, 158562306a36Sopenharmony_ci huge_page_order(h), true); 158662306a36Sopenharmony_ci if (page) 158762306a36Sopenharmony_ci return page_folio(page); 158862306a36Sopenharmony_ci } 158962306a36Sopenharmony_ci 159062306a36Sopenharmony_ci if (!(gfp_mask & __GFP_THISNODE)) { 159162306a36Sopenharmony_ci for_each_node_mask(node, *nodemask) { 159262306a36Sopenharmony_ci if (node == nid || !hugetlb_cma[node]) 159362306a36Sopenharmony_ci continue; 159462306a36Sopenharmony_ci 159562306a36Sopenharmony_ci page = cma_alloc(hugetlb_cma[node], nr_pages, 159662306a36Sopenharmony_ci huge_page_order(h), true); 159762306a36Sopenharmony_ci if (page) 159862306a36Sopenharmony_ci return page_folio(page); 159962306a36Sopenharmony_ci } 160062306a36Sopenharmony_ci } 160162306a36Sopenharmony_ci } 160262306a36Sopenharmony_ci#endif 160362306a36Sopenharmony_ci 160462306a36Sopenharmony_ci page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); 160562306a36Sopenharmony_ci return page ? 
page_folio(page) : NULL; 160662306a36Sopenharmony_ci} 160762306a36Sopenharmony_ci 160862306a36Sopenharmony_ci#else /* !CONFIG_CONTIG_ALLOC */ 160962306a36Sopenharmony_cistatic struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, 161062306a36Sopenharmony_ci int nid, nodemask_t *nodemask) 161162306a36Sopenharmony_ci{ 161262306a36Sopenharmony_ci return NULL; 161362306a36Sopenharmony_ci} 161462306a36Sopenharmony_ci#endif /* CONFIG_CONTIG_ALLOC */ 161562306a36Sopenharmony_ci 161662306a36Sopenharmony_ci#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ 161762306a36Sopenharmony_cistatic struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, 161862306a36Sopenharmony_ci int nid, nodemask_t *nodemask) 161962306a36Sopenharmony_ci{ 162062306a36Sopenharmony_ci return NULL; 162162306a36Sopenharmony_ci} 162262306a36Sopenharmony_cistatic inline void free_gigantic_folio(struct folio *folio, 162362306a36Sopenharmony_ci unsigned int order) { } 162462306a36Sopenharmony_cistatic inline void destroy_compound_gigantic_folio(struct folio *folio, 162562306a36Sopenharmony_ci unsigned int order) { } 162662306a36Sopenharmony_ci#endif 162762306a36Sopenharmony_ci 162862306a36Sopenharmony_cistatic inline void __clear_hugetlb_destructor(struct hstate *h, 162962306a36Sopenharmony_ci struct folio *folio) 163062306a36Sopenharmony_ci{ 163162306a36Sopenharmony_ci lockdep_assert_held(&hugetlb_lock); 163262306a36Sopenharmony_ci 163362306a36Sopenharmony_ci folio_clear_hugetlb(folio); 163462306a36Sopenharmony_ci} 163562306a36Sopenharmony_ci 163662306a36Sopenharmony_ci/* 163762306a36Sopenharmony_ci * Remove hugetlb folio from lists. 163862306a36Sopenharmony_ci * If vmemmap exists for the folio, update dtor so that the folio appears 163962306a36Sopenharmony_ci * as just a compound page. Otherwise, wait until after allocating vmemmap 164062306a36Sopenharmony_ci * to update dtor. 164162306a36Sopenharmony_ci * 164262306a36Sopenharmony_ci * A reference is held on the folio, except in the case of demote. 164362306a36Sopenharmony_ci * 164462306a36Sopenharmony_ci * Must be called with hugetlb lock held. 164562306a36Sopenharmony_ci */ 164662306a36Sopenharmony_cistatic void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, 164762306a36Sopenharmony_ci bool adjust_surplus, 164862306a36Sopenharmony_ci bool demote) 164962306a36Sopenharmony_ci{ 165062306a36Sopenharmony_ci int nid = folio_nid(folio); 165162306a36Sopenharmony_ci 165262306a36Sopenharmony_ci VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio); 165362306a36Sopenharmony_ci VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio); 165462306a36Sopenharmony_ci 165562306a36Sopenharmony_ci lockdep_assert_held(&hugetlb_lock); 165662306a36Sopenharmony_ci if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 165762306a36Sopenharmony_ci return; 165862306a36Sopenharmony_ci 165962306a36Sopenharmony_ci list_del(&folio->lru); 166062306a36Sopenharmony_ci 166162306a36Sopenharmony_ci if (folio_test_hugetlb_freed(folio)) { 166262306a36Sopenharmony_ci h->free_huge_pages--; 166362306a36Sopenharmony_ci h->free_huge_pages_node[nid]--; 166462306a36Sopenharmony_ci } 166562306a36Sopenharmony_ci if (adjust_surplus) { 166662306a36Sopenharmony_ci h->surplus_huge_pages--; 166762306a36Sopenharmony_ci h->surplus_huge_pages_node[nid]--; 166862306a36Sopenharmony_ci } 166962306a36Sopenharmony_ci 167062306a36Sopenharmony_ci /* 167162306a36Sopenharmony_ci * We can only clear the hugetlb destructor after allocating vmemmap 167262306a36Sopenharmony_ci * pages. 
Otherwise, someone (memory error handling) may try to write 167362306a36Sopenharmony_ci * to tail struct pages. 167462306a36Sopenharmony_ci */ 167562306a36Sopenharmony_ci if (!folio_test_hugetlb_vmemmap_optimized(folio)) 167662306a36Sopenharmony_ci __clear_hugetlb_destructor(h, folio); 167762306a36Sopenharmony_ci 167862306a36Sopenharmony_ci /* 167962306a36Sopenharmony_ci * In the case of demote we do not ref count the page as it will soon 168062306a36Sopenharmony_ci * be turned into a page of smaller size. 168162306a36Sopenharmony_ci */ 168262306a36Sopenharmony_ci if (!demote) 168362306a36Sopenharmony_ci folio_ref_unfreeze(folio, 1); 168462306a36Sopenharmony_ci 168562306a36Sopenharmony_ci h->nr_huge_pages--; 168662306a36Sopenharmony_ci h->nr_huge_pages_node[nid]--; 168762306a36Sopenharmony_ci} 168862306a36Sopenharmony_ci 168962306a36Sopenharmony_cistatic void remove_hugetlb_folio(struct hstate *h, struct folio *folio, 169062306a36Sopenharmony_ci bool adjust_surplus) 169162306a36Sopenharmony_ci{ 169262306a36Sopenharmony_ci __remove_hugetlb_folio(h, folio, adjust_surplus, false); 169362306a36Sopenharmony_ci} 169462306a36Sopenharmony_ci 169562306a36Sopenharmony_cistatic void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio, 169662306a36Sopenharmony_ci bool adjust_surplus) 169762306a36Sopenharmony_ci{ 169862306a36Sopenharmony_ci __remove_hugetlb_folio(h, folio, adjust_surplus, true); 169962306a36Sopenharmony_ci} 170062306a36Sopenharmony_ci 170162306a36Sopenharmony_cistatic void add_hugetlb_folio(struct hstate *h, struct folio *folio, 170262306a36Sopenharmony_ci bool adjust_surplus) 170362306a36Sopenharmony_ci{ 170462306a36Sopenharmony_ci int zeroed; 170562306a36Sopenharmony_ci int nid = folio_nid(folio); 170662306a36Sopenharmony_ci 170762306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio); 170862306a36Sopenharmony_ci 170962306a36Sopenharmony_ci lockdep_assert_held(&hugetlb_lock); 171062306a36Sopenharmony_ci 171162306a36Sopenharmony_ci INIT_LIST_HEAD(&folio->lru); 171262306a36Sopenharmony_ci h->nr_huge_pages++; 171362306a36Sopenharmony_ci h->nr_huge_pages_node[nid]++; 171462306a36Sopenharmony_ci 171562306a36Sopenharmony_ci if (adjust_surplus) { 171662306a36Sopenharmony_ci h->surplus_huge_pages++; 171762306a36Sopenharmony_ci h->surplus_huge_pages_node[nid]++; 171862306a36Sopenharmony_ci } 171962306a36Sopenharmony_ci 172062306a36Sopenharmony_ci folio_set_hugetlb(folio); 172162306a36Sopenharmony_ci folio_change_private(folio, NULL); 172262306a36Sopenharmony_ci /* 172362306a36Sopenharmony_ci * We have to set hugetlb_vmemmap_optimized again as above 172462306a36Sopenharmony_ci * folio_change_private(folio, NULL) cleared it. 172562306a36Sopenharmony_ci */ 172662306a36Sopenharmony_ci folio_set_hugetlb_vmemmap_optimized(folio); 172762306a36Sopenharmony_ci 172862306a36Sopenharmony_ci /* 172962306a36Sopenharmony_ci * This folio is about to be managed by the hugetlb allocator and 173062306a36Sopenharmony_ci * should have no users. Drop our reference, and check for others 173162306a36Sopenharmony_ci * just in case. 173262306a36Sopenharmony_ci */ 173362306a36Sopenharmony_ci zeroed = folio_put_testzero(folio); 173462306a36Sopenharmony_ci if (unlikely(!zeroed)) 173562306a36Sopenharmony_ci /* 173662306a36Sopenharmony_ci * It is VERY unlikely someone else has taken a ref 173762306a36Sopenharmony_ci * on the folio.
In this case, we simply return as 173862306a36Sopenharmony_ci * free_huge_folio() will be called when this other ref 173962306a36Sopenharmony_ci * is dropped. 174062306a36Sopenharmony_ci */ 174162306a36Sopenharmony_ci return; 174262306a36Sopenharmony_ci 174362306a36Sopenharmony_ci arch_clear_hugepage_flags(&folio->page); 174462306a36Sopenharmony_ci enqueue_hugetlb_folio(h, folio); 174562306a36Sopenharmony_ci} 174662306a36Sopenharmony_ci 174762306a36Sopenharmony_cistatic void __update_and_free_hugetlb_folio(struct hstate *h, 174862306a36Sopenharmony_ci struct folio *folio) 174962306a36Sopenharmony_ci{ 175062306a36Sopenharmony_ci bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio); 175162306a36Sopenharmony_ci 175262306a36Sopenharmony_ci if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 175362306a36Sopenharmony_ci return; 175462306a36Sopenharmony_ci 175562306a36Sopenharmony_ci /* 175662306a36Sopenharmony_ci * If we don't know which subpages are hwpoisoned, we can't free 175762306a36Sopenharmony_ci * the hugepage, so it's leaked intentionally. 175862306a36Sopenharmony_ci */ 175962306a36Sopenharmony_ci if (folio_test_hugetlb_raw_hwp_unreliable(folio)) 176062306a36Sopenharmony_ci return; 176162306a36Sopenharmony_ci 176262306a36Sopenharmony_ci if (hugetlb_vmemmap_restore(h, &folio->page)) { 176362306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 176462306a36Sopenharmony_ci /* 176562306a36Sopenharmony_ci * If we cannot allocate vmemmap pages, just refuse to free the 176662306a36Sopenharmony_ci * page and put the page back on the hugetlb free list and treat 176762306a36Sopenharmony_ci * as a surplus page. 176862306a36Sopenharmony_ci */ 176962306a36Sopenharmony_ci add_hugetlb_folio(h, folio, true); 177062306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 177162306a36Sopenharmony_ci return; 177262306a36Sopenharmony_ci } 177362306a36Sopenharmony_ci 177462306a36Sopenharmony_ci /* 177562306a36Sopenharmony_ci * Move PageHWPoison flag from head page to the raw error pages, 177662306a36Sopenharmony_ci * which makes any healthy subpages reusable. 177762306a36Sopenharmony_ci */ 177862306a36Sopenharmony_ci if (unlikely(folio_test_hwpoison(folio))) 177962306a36Sopenharmony_ci folio_clear_hugetlb_hwpoison(folio); 178062306a36Sopenharmony_ci 178162306a36Sopenharmony_ci /* 178262306a36Sopenharmony_ci * If vmemmap pages were allocated above, then we need to clear the 178362306a36Sopenharmony_ci * hugetlb destructor under the hugetlb lock. 178462306a36Sopenharmony_ci */ 178562306a36Sopenharmony_ci if (clear_dtor) { 178662306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 178762306a36Sopenharmony_ci __clear_hugetlb_destructor(h, folio); 178862306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 178962306a36Sopenharmony_ci } 179062306a36Sopenharmony_ci 179162306a36Sopenharmony_ci /* 179262306a36Sopenharmony_ci * Non-gigantic pages demoted from CMA allocated gigantic pages 179362306a36Sopenharmony_ci * need to be given back to CMA in free_gigantic_folio. 
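 * That is why the branch below checks hugetlb_cma_folio() in addition
 * to hstate_is_gigantic().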
179462306a36Sopenharmony_ci */ 179562306a36Sopenharmony_ci if (hstate_is_gigantic(h) || 179662306a36Sopenharmony_ci hugetlb_cma_folio(folio, huge_page_order(h))) { 179762306a36Sopenharmony_ci destroy_compound_gigantic_folio(folio, huge_page_order(h)); 179862306a36Sopenharmony_ci free_gigantic_folio(folio, huge_page_order(h)); 179962306a36Sopenharmony_ci } else { 180062306a36Sopenharmony_ci __free_pages(&folio->page, huge_page_order(h)); 180162306a36Sopenharmony_ci } 180262306a36Sopenharmony_ci} 180362306a36Sopenharmony_ci 180462306a36Sopenharmony_ci/* 180562306a36Sopenharmony_ci * As update_and_free_hugetlb_folio() can be called under any context, so we cannot 180662306a36Sopenharmony_ci * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the 180762306a36Sopenharmony_ci * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate 180862306a36Sopenharmony_ci * the vmemmap pages. 180962306a36Sopenharmony_ci * 181062306a36Sopenharmony_ci * free_hpage_workfn() locklessly retrieves the linked list of pages to be 181162306a36Sopenharmony_ci * freed and frees them one-by-one. As the page->mapping pointer is going 181262306a36Sopenharmony_ci * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node 181362306a36Sopenharmony_ci * structure of a lockless linked list of huge pages to be freed. 181462306a36Sopenharmony_ci */ 181562306a36Sopenharmony_cistatic LLIST_HEAD(hpage_freelist); 181662306a36Sopenharmony_ci 181762306a36Sopenharmony_cistatic void free_hpage_workfn(struct work_struct *work) 181862306a36Sopenharmony_ci{ 181962306a36Sopenharmony_ci struct llist_node *node; 182062306a36Sopenharmony_ci 182162306a36Sopenharmony_ci node = llist_del_all(&hpage_freelist); 182262306a36Sopenharmony_ci 182362306a36Sopenharmony_ci while (node) { 182462306a36Sopenharmony_ci struct page *page; 182562306a36Sopenharmony_ci struct hstate *h; 182662306a36Sopenharmony_ci 182762306a36Sopenharmony_ci page = container_of((struct address_space **)node, 182862306a36Sopenharmony_ci struct page, mapping); 182962306a36Sopenharmony_ci node = node->next; 183062306a36Sopenharmony_ci page->mapping = NULL; 183162306a36Sopenharmony_ci /* 183262306a36Sopenharmony_ci * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in 183362306a36Sopenharmony_ci * folio_hstate() is going to trigger because a previous call to 183462306a36Sopenharmony_ci * remove_hugetlb_folio() will clear the hugetlb bit, so do 183562306a36Sopenharmony_ci * not use folio_hstate() directly. 
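 * size_to_hstate(page_size(page)) below recovers the hstate from the
 * page size alone, which does not depend on that flag.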
183662306a36Sopenharmony_ci */ 183762306a36Sopenharmony_ci h = size_to_hstate(page_size(page)); 183862306a36Sopenharmony_ci 183962306a36Sopenharmony_ci __update_and_free_hugetlb_folio(h, page_folio(page)); 184062306a36Sopenharmony_ci 184162306a36Sopenharmony_ci cond_resched(); 184262306a36Sopenharmony_ci } 184362306a36Sopenharmony_ci} 184462306a36Sopenharmony_cistatic DECLARE_WORK(free_hpage_work, free_hpage_workfn); 184562306a36Sopenharmony_ci 184662306a36Sopenharmony_cistatic inline void flush_free_hpage_work(struct hstate *h) 184762306a36Sopenharmony_ci{ 184862306a36Sopenharmony_ci if (hugetlb_vmemmap_optimizable(h)) 184962306a36Sopenharmony_ci flush_work(&free_hpage_work); 185062306a36Sopenharmony_ci} 185162306a36Sopenharmony_ci 185262306a36Sopenharmony_cistatic void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, 185362306a36Sopenharmony_ci bool atomic) 185462306a36Sopenharmony_ci{ 185562306a36Sopenharmony_ci if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { 185662306a36Sopenharmony_ci __update_and_free_hugetlb_folio(h, folio); 185762306a36Sopenharmony_ci return; 185862306a36Sopenharmony_ci } 185962306a36Sopenharmony_ci 186062306a36Sopenharmony_ci /* 186162306a36Sopenharmony_ci * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages. 186262306a36Sopenharmony_ci * 186362306a36Sopenharmony_ci * Only call schedule_work() if hpage_freelist is previously 186462306a36Sopenharmony_ci * empty. Otherwise, schedule_work() had been called but the workfn 186562306a36Sopenharmony_ci * hasn't retrieved the list yet. 186662306a36Sopenharmony_ci */ 186762306a36Sopenharmony_ci if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist)) 186862306a36Sopenharmony_ci schedule_work(&free_hpage_work); 186962306a36Sopenharmony_ci} 187062306a36Sopenharmony_ci 187162306a36Sopenharmony_cistatic void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) 187262306a36Sopenharmony_ci{ 187362306a36Sopenharmony_ci struct page *page, *t_page; 187462306a36Sopenharmony_ci struct folio *folio; 187562306a36Sopenharmony_ci 187662306a36Sopenharmony_ci list_for_each_entry_safe(page, t_page, list, lru) { 187762306a36Sopenharmony_ci folio = page_folio(page); 187862306a36Sopenharmony_ci update_and_free_hugetlb_folio(h, folio, false); 187962306a36Sopenharmony_ci cond_resched(); 188062306a36Sopenharmony_ci } 188162306a36Sopenharmony_ci} 188262306a36Sopenharmony_ci 188362306a36Sopenharmony_cistruct hstate *size_to_hstate(unsigned long size) 188462306a36Sopenharmony_ci{ 188562306a36Sopenharmony_ci struct hstate *h; 188662306a36Sopenharmony_ci 188762306a36Sopenharmony_ci for_each_hstate(h) { 188862306a36Sopenharmony_ci if (huge_page_size(h) == size) 188962306a36Sopenharmony_ci return h; 189062306a36Sopenharmony_ci } 189162306a36Sopenharmony_ci return NULL; 189262306a36Sopenharmony_ci} 189362306a36Sopenharmony_ci 189462306a36Sopenharmony_civoid free_huge_folio(struct folio *folio) 189562306a36Sopenharmony_ci{ 189662306a36Sopenharmony_ci /* 189762306a36Sopenharmony_ci * Can't pass hstate in here because it is called from the 189862306a36Sopenharmony_ci * compound page destructor. 
189962306a36Sopenharmony_ci */ 190062306a36Sopenharmony_ci struct hstate *h = folio_hstate(folio); 190162306a36Sopenharmony_ci int nid = folio_nid(folio); 190262306a36Sopenharmony_ci struct hugepage_subpool *spool = hugetlb_folio_subpool(folio); 190362306a36Sopenharmony_ci bool restore_reserve; 190462306a36Sopenharmony_ci unsigned long flags; 190562306a36Sopenharmony_ci 190662306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 190762306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_mapcount(folio), folio); 190862306a36Sopenharmony_ci 190962306a36Sopenharmony_ci hugetlb_set_folio_subpool(folio, NULL); 191062306a36Sopenharmony_ci if (folio_test_anon(folio)) 191162306a36Sopenharmony_ci __ClearPageAnonExclusive(&folio->page); 191262306a36Sopenharmony_ci folio->mapping = NULL; 191362306a36Sopenharmony_ci restore_reserve = folio_test_hugetlb_restore_reserve(folio); 191462306a36Sopenharmony_ci folio_clear_hugetlb_restore_reserve(folio); 191562306a36Sopenharmony_ci 191662306a36Sopenharmony_ci /* 191762306a36Sopenharmony_ci * If HPageRestoreReserve was set on page, page allocation consumed a 191862306a36Sopenharmony_ci * reservation. If the page was associated with a subpool, there 191962306a36Sopenharmony_ci * would have been a page reserved in the subpool before allocation 192062306a36Sopenharmony_ci * via hugepage_subpool_get_pages(). Since we are 'restoring' the 192162306a36Sopenharmony_ci * reservation, do not call hugepage_subpool_put_pages() as this will 192262306a36Sopenharmony_ci * remove the reserved page from the subpool. 192362306a36Sopenharmony_ci */ 192462306a36Sopenharmony_ci if (!restore_reserve) { 192562306a36Sopenharmony_ci /* 192662306a36Sopenharmony_ci * A return code of zero implies that the subpool will be 192762306a36Sopenharmony_ci * under its minimum size if the reservation is not restored 192862306a36Sopenharmony_ci * after page is free. Therefore, force restore_reserve 192962306a36Sopenharmony_ci * operation. 
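 *
 * Example (hypothetical numbers): a subpool created with min_hpages == 2
 * whose reserve has already been consumed keeps this freed page for its
 * own minimum; hugepage_subpool_put_pages(spool, 1) then returns 0 and
 * restore_reserve is forced on, so h->resv_huge_pages is bumped below.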
193062306a36Sopenharmony_ci */ 193162306a36Sopenharmony_ci if (hugepage_subpool_put_pages(spool, 1) == 0) 193262306a36Sopenharmony_ci restore_reserve = true; 193362306a36Sopenharmony_ci } 193462306a36Sopenharmony_ci 193562306a36Sopenharmony_ci spin_lock_irqsave(&hugetlb_lock, flags); 193662306a36Sopenharmony_ci folio_clear_hugetlb_migratable(folio); 193762306a36Sopenharmony_ci hugetlb_cgroup_uncharge_folio(hstate_index(h), 193862306a36Sopenharmony_ci pages_per_huge_page(h), folio); 193962306a36Sopenharmony_ci hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), 194062306a36Sopenharmony_ci pages_per_huge_page(h), folio); 194162306a36Sopenharmony_ci if (restore_reserve) 194262306a36Sopenharmony_ci h->resv_huge_pages++; 194362306a36Sopenharmony_ci 194462306a36Sopenharmony_ci if (folio_test_hugetlb_temporary(folio)) { 194562306a36Sopenharmony_ci remove_hugetlb_folio(h, folio, false); 194662306a36Sopenharmony_ci spin_unlock_irqrestore(&hugetlb_lock, flags); 194762306a36Sopenharmony_ci update_and_free_hugetlb_folio(h, folio, true); 194862306a36Sopenharmony_ci } else if (h->surplus_huge_pages_node[nid]) { 194962306a36Sopenharmony_ci /* remove the page from active list */ 195062306a36Sopenharmony_ci remove_hugetlb_folio(h, folio, true); 195162306a36Sopenharmony_ci spin_unlock_irqrestore(&hugetlb_lock, flags); 195262306a36Sopenharmony_ci update_and_free_hugetlb_folio(h, folio, true); 195362306a36Sopenharmony_ci } else { 195462306a36Sopenharmony_ci arch_clear_hugepage_flags(&folio->page); 195562306a36Sopenharmony_ci enqueue_hugetlb_folio(h, folio); 195662306a36Sopenharmony_ci spin_unlock_irqrestore(&hugetlb_lock, flags); 195762306a36Sopenharmony_ci } 195862306a36Sopenharmony_ci} 195962306a36Sopenharmony_ci 196062306a36Sopenharmony_ci/* 196162306a36Sopenharmony_ci * Must be called with the hugetlb lock held 196262306a36Sopenharmony_ci */ 196362306a36Sopenharmony_cistatic void __prep_account_new_huge_page(struct hstate *h, int nid) 196462306a36Sopenharmony_ci{ 196562306a36Sopenharmony_ci lockdep_assert_held(&hugetlb_lock); 196662306a36Sopenharmony_ci h->nr_huge_pages++; 196762306a36Sopenharmony_ci h->nr_huge_pages_node[nid]++; 196862306a36Sopenharmony_ci} 196962306a36Sopenharmony_ci 197062306a36Sopenharmony_cistatic void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) 197162306a36Sopenharmony_ci{ 197262306a36Sopenharmony_ci hugetlb_vmemmap_optimize(h, &folio->page); 197362306a36Sopenharmony_ci INIT_LIST_HEAD(&folio->lru); 197462306a36Sopenharmony_ci folio_set_hugetlb(folio); 197562306a36Sopenharmony_ci hugetlb_set_folio_subpool(folio, NULL); 197662306a36Sopenharmony_ci set_hugetlb_cgroup(folio, NULL); 197762306a36Sopenharmony_ci set_hugetlb_cgroup_rsvd(folio, NULL); 197862306a36Sopenharmony_ci} 197962306a36Sopenharmony_ci 198062306a36Sopenharmony_cistatic void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) 198162306a36Sopenharmony_ci{ 198262306a36Sopenharmony_ci __prep_new_hugetlb_folio(h, folio); 198362306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 198462306a36Sopenharmony_ci __prep_account_new_huge_page(h, nid); 198562306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 198662306a36Sopenharmony_ci} 198762306a36Sopenharmony_ci 198862306a36Sopenharmony_cistatic bool __prep_compound_gigantic_folio(struct folio *folio, 198962306a36Sopenharmony_ci unsigned int order, bool demote) 199062306a36Sopenharmony_ci{ 199162306a36Sopenharmony_ci int i, j; 199262306a36Sopenharmony_ci int nr_pages = 1 << order; 199362306a36Sopenharmony_ci struct page *p; 
199462306a36Sopenharmony_ci 199562306a36Sopenharmony_ci __folio_clear_reserved(folio); 199662306a36Sopenharmony_ci for (i = 0; i < nr_pages; i++) { 199762306a36Sopenharmony_ci p = folio_page(folio, i); 199862306a36Sopenharmony_ci 199962306a36Sopenharmony_ci /* 200062306a36Sopenharmony_ci * For gigantic hugepages allocated through bootmem at 200162306a36Sopenharmony_ci * boot, it's safer to be consistent with the not-gigantic 200262306a36Sopenharmony_ci * hugepages and clear the PG_reserved bit from all tail pages 200362306a36Sopenharmony_ci * too. Otherwise drivers using get_user_pages() to access tail 200462306a36Sopenharmony_ci * pages may get the reference counting wrong if they see 200562306a36Sopenharmony_ci * PG_reserved set on a tail page (despite the head page not 200662306a36Sopenharmony_ci * having PG_reserved set). Enforcing this consistency between 200762306a36Sopenharmony_ci * head and tail pages allows drivers to optimize away a check 200862306a36Sopenharmony_ci * on the head page when they need to know if put_page() is needed 200962306a36Sopenharmony_ci * after get_user_pages(). 201062306a36Sopenharmony_ci */ 201162306a36Sopenharmony_ci if (i != 0) /* head page cleared above */ 201262306a36Sopenharmony_ci __ClearPageReserved(p); 201362306a36Sopenharmony_ci /* 201462306a36Sopenharmony_ci * Subtle and very unlikely 201562306a36Sopenharmony_ci * 201662306a36Sopenharmony_ci * Gigantic 'page allocators' such as memblock or cma will 201762306a36Sopenharmony_ci * return a set of pages with each page ref counted. We need 201862306a36Sopenharmony_ci * to turn this set of pages into a compound page with tail 201962306a36Sopenharmony_ci * page ref counts set to zero. Code such as speculative page 202062306a36Sopenharmony_ci * cache adding could take a ref on a 'to be' tail page. 202162306a36Sopenharmony_ci * We need to respect any increased ref count, and only set 202262306a36Sopenharmony_ci * the ref count to zero if count is currently 1. If count 202362306a36Sopenharmony_ci * is not 1, we return an error. An error return indicates 202462306a36Sopenharmony_ci * the set of pages cannot be converted to a gigantic page. 202562306a36Sopenharmony_ci * The caller who allocated the pages should then discard the 202662306a36Sopenharmony_ci * pages using the appropriate free interface. 202762306a36Sopenharmony_ci * 202862306a36Sopenharmony_ci * In the case of demote, the ref count will be zero.
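 *
 * For example, with 4 KiB base pages a 2 MB folio has 512 constituent
 * pages: each ref count is frozen from 1 to 0 here (or verified to be 0
 * when demoting), and one unexpected extra reference aborts the whole
 * conversion.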
202962306a36Sopenharmony_ci */ 203062306a36Sopenharmony_ci if (!demote) { 203162306a36Sopenharmony_ci if (!page_ref_freeze(p, 1)) { 203262306a36Sopenharmony_ci pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); 203362306a36Sopenharmony_ci goto out_error; 203462306a36Sopenharmony_ci } 203562306a36Sopenharmony_ci } else { 203662306a36Sopenharmony_ci VM_BUG_ON_PAGE(page_count(p), p); 203762306a36Sopenharmony_ci } 203862306a36Sopenharmony_ci if (i != 0) 203962306a36Sopenharmony_ci set_compound_head(p, &folio->page); 204062306a36Sopenharmony_ci } 204162306a36Sopenharmony_ci __folio_set_head(folio); 204262306a36Sopenharmony_ci /* we rely on prep_new_hugetlb_folio to set the destructor */ 204362306a36Sopenharmony_ci folio_set_order(folio, order); 204462306a36Sopenharmony_ci atomic_set(&folio->_entire_mapcount, -1); 204562306a36Sopenharmony_ci atomic_set(&folio->_nr_pages_mapped, 0); 204662306a36Sopenharmony_ci atomic_set(&folio->_pincount, 0); 204762306a36Sopenharmony_ci return true; 204862306a36Sopenharmony_ci 204962306a36Sopenharmony_ciout_error: 205062306a36Sopenharmony_ci /* undo page modifications made above */ 205162306a36Sopenharmony_ci for (j = 0; j < i; j++) { 205262306a36Sopenharmony_ci p = folio_page(folio, j); 205362306a36Sopenharmony_ci if (j != 0) 205462306a36Sopenharmony_ci clear_compound_head(p); 205562306a36Sopenharmony_ci set_page_refcounted(p); 205662306a36Sopenharmony_ci } 205762306a36Sopenharmony_ci /* need to clear PG_reserved on remaining tail pages */ 205862306a36Sopenharmony_ci for (; j < nr_pages; j++) { 205962306a36Sopenharmony_ci p = folio_page(folio, j); 206062306a36Sopenharmony_ci __ClearPageReserved(p); 206162306a36Sopenharmony_ci } 206262306a36Sopenharmony_ci return false; 206362306a36Sopenharmony_ci} 206462306a36Sopenharmony_ci 206562306a36Sopenharmony_cistatic bool prep_compound_gigantic_folio(struct folio *folio, 206662306a36Sopenharmony_ci unsigned int order) 206762306a36Sopenharmony_ci{ 206862306a36Sopenharmony_ci return __prep_compound_gigantic_folio(folio, order, false); 206962306a36Sopenharmony_ci} 207062306a36Sopenharmony_ci 207162306a36Sopenharmony_cistatic bool prep_compound_gigantic_folio_for_demote(struct folio *folio, 207262306a36Sopenharmony_ci unsigned int order) 207362306a36Sopenharmony_ci{ 207462306a36Sopenharmony_ci return __prep_compound_gigantic_folio(folio, order, true); 207562306a36Sopenharmony_ci} 207662306a36Sopenharmony_ci 207762306a36Sopenharmony_ci/* 207862306a36Sopenharmony_ci * PageHuge() only returns true for hugetlbfs pages, but not for normal or 207962306a36Sopenharmony_ci * transparent huge pages. See the PageTransHuge() documentation for more 208062306a36Sopenharmony_ci * details. 208162306a36Sopenharmony_ci */ 208262306a36Sopenharmony_ciint PageHuge(struct page *page) 208362306a36Sopenharmony_ci{ 208462306a36Sopenharmony_ci struct folio *folio; 208562306a36Sopenharmony_ci 208662306a36Sopenharmony_ci if (!PageCompound(page)) 208762306a36Sopenharmony_ci return 0; 208862306a36Sopenharmony_ci folio = page_folio(page); 208962306a36Sopenharmony_ci return folio_test_hugetlb(folio); 209062306a36Sopenharmony_ci} 209162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(PageHuge); 209262306a36Sopenharmony_ci 209362306a36Sopenharmony_ci/* 209462306a36Sopenharmony_ci * Find and lock address space (mapping) in write mode. 209562306a36Sopenharmony_ci * 209662306a36Sopenharmony_ci * Upon entry, the page is locked which means that page_mapping() is 209762306a36Sopenharmony_ci * stable. 
Due to locking order, we can only trylock_write. If we can 209862306a36Sopenharmony_ci * not get the lock, simply return NULL to caller. 209962306a36Sopenharmony_ci */ 210062306a36Sopenharmony_cistruct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) 210162306a36Sopenharmony_ci{ 210262306a36Sopenharmony_ci struct address_space *mapping = page_mapping(hpage); 210362306a36Sopenharmony_ci 210462306a36Sopenharmony_ci if (!mapping) 210562306a36Sopenharmony_ci return mapping; 210662306a36Sopenharmony_ci 210762306a36Sopenharmony_ci if (i_mmap_trylock_write(mapping)) 210862306a36Sopenharmony_ci return mapping; 210962306a36Sopenharmony_ci 211062306a36Sopenharmony_ci return NULL; 211162306a36Sopenharmony_ci} 211262306a36Sopenharmony_ci 211362306a36Sopenharmony_cipgoff_t hugetlb_basepage_index(struct page *page) 211462306a36Sopenharmony_ci{ 211562306a36Sopenharmony_ci struct page *page_head = compound_head(page); 211662306a36Sopenharmony_ci pgoff_t index = page_index(page_head); 211762306a36Sopenharmony_ci unsigned long compound_idx; 211862306a36Sopenharmony_ci 211962306a36Sopenharmony_ci if (compound_order(page_head) > MAX_ORDER) 212062306a36Sopenharmony_ci compound_idx = page_to_pfn(page) - page_to_pfn(page_head); 212162306a36Sopenharmony_ci else 212262306a36Sopenharmony_ci compound_idx = page - page_head; 212362306a36Sopenharmony_ci 212462306a36Sopenharmony_ci return (index << compound_order(page_head)) + compound_idx; 212562306a36Sopenharmony_ci} 212662306a36Sopenharmony_ci 212762306a36Sopenharmony_cistatic struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, 212862306a36Sopenharmony_ci gfp_t gfp_mask, int nid, nodemask_t *nmask, 212962306a36Sopenharmony_ci nodemask_t *node_alloc_noretry) 213062306a36Sopenharmony_ci{ 213162306a36Sopenharmony_ci int order = huge_page_order(h); 213262306a36Sopenharmony_ci struct page *page; 213362306a36Sopenharmony_ci bool alloc_try_hard = true; 213462306a36Sopenharmony_ci bool retry = true; 213562306a36Sopenharmony_ci 213662306a36Sopenharmony_ci /* 213762306a36Sopenharmony_ci * By default we always try hard to allocate the page with 213862306a36Sopenharmony_ci * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in 213962306a36Sopenharmony_ci * a loop (to adjust global huge page counts) and previous allocation 214062306a36Sopenharmony_ci * failed, do not continue to try hard on the same node. Use the 214162306a36Sopenharmony_ci * node_alloc_noretry bitmap to manage this state information. 214262306a36Sopenharmony_ci */ 214362306a36Sopenharmony_ci if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) 214462306a36Sopenharmony_ci alloc_try_hard = false; 214562306a36Sopenharmony_ci gfp_mask |= __GFP_COMP|__GFP_NOWARN; 214662306a36Sopenharmony_ci if (alloc_try_hard) 214762306a36Sopenharmony_ci gfp_mask |= __GFP_RETRY_MAYFAIL; 214862306a36Sopenharmony_ci if (nid == NUMA_NO_NODE) 214962306a36Sopenharmony_ci nid = numa_mem_id(); 215062306a36Sopenharmony_ciretry: 215162306a36Sopenharmony_ci page = __alloc_pages(gfp_mask, order, nid, nmask); 215262306a36Sopenharmony_ci 215362306a36Sopenharmony_ci /* Freeze head page */ 215462306a36Sopenharmony_ci if (page && !page_ref_freeze(page, 1)) { 215562306a36Sopenharmony_ci __free_pages(page, order); 215662306a36Sopenharmony_ci if (retry) { /* retry once */ 215762306a36Sopenharmony_ci retry = false; 215862306a36Sopenharmony_ci goto retry; 215962306a36Sopenharmony_ci } 216062306a36Sopenharmony_ci /* WOW! twice in a row. 
*/ 216162306a36Sopenharmony_ci pr_warn("HugeTLB head page unexpected inflated ref count\n"); 216262306a36Sopenharmony_ci page = NULL; 216362306a36Sopenharmony_ci } 216462306a36Sopenharmony_ci 216562306a36Sopenharmony_ci /* 216662306a36Sopenharmony_ci * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this 216762306a36Sopenharmony_ci * indicates an overall state change. Clear bit so that we resume 216862306a36Sopenharmony_ci * normal 'try hard' allocations. 216962306a36Sopenharmony_ci */ 217062306a36Sopenharmony_ci if (node_alloc_noretry && page && !alloc_try_hard) 217162306a36Sopenharmony_ci node_clear(nid, *node_alloc_noretry); 217262306a36Sopenharmony_ci 217362306a36Sopenharmony_ci /* 217462306a36Sopenharmony_ci * If we tried hard to get a page but failed, set bit so that 217562306a36Sopenharmony_ci * subsequent attempts will not try as hard until there is an 217662306a36Sopenharmony_ci * overall state change. 217762306a36Sopenharmony_ci */ 217862306a36Sopenharmony_ci if (node_alloc_noretry && !page && alloc_try_hard) 217962306a36Sopenharmony_ci node_set(nid, *node_alloc_noretry); 218062306a36Sopenharmony_ci 218162306a36Sopenharmony_ci if (!page) { 218262306a36Sopenharmony_ci __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 218362306a36Sopenharmony_ci return NULL; 218462306a36Sopenharmony_ci } 218562306a36Sopenharmony_ci 218662306a36Sopenharmony_ci __count_vm_event(HTLB_BUDDY_PGALLOC); 218762306a36Sopenharmony_ci return page_folio(page); 218862306a36Sopenharmony_ci} 218962306a36Sopenharmony_ci 219062306a36Sopenharmony_ci/* 219162306a36Sopenharmony_ci * Common helper to allocate a fresh hugetlb page. All specific allocators 219262306a36Sopenharmony_ci * should use this function to get new hugetlb pages 219362306a36Sopenharmony_ci * 219462306a36Sopenharmony_ci * Note that returned page is 'frozen': ref count of head page and all tail 219562306a36Sopenharmony_ci * pages is zero. 219662306a36Sopenharmony_ci */ 219762306a36Sopenharmony_cistatic struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, 219862306a36Sopenharmony_ci gfp_t gfp_mask, int nid, nodemask_t *nmask, 219962306a36Sopenharmony_ci nodemask_t *node_alloc_noretry) 220062306a36Sopenharmony_ci{ 220162306a36Sopenharmony_ci struct folio *folio; 220262306a36Sopenharmony_ci bool retry = false; 220362306a36Sopenharmony_ci 220462306a36Sopenharmony_ciretry: 220562306a36Sopenharmony_ci if (hstate_is_gigantic(h)) 220662306a36Sopenharmony_ci folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); 220762306a36Sopenharmony_ci else 220862306a36Sopenharmony_ci folio = alloc_buddy_hugetlb_folio(h, gfp_mask, 220962306a36Sopenharmony_ci nid, nmask, node_alloc_noretry); 221062306a36Sopenharmony_ci if (!folio) 221162306a36Sopenharmony_ci return NULL; 221262306a36Sopenharmony_ci if (hstate_is_gigantic(h)) { 221362306a36Sopenharmony_ci if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { 221462306a36Sopenharmony_ci /* 221562306a36Sopenharmony_ci * Rare failure to convert pages to compound page. 221662306a36Sopenharmony_ci * Free pages and try again - ONCE! 
221762306a36Sopenharmony_ci */ 221862306a36Sopenharmony_ci free_gigantic_folio(folio, huge_page_order(h)); 221962306a36Sopenharmony_ci if (!retry) { 222062306a36Sopenharmony_ci retry = true; 222162306a36Sopenharmony_ci goto retry; 222262306a36Sopenharmony_ci } 222362306a36Sopenharmony_ci return NULL; 222462306a36Sopenharmony_ci } 222562306a36Sopenharmony_ci } 222662306a36Sopenharmony_ci prep_new_hugetlb_folio(h, folio, folio_nid(folio)); 222762306a36Sopenharmony_ci 222862306a36Sopenharmony_ci return folio; 222962306a36Sopenharmony_ci} 223062306a36Sopenharmony_ci 223162306a36Sopenharmony_ci/* 223262306a36Sopenharmony_ci * Allocates a fresh page to the hugetlb allocator pool in the node interleaved 223362306a36Sopenharmony_ci * manner. 223462306a36Sopenharmony_ci */ 223562306a36Sopenharmony_cistatic int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, 223662306a36Sopenharmony_ci nodemask_t *node_alloc_noretry) 223762306a36Sopenharmony_ci{ 223862306a36Sopenharmony_ci struct folio *folio; 223962306a36Sopenharmony_ci int nr_nodes, node; 224062306a36Sopenharmony_ci gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 224162306a36Sopenharmony_ci 224262306a36Sopenharmony_ci for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 224362306a36Sopenharmony_ci folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, 224462306a36Sopenharmony_ci nodes_allowed, node_alloc_noretry); 224562306a36Sopenharmony_ci if (folio) { 224662306a36Sopenharmony_ci free_huge_folio(folio); /* free it into the hugepage allocator */ 224762306a36Sopenharmony_ci return 1; 224862306a36Sopenharmony_ci } 224962306a36Sopenharmony_ci } 225062306a36Sopenharmony_ci 225162306a36Sopenharmony_ci return 0; 225262306a36Sopenharmony_ci} 225362306a36Sopenharmony_ci 225462306a36Sopenharmony_ci/* 225562306a36Sopenharmony_ci * Remove huge page from pool from next node to free. Attempt to keep 225662306a36Sopenharmony_ci * persistent huge pages more or less balanced over allowed nodes. 225762306a36Sopenharmony_ci * This routine only 'removes' the hugetlb page. The caller must make 225862306a36Sopenharmony_ci * an additional call to free the page to low level allocators. 225962306a36Sopenharmony_ci * Called with hugetlb_lock locked. 226062306a36Sopenharmony_ci */ 226162306a36Sopenharmony_cistatic struct page *remove_pool_huge_page(struct hstate *h, 226262306a36Sopenharmony_ci nodemask_t *nodes_allowed, 226362306a36Sopenharmony_ci bool acct_surplus) 226462306a36Sopenharmony_ci{ 226562306a36Sopenharmony_ci int nr_nodes, node; 226662306a36Sopenharmony_ci struct page *page = NULL; 226762306a36Sopenharmony_ci struct folio *folio; 226862306a36Sopenharmony_ci 226962306a36Sopenharmony_ci lockdep_assert_held(&hugetlb_lock); 227062306a36Sopenharmony_ci for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 227162306a36Sopenharmony_ci /* 227262306a36Sopenharmony_ci * If we're returning unused surplus pages, only examine 227362306a36Sopenharmony_ci * nodes with surplus pages. 
		 */
		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
		    !list_empty(&h->hugepage_freelists[node])) {
			page = list_entry(h->hugepage_freelists[node].next,
					  struct page, lru);
			folio = page_folio(page);
			remove_hugetlb_folio(h, folio, acct_surplus);
			break;
		}
	}

	return page;
}

/*
 * Dissolve a given free hugepage into free buddy pages. This function does
 * nothing for in-use hugepages and non-hugepages.
 * This function returns the following values:
 *
 *  -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
 *           when the system is under memory pressure and the feature of
 *           freeing unused vmemmap pages associated with each hugetlb page
 *           is enabled.
 *  -EBUSY:  failed to dissolve free hugepages or the hugepage is in-use
 *           (allocated or reserved).
 *       0:  successfully dissolved free hugepages or the page is not a
 *           hugepage (considered as already dissolved)
 */
int dissolve_free_huge_page(struct page *page)
{
	int rc = -EBUSY;
	struct folio *folio = page_folio(page);

retry:
	/* Not to disrupt normal path by vainly holding hugetlb_lock */
	if (!folio_test_hugetlb(folio))
		return 0;

	spin_lock_irq(&hugetlb_lock);
	if (!folio_test_hugetlb(folio)) {
		rc = 0;
		goto out;
	}

	if (!folio_ref_count(folio)) {
		struct hstate *h = folio_hstate(folio);
		if (!available_huge_pages(h))
			goto out;

		/*
		 * We should make sure that the page is already on the free list
		 * when it is dissolved.
		 */
		if (unlikely(!folio_test_hugetlb_freed(folio))) {
			spin_unlock_irq(&hugetlb_lock);
			cond_resched();

			/*
			 * Theoretically, we should return -EBUSY when we
			 * encounter this race. In fact, we have a chance
			 * to successfully dissolve the page if we retry,
			 * because the race window is quite small.
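			 * Concretely: the folio's refcount has already dropped
			 * to zero, but the freeing path has not yet enqueued it
			 * on the free list.  Dropping hugetlb_lock and
			 * rescheduling gives that path time to finish before
			 * we retry.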
			 * If we seize this opportunity, it is an optimization
			 * for increasing the success rate of dissolving the page.
			 */
			goto retry;
		}

		remove_hugetlb_folio(h, folio, false);
		h->max_huge_pages--;
		spin_unlock_irq(&hugetlb_lock);

		/*
		 * Normally update_and_free_hugetlb_folio will allocate required vmemmap
		 * before freeing the page.  update_and_free_hugetlb_folio will fail to
		 * free the page if it cannot allocate the required vmemmap.  We
		 * need to adjust max_huge_pages if the page is not freed.
		 * Attempt to allocate vmemmap here so that we can take
		 * appropriate action on failure.
		 */
		rc = hugetlb_vmemmap_restore(h, &folio->page);
		if (!rc) {
			update_and_free_hugetlb_folio(h, folio, false);
		} else {
			spin_lock_irq(&hugetlb_lock);
			add_hugetlb_folio(h, folio, false);
			h->max_huge_pages++;
			spin_unlock_irq(&hugetlb_lock);
		}

		return rc;
	}
out:
	spin_unlock_irq(&hugetlb_lock);
	return rc;
}

/*
 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
 * make specified memory blocks removable from the system.
 * Note that this will dissolve a free gigantic hugepage completely, if any
 * part of it lies within the given range.
 * Also note that if dissolve_free_huge_page() returns with an error, all
 * free hugepages that were dissolved before that error are lost.
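 *
 * The range is walked in steps of the smallest configured huge page size
 * (the minimum order across all hstates, computed below), so every possible
 * huge page head in [start_pfn, end_pfn) is visited.  For example, assuming
 * 4KiB base pages and a smallest hstate of 2MiB, the loop advances 512 pfns
 * per iteration.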
237862306a36Sopenharmony_ci */ 237962306a36Sopenharmony_ciint dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) 238062306a36Sopenharmony_ci{ 238162306a36Sopenharmony_ci unsigned long pfn; 238262306a36Sopenharmony_ci struct page *page; 238362306a36Sopenharmony_ci int rc = 0; 238462306a36Sopenharmony_ci unsigned int order; 238562306a36Sopenharmony_ci struct hstate *h; 238662306a36Sopenharmony_ci 238762306a36Sopenharmony_ci if (!hugepages_supported()) 238862306a36Sopenharmony_ci return rc; 238962306a36Sopenharmony_ci 239062306a36Sopenharmony_ci order = huge_page_order(&default_hstate); 239162306a36Sopenharmony_ci for_each_hstate(h) 239262306a36Sopenharmony_ci order = min(order, huge_page_order(h)); 239362306a36Sopenharmony_ci 239462306a36Sopenharmony_ci for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) { 239562306a36Sopenharmony_ci page = pfn_to_page(pfn); 239662306a36Sopenharmony_ci rc = dissolve_free_huge_page(page); 239762306a36Sopenharmony_ci if (rc) 239862306a36Sopenharmony_ci break; 239962306a36Sopenharmony_ci } 240062306a36Sopenharmony_ci 240162306a36Sopenharmony_ci return rc; 240262306a36Sopenharmony_ci} 240362306a36Sopenharmony_ci 240462306a36Sopenharmony_ci/* 240562306a36Sopenharmony_ci * Allocates a fresh surplus page from the page allocator. 240662306a36Sopenharmony_ci */ 240762306a36Sopenharmony_cistatic struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, 240862306a36Sopenharmony_ci gfp_t gfp_mask, int nid, nodemask_t *nmask) 240962306a36Sopenharmony_ci{ 241062306a36Sopenharmony_ci struct folio *folio = NULL; 241162306a36Sopenharmony_ci 241262306a36Sopenharmony_ci if (hstate_is_gigantic(h)) 241362306a36Sopenharmony_ci return NULL; 241462306a36Sopenharmony_ci 241562306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 241662306a36Sopenharmony_ci if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) 241762306a36Sopenharmony_ci goto out_unlock; 241862306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 241962306a36Sopenharmony_ci 242062306a36Sopenharmony_ci folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); 242162306a36Sopenharmony_ci if (!folio) 242262306a36Sopenharmony_ci return NULL; 242362306a36Sopenharmony_ci 242462306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 242562306a36Sopenharmony_ci /* 242662306a36Sopenharmony_ci * We could have raced with the pool size change. 242762306a36Sopenharmony_ci * Double check that and simply deallocate the new page 242862306a36Sopenharmony_ci * if we would end up overcommiting the surpluses. 
Abuse 242962306a36Sopenharmony_ci * temporary page to workaround the nasty free_huge_folio 243062306a36Sopenharmony_ci * codeflow 243162306a36Sopenharmony_ci */ 243262306a36Sopenharmony_ci if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { 243362306a36Sopenharmony_ci folio_set_hugetlb_temporary(folio); 243462306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 243562306a36Sopenharmony_ci free_huge_folio(folio); 243662306a36Sopenharmony_ci return NULL; 243762306a36Sopenharmony_ci } 243862306a36Sopenharmony_ci 243962306a36Sopenharmony_ci h->surplus_huge_pages++; 244062306a36Sopenharmony_ci h->surplus_huge_pages_node[folio_nid(folio)]++; 244162306a36Sopenharmony_ci 244262306a36Sopenharmony_ciout_unlock: 244362306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 244462306a36Sopenharmony_ci 244562306a36Sopenharmony_ci return folio; 244662306a36Sopenharmony_ci} 244762306a36Sopenharmony_ci 244862306a36Sopenharmony_cistatic struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, 244962306a36Sopenharmony_ci int nid, nodemask_t *nmask) 245062306a36Sopenharmony_ci{ 245162306a36Sopenharmony_ci struct folio *folio; 245262306a36Sopenharmony_ci 245362306a36Sopenharmony_ci if (hstate_is_gigantic(h)) 245462306a36Sopenharmony_ci return NULL; 245562306a36Sopenharmony_ci 245662306a36Sopenharmony_ci folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); 245762306a36Sopenharmony_ci if (!folio) 245862306a36Sopenharmony_ci return NULL; 245962306a36Sopenharmony_ci 246062306a36Sopenharmony_ci /* fresh huge pages are frozen */ 246162306a36Sopenharmony_ci folio_ref_unfreeze(folio, 1); 246262306a36Sopenharmony_ci /* 246362306a36Sopenharmony_ci * We do not account these pages as surplus because they are only 246462306a36Sopenharmony_ci * temporary and will be released properly on the last reference 246562306a36Sopenharmony_ci */ 246662306a36Sopenharmony_ci folio_set_hugetlb_temporary(folio); 246762306a36Sopenharmony_ci 246862306a36Sopenharmony_ci return folio; 246962306a36Sopenharmony_ci} 247062306a36Sopenharmony_ci 247162306a36Sopenharmony_ci/* 247262306a36Sopenharmony_ci * Use the VMA's mpolicy to allocate a huge page from the buddy. 
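 * For MPOL_PREFERRED_MANY policies, a first attempt is made on the preferred
 * nodes only, with direct reclaim disabled; if that fails, the allocation
 * falls back to a second attempt over all nodes.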
247362306a36Sopenharmony_ci */ 247462306a36Sopenharmony_cistatic 247562306a36Sopenharmony_cistruct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, 247662306a36Sopenharmony_ci struct vm_area_struct *vma, unsigned long addr) 247762306a36Sopenharmony_ci{ 247862306a36Sopenharmony_ci struct folio *folio = NULL; 247962306a36Sopenharmony_ci struct mempolicy *mpol; 248062306a36Sopenharmony_ci gfp_t gfp_mask = htlb_alloc_mask(h); 248162306a36Sopenharmony_ci int nid; 248262306a36Sopenharmony_ci nodemask_t *nodemask; 248362306a36Sopenharmony_ci 248462306a36Sopenharmony_ci nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); 248562306a36Sopenharmony_ci if (mpol_is_preferred_many(mpol)) { 248662306a36Sopenharmony_ci gfp_t gfp = gfp_mask | __GFP_NOWARN; 248762306a36Sopenharmony_ci 248862306a36Sopenharmony_ci gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 248962306a36Sopenharmony_ci folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); 249062306a36Sopenharmony_ci 249162306a36Sopenharmony_ci /* Fallback to all nodes if page==NULL */ 249262306a36Sopenharmony_ci nodemask = NULL; 249362306a36Sopenharmony_ci } 249462306a36Sopenharmony_ci 249562306a36Sopenharmony_ci if (!folio) 249662306a36Sopenharmony_ci folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); 249762306a36Sopenharmony_ci mpol_cond_put(mpol); 249862306a36Sopenharmony_ci return folio; 249962306a36Sopenharmony_ci} 250062306a36Sopenharmony_ci 250162306a36Sopenharmony_ci/* folio migration callback function */ 250262306a36Sopenharmony_cistruct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, 250362306a36Sopenharmony_ci nodemask_t *nmask, gfp_t gfp_mask) 250462306a36Sopenharmony_ci{ 250562306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 250662306a36Sopenharmony_ci if (available_huge_pages(h)) { 250762306a36Sopenharmony_ci struct folio *folio; 250862306a36Sopenharmony_ci 250962306a36Sopenharmony_ci folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, 251062306a36Sopenharmony_ci preferred_nid, nmask); 251162306a36Sopenharmony_ci if (folio) { 251262306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 251362306a36Sopenharmony_ci return folio; 251462306a36Sopenharmony_ci } 251562306a36Sopenharmony_ci } 251662306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 251762306a36Sopenharmony_ci 251862306a36Sopenharmony_ci return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); 251962306a36Sopenharmony_ci} 252062306a36Sopenharmony_ci 252162306a36Sopenharmony_ci/* mempolicy aware migration callback */ 252262306a36Sopenharmony_cistruct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, 252362306a36Sopenharmony_ci unsigned long address) 252462306a36Sopenharmony_ci{ 252562306a36Sopenharmony_ci struct mempolicy *mpol; 252662306a36Sopenharmony_ci nodemask_t *nodemask; 252762306a36Sopenharmony_ci struct folio *folio; 252862306a36Sopenharmony_ci gfp_t gfp_mask; 252962306a36Sopenharmony_ci int node; 253062306a36Sopenharmony_ci 253162306a36Sopenharmony_ci gfp_mask = htlb_alloc_mask(h); 253262306a36Sopenharmony_ci node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); 253362306a36Sopenharmony_ci folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); 253462306a36Sopenharmony_ci mpol_cond_put(mpol); 253562306a36Sopenharmony_ci 253662306a36Sopenharmony_ci return folio; 253762306a36Sopenharmony_ci} 253862306a36Sopenharmony_ci 253962306a36Sopenharmony_ci/* 254062306a36Sopenharmony_ci * Increase the hugetlb pool such that it can accommodate a reservation 
254162306a36Sopenharmony_ci * of size 'delta'. 254262306a36Sopenharmony_ci */ 254362306a36Sopenharmony_cistatic int gather_surplus_pages(struct hstate *h, long delta) 254462306a36Sopenharmony_ci __must_hold(&hugetlb_lock) 254562306a36Sopenharmony_ci{ 254662306a36Sopenharmony_ci LIST_HEAD(surplus_list); 254762306a36Sopenharmony_ci struct folio *folio, *tmp; 254862306a36Sopenharmony_ci int ret; 254962306a36Sopenharmony_ci long i; 255062306a36Sopenharmony_ci long needed, allocated; 255162306a36Sopenharmony_ci bool alloc_ok = true; 255262306a36Sopenharmony_ci 255362306a36Sopenharmony_ci lockdep_assert_held(&hugetlb_lock); 255462306a36Sopenharmony_ci needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 255562306a36Sopenharmony_ci if (needed <= 0) { 255662306a36Sopenharmony_ci h->resv_huge_pages += delta; 255762306a36Sopenharmony_ci return 0; 255862306a36Sopenharmony_ci } 255962306a36Sopenharmony_ci 256062306a36Sopenharmony_ci allocated = 0; 256162306a36Sopenharmony_ci 256262306a36Sopenharmony_ci ret = -ENOMEM; 256362306a36Sopenharmony_ciretry: 256462306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 256562306a36Sopenharmony_ci for (i = 0; i < needed; i++) { 256662306a36Sopenharmony_ci folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), 256762306a36Sopenharmony_ci NUMA_NO_NODE, NULL); 256862306a36Sopenharmony_ci if (!folio) { 256962306a36Sopenharmony_ci alloc_ok = false; 257062306a36Sopenharmony_ci break; 257162306a36Sopenharmony_ci } 257262306a36Sopenharmony_ci list_add(&folio->lru, &surplus_list); 257362306a36Sopenharmony_ci cond_resched(); 257462306a36Sopenharmony_ci } 257562306a36Sopenharmony_ci allocated += i; 257662306a36Sopenharmony_ci 257762306a36Sopenharmony_ci /* 257862306a36Sopenharmony_ci * After retaking hugetlb_lock, we need to recalculate 'needed' 257962306a36Sopenharmony_ci * because either resv_huge_pages or free_huge_pages may have changed. 258062306a36Sopenharmony_ci */ 258162306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 258262306a36Sopenharmony_ci needed = (h->resv_huge_pages + delta) - 258362306a36Sopenharmony_ci (h->free_huge_pages + allocated); 258462306a36Sopenharmony_ci if (needed > 0) { 258562306a36Sopenharmony_ci if (alloc_ok) 258662306a36Sopenharmony_ci goto retry; 258762306a36Sopenharmony_ci /* 258862306a36Sopenharmony_ci * We were not able to allocate enough pages to 258962306a36Sopenharmony_ci * satisfy the entire reservation so we free what 259062306a36Sopenharmony_ci * we've allocated so far. 259162306a36Sopenharmony_ci */ 259262306a36Sopenharmony_ci goto free; 259362306a36Sopenharmony_ci } 259462306a36Sopenharmony_ci /* 259562306a36Sopenharmony_ci * The surplus_list now contains _at_least_ the number of extra pages 259662306a36Sopenharmony_ci * needed to accommodate the reservation. Add the appropriate number 259762306a36Sopenharmony_ci * of pages to the hugetlb pool and free the extras back to the buddy 259862306a36Sopenharmony_ci * allocator. Commit the entire reservation here to prevent another 259962306a36Sopenharmony_ci * process from stealing the pages as they are added to the pool but 260062306a36Sopenharmony_ci * before they are reserved. 
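	 * Example: with resv_huge_pages = 10, delta = 4 and free_huge_pages =
	 * 12, 'needed' starts at 2; if both surplus folios are allocated and
	 * nothing else changes, the re-check leaves needed <= 0, the whole
	 * reservation is committed and both folios are enqueued.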
260162306a36Sopenharmony_ci */ 260262306a36Sopenharmony_ci needed += allocated; 260362306a36Sopenharmony_ci h->resv_huge_pages += delta; 260462306a36Sopenharmony_ci ret = 0; 260562306a36Sopenharmony_ci 260662306a36Sopenharmony_ci /* Free the needed pages to the hugetlb pool */ 260762306a36Sopenharmony_ci list_for_each_entry_safe(folio, tmp, &surplus_list, lru) { 260862306a36Sopenharmony_ci if ((--needed) < 0) 260962306a36Sopenharmony_ci break; 261062306a36Sopenharmony_ci /* Add the page to the hugetlb allocator */ 261162306a36Sopenharmony_ci enqueue_hugetlb_folio(h, folio); 261262306a36Sopenharmony_ci } 261362306a36Sopenharmony_cifree: 261462306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 261562306a36Sopenharmony_ci 261662306a36Sopenharmony_ci /* 261762306a36Sopenharmony_ci * Free unnecessary surplus pages to the buddy allocator. 261862306a36Sopenharmony_ci * Pages have no ref count, call free_huge_folio directly. 261962306a36Sopenharmony_ci */ 262062306a36Sopenharmony_ci list_for_each_entry_safe(folio, tmp, &surplus_list, lru) 262162306a36Sopenharmony_ci free_huge_folio(folio); 262262306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 262362306a36Sopenharmony_ci 262462306a36Sopenharmony_ci return ret; 262562306a36Sopenharmony_ci} 262662306a36Sopenharmony_ci 262762306a36Sopenharmony_ci/* 262862306a36Sopenharmony_ci * This routine has two main purposes: 262962306a36Sopenharmony_ci * 1) Decrement the reservation count (resv_huge_pages) by the value passed 263062306a36Sopenharmony_ci * in unused_resv_pages. This corresponds to the prior adjustments made 263162306a36Sopenharmony_ci * to the associated reservation map. 263262306a36Sopenharmony_ci * 2) Free any unused surplus pages that may have been allocated to satisfy 263362306a36Sopenharmony_ci * the reservation. As many as unused_resv_pages may be freed. 263462306a36Sopenharmony_ci */ 263562306a36Sopenharmony_cistatic void return_unused_surplus_pages(struct hstate *h, 263662306a36Sopenharmony_ci unsigned long unused_resv_pages) 263762306a36Sopenharmony_ci{ 263862306a36Sopenharmony_ci unsigned long nr_pages; 263962306a36Sopenharmony_ci struct page *page; 264062306a36Sopenharmony_ci LIST_HEAD(page_list); 264162306a36Sopenharmony_ci 264262306a36Sopenharmony_ci lockdep_assert_held(&hugetlb_lock); 264362306a36Sopenharmony_ci /* Uncommit the reservation */ 264462306a36Sopenharmony_ci h->resv_huge_pages -= unused_resv_pages; 264562306a36Sopenharmony_ci 264662306a36Sopenharmony_ci if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 264762306a36Sopenharmony_ci goto out; 264862306a36Sopenharmony_ci 264962306a36Sopenharmony_ci /* 265062306a36Sopenharmony_ci * Part (or even all) of the reservation could have been backed 265162306a36Sopenharmony_ci * by pre-allocated pages. Only free surplus pages. 265262306a36Sopenharmony_ci */ 265362306a36Sopenharmony_ci nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 265462306a36Sopenharmony_ci 265562306a36Sopenharmony_ci /* 265662306a36Sopenharmony_ci * We want to release as many surplus pages as possible, spread 265762306a36Sopenharmony_ci * evenly across all nodes with memory. Iterate across these nodes 265862306a36Sopenharmony_ci * until we can no longer free unreserved surplus pages. This occurs 265962306a36Sopenharmony_ci * when the nodes with surplus pages have no free pages. 266062306a36Sopenharmony_ci * remove_pool_huge_page() will balance the freed pages across the 266162306a36Sopenharmony_ci * on-line nodes with memory and will handle the hstate accounting. 
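	 * Example: if 6 reserved pages go unused but only 4 surplus pages
	 * exist, nr_pages is min(6, 4) = 4, so at most 4 pages are removed
	 * here and then freed via update_and_free_pages_bulk() below.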
266262306a36Sopenharmony_ci */ 266362306a36Sopenharmony_ci while (nr_pages--) { 266462306a36Sopenharmony_ci page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1); 266562306a36Sopenharmony_ci if (!page) 266662306a36Sopenharmony_ci goto out; 266762306a36Sopenharmony_ci 266862306a36Sopenharmony_ci list_add(&page->lru, &page_list); 266962306a36Sopenharmony_ci } 267062306a36Sopenharmony_ci 267162306a36Sopenharmony_ciout: 267262306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 267362306a36Sopenharmony_ci update_and_free_pages_bulk(h, &page_list); 267462306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 267562306a36Sopenharmony_ci} 267662306a36Sopenharmony_ci 267762306a36Sopenharmony_ci 267862306a36Sopenharmony_ci/* 267962306a36Sopenharmony_ci * vma_needs_reservation, vma_commit_reservation and vma_end_reservation 268062306a36Sopenharmony_ci * are used by the huge page allocation routines to manage reservations. 268162306a36Sopenharmony_ci * 268262306a36Sopenharmony_ci * vma_needs_reservation is called to determine if the huge page at addr 268362306a36Sopenharmony_ci * within the vma has an associated reservation. If a reservation is 268462306a36Sopenharmony_ci * needed, the value 1 is returned. The caller is then responsible for 268562306a36Sopenharmony_ci * managing the global reservation and subpool usage counts. After 268662306a36Sopenharmony_ci * the huge page has been allocated, vma_commit_reservation is called 268762306a36Sopenharmony_ci * to add the page to the reservation map. If the page allocation fails, 268862306a36Sopenharmony_ci * the reservation must be ended instead of committed. vma_end_reservation 268962306a36Sopenharmony_ci * is called in such cases. 269062306a36Sopenharmony_ci * 269162306a36Sopenharmony_ci * In the normal case, vma_commit_reservation returns the same value 269262306a36Sopenharmony_ci * as the preceding vma_needs_reservation call. The only time this 269362306a36Sopenharmony_ci * is not the case is if a reserve map was changed between calls. It 269462306a36Sopenharmony_ci * is the responsibility of the caller to notice the difference and 269562306a36Sopenharmony_ci * take appropriate action. 269662306a36Sopenharmony_ci * 269762306a36Sopenharmony_ci * vma_add_reservation is used in error paths where a reservation must 269862306a36Sopenharmony_ci * be restored when a newly allocated huge page must be freed. It is 269962306a36Sopenharmony_ci * to be called after calling vma_needs_reservation to determine if a 270062306a36Sopenharmony_ci * reservation exists. 270162306a36Sopenharmony_ci * 270262306a36Sopenharmony_ci * vma_del_reservation is used in error paths where an entry in the reserve 270362306a36Sopenharmony_ci * map was created during huge page allocation and must be removed. It is to 270462306a36Sopenharmony_ci * be called after calling vma_needs_reservation to determine if a reservation 270562306a36Sopenharmony_ci * exists. 
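 *
 * A typical allocation path therefore looks roughly like:
 *
 *	chg = vma_needs_reservation(h, vma, addr);
 *	... allocate the huge page ...
 *	if (allocation succeeded)
 *		vma_commit_reservation(h, vma, addr);
 *	else
 *		vma_end_reservation(h, vma, addr);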
270662306a36Sopenharmony_ci */ 270762306a36Sopenharmony_cienum vma_resv_mode { 270862306a36Sopenharmony_ci VMA_NEEDS_RESV, 270962306a36Sopenharmony_ci VMA_COMMIT_RESV, 271062306a36Sopenharmony_ci VMA_END_RESV, 271162306a36Sopenharmony_ci VMA_ADD_RESV, 271262306a36Sopenharmony_ci VMA_DEL_RESV, 271362306a36Sopenharmony_ci}; 271462306a36Sopenharmony_cistatic long __vma_reservation_common(struct hstate *h, 271562306a36Sopenharmony_ci struct vm_area_struct *vma, unsigned long addr, 271662306a36Sopenharmony_ci enum vma_resv_mode mode) 271762306a36Sopenharmony_ci{ 271862306a36Sopenharmony_ci struct resv_map *resv; 271962306a36Sopenharmony_ci pgoff_t idx; 272062306a36Sopenharmony_ci long ret; 272162306a36Sopenharmony_ci long dummy_out_regions_needed; 272262306a36Sopenharmony_ci 272362306a36Sopenharmony_ci resv = vma_resv_map(vma); 272462306a36Sopenharmony_ci if (!resv) 272562306a36Sopenharmony_ci return 1; 272662306a36Sopenharmony_ci 272762306a36Sopenharmony_ci idx = vma_hugecache_offset(h, vma, addr); 272862306a36Sopenharmony_ci switch (mode) { 272962306a36Sopenharmony_ci case VMA_NEEDS_RESV: 273062306a36Sopenharmony_ci ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); 273162306a36Sopenharmony_ci /* We assume that vma_reservation_* routines always operate on 273262306a36Sopenharmony_ci * 1 page, and that adding to resv map a 1 page entry can only 273362306a36Sopenharmony_ci * ever require 1 region. 273462306a36Sopenharmony_ci */ 273562306a36Sopenharmony_ci VM_BUG_ON(dummy_out_regions_needed != 1); 273662306a36Sopenharmony_ci break; 273762306a36Sopenharmony_ci case VMA_COMMIT_RESV: 273862306a36Sopenharmony_ci ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 273962306a36Sopenharmony_ci /* region_add calls of range 1 should never fail. */ 274062306a36Sopenharmony_ci VM_BUG_ON(ret < 0); 274162306a36Sopenharmony_ci break; 274262306a36Sopenharmony_ci case VMA_END_RESV: 274362306a36Sopenharmony_ci region_abort(resv, idx, idx + 1, 1); 274462306a36Sopenharmony_ci ret = 0; 274562306a36Sopenharmony_ci break; 274662306a36Sopenharmony_ci case VMA_ADD_RESV: 274762306a36Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE) { 274862306a36Sopenharmony_ci ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 274962306a36Sopenharmony_ci /* region_add calls of range 1 should never fail. */ 275062306a36Sopenharmony_ci VM_BUG_ON(ret < 0); 275162306a36Sopenharmony_ci } else { 275262306a36Sopenharmony_ci region_abort(resv, idx, idx + 1, 1); 275362306a36Sopenharmony_ci ret = region_del(resv, idx, idx + 1); 275462306a36Sopenharmony_ci } 275562306a36Sopenharmony_ci break; 275662306a36Sopenharmony_ci case VMA_DEL_RESV: 275762306a36Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE) { 275862306a36Sopenharmony_ci region_abort(resv, idx, idx + 1, 1); 275962306a36Sopenharmony_ci ret = region_del(resv, idx, idx + 1); 276062306a36Sopenharmony_ci } else { 276162306a36Sopenharmony_ci ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 276262306a36Sopenharmony_ci /* region_add calls of range 1 should never fail. */ 276362306a36Sopenharmony_ci VM_BUG_ON(ret < 0); 276462306a36Sopenharmony_ci } 276562306a36Sopenharmony_ci break; 276662306a36Sopenharmony_ci default: 276762306a36Sopenharmony_ci BUG(); 276862306a36Sopenharmony_ci } 276962306a36Sopenharmony_ci 277062306a36Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV) 277162306a36Sopenharmony_ci return ret; 277262306a36Sopenharmony_ci /* 277362306a36Sopenharmony_ci * We know private mapping must have HPAGE_RESV_OWNER set. 
277462306a36Sopenharmony_ci * 277562306a36Sopenharmony_ci * In most cases, reserves always exist for private mappings. 277662306a36Sopenharmony_ci * However, a file associated with mapping could have been 277762306a36Sopenharmony_ci * hole punched or truncated after reserves were consumed. 277862306a36Sopenharmony_ci * As subsequent fault on such a range will not use reserves. 277962306a36Sopenharmony_ci * Subtle - The reserve map for private mappings has the 278062306a36Sopenharmony_ci * opposite meaning than that of shared mappings. If NO 278162306a36Sopenharmony_ci * entry is in the reserve map, it means a reservation exists. 278262306a36Sopenharmony_ci * If an entry exists in the reserve map, it means the 278362306a36Sopenharmony_ci * reservation has already been consumed. As a result, the 278462306a36Sopenharmony_ci * return value of this routine is the opposite of the 278562306a36Sopenharmony_ci * value returned from reserve map manipulation routines above. 278662306a36Sopenharmony_ci */ 278762306a36Sopenharmony_ci if (ret > 0) 278862306a36Sopenharmony_ci return 0; 278962306a36Sopenharmony_ci if (ret == 0) 279062306a36Sopenharmony_ci return 1; 279162306a36Sopenharmony_ci return ret; 279262306a36Sopenharmony_ci} 279362306a36Sopenharmony_ci 279462306a36Sopenharmony_cistatic long vma_needs_reservation(struct hstate *h, 279562306a36Sopenharmony_ci struct vm_area_struct *vma, unsigned long addr) 279662306a36Sopenharmony_ci{ 279762306a36Sopenharmony_ci return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); 279862306a36Sopenharmony_ci} 279962306a36Sopenharmony_ci 280062306a36Sopenharmony_cistatic long vma_commit_reservation(struct hstate *h, 280162306a36Sopenharmony_ci struct vm_area_struct *vma, unsigned long addr) 280262306a36Sopenharmony_ci{ 280362306a36Sopenharmony_ci return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); 280462306a36Sopenharmony_ci} 280562306a36Sopenharmony_ci 280662306a36Sopenharmony_cistatic void vma_end_reservation(struct hstate *h, 280762306a36Sopenharmony_ci struct vm_area_struct *vma, unsigned long addr) 280862306a36Sopenharmony_ci{ 280962306a36Sopenharmony_ci (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); 281062306a36Sopenharmony_ci} 281162306a36Sopenharmony_ci 281262306a36Sopenharmony_cistatic long vma_add_reservation(struct hstate *h, 281362306a36Sopenharmony_ci struct vm_area_struct *vma, unsigned long addr) 281462306a36Sopenharmony_ci{ 281562306a36Sopenharmony_ci return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); 281662306a36Sopenharmony_ci} 281762306a36Sopenharmony_ci 281862306a36Sopenharmony_cistatic long vma_del_reservation(struct hstate *h, 281962306a36Sopenharmony_ci struct vm_area_struct *vma, unsigned long addr) 282062306a36Sopenharmony_ci{ 282162306a36Sopenharmony_ci return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV); 282262306a36Sopenharmony_ci} 282362306a36Sopenharmony_ci 282462306a36Sopenharmony_ci/* 282562306a36Sopenharmony_ci * This routine is called to restore reservation information on error paths. 282662306a36Sopenharmony_ci * It should ONLY be called for folios allocated via alloc_hugetlb_folio(), 282762306a36Sopenharmony_ci * and the hugetlb mutex should remain held when calling this routine. 282862306a36Sopenharmony_ci * 282962306a36Sopenharmony_ci * It handles two specific cases: 283062306a36Sopenharmony_ci * 1) A reservation was in place and the folio consumed the reservation. 283162306a36Sopenharmony_ci * hugetlb_restore_reserve is set in the folio. 
283262306a36Sopenharmony_ci * 2) No reservation was in place for the page, so hugetlb_restore_reserve is 283362306a36Sopenharmony_ci * not set. However, alloc_hugetlb_folio always updates the reserve map. 283462306a36Sopenharmony_ci * 283562306a36Sopenharmony_ci * In case 1, free_huge_folio later in the error path will increment the 283662306a36Sopenharmony_ci * global reserve count. But, free_huge_folio does not have enough context 283762306a36Sopenharmony_ci * to adjust the reservation map. This case deals primarily with private 283862306a36Sopenharmony_ci * mappings. Adjust the reserve map here to be consistent with global 283962306a36Sopenharmony_ci * reserve count adjustments to be made by free_huge_folio. Make sure the 284062306a36Sopenharmony_ci * reserve map indicates there is a reservation present. 284162306a36Sopenharmony_ci * 284262306a36Sopenharmony_ci * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. 284362306a36Sopenharmony_ci */ 284462306a36Sopenharmony_civoid restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, 284562306a36Sopenharmony_ci unsigned long address, struct folio *folio) 284662306a36Sopenharmony_ci{ 284762306a36Sopenharmony_ci long rc = vma_needs_reservation(h, vma, address); 284862306a36Sopenharmony_ci 284962306a36Sopenharmony_ci if (folio_test_hugetlb_restore_reserve(folio)) { 285062306a36Sopenharmony_ci if (unlikely(rc < 0)) 285162306a36Sopenharmony_ci /* 285262306a36Sopenharmony_ci * Rare out of memory condition in reserve map 285362306a36Sopenharmony_ci * manipulation. Clear hugetlb_restore_reserve so 285462306a36Sopenharmony_ci * that global reserve count will not be incremented 285562306a36Sopenharmony_ci * by free_huge_folio. This will make it appear 285662306a36Sopenharmony_ci * as though the reservation for this folio was 285762306a36Sopenharmony_ci * consumed. This may prevent the task from 285862306a36Sopenharmony_ci * faulting in the folio at a later time. This 285962306a36Sopenharmony_ci * is better than inconsistent global huge page 286062306a36Sopenharmony_ci * accounting of reserve counts. 286162306a36Sopenharmony_ci */ 286262306a36Sopenharmony_ci folio_clear_hugetlb_restore_reserve(folio); 286362306a36Sopenharmony_ci else if (rc) 286462306a36Sopenharmony_ci (void)vma_add_reservation(h, vma, address); 286562306a36Sopenharmony_ci else 286662306a36Sopenharmony_ci vma_end_reservation(h, vma, address); 286762306a36Sopenharmony_ci } else { 286862306a36Sopenharmony_ci if (!rc) { 286962306a36Sopenharmony_ci /* 287062306a36Sopenharmony_ci * This indicates there is an entry in the reserve map 287162306a36Sopenharmony_ci * not added by alloc_hugetlb_folio. We know it was added 287262306a36Sopenharmony_ci * before the alloc_hugetlb_folio call, otherwise 287362306a36Sopenharmony_ci * hugetlb_restore_reserve would be set on the folio. 287462306a36Sopenharmony_ci * Remove the entry so that a subsequent allocation 287562306a36Sopenharmony_ci * does not consume a reservation. 287662306a36Sopenharmony_ci */ 287762306a36Sopenharmony_ci rc = vma_del_reservation(h, vma, address); 287862306a36Sopenharmony_ci if (rc < 0) 287962306a36Sopenharmony_ci /* 288062306a36Sopenharmony_ci * VERY rare out of memory condition. Since 288162306a36Sopenharmony_ci * we can not delete the entry, set 288262306a36Sopenharmony_ci * hugetlb_restore_reserve so that the reserve 288362306a36Sopenharmony_ci * count will be incremented when the folio 288462306a36Sopenharmony_ci * is freed. 
This reserve will be consumed 288562306a36Sopenharmony_ci * on a subsequent allocation. 288662306a36Sopenharmony_ci */ 288762306a36Sopenharmony_ci folio_set_hugetlb_restore_reserve(folio); 288862306a36Sopenharmony_ci } else if (rc < 0) { 288962306a36Sopenharmony_ci /* 289062306a36Sopenharmony_ci * Rare out of memory condition from 289162306a36Sopenharmony_ci * vma_needs_reservation call. Memory allocation is 289262306a36Sopenharmony_ci * only attempted if a new entry is needed. Therefore, 289362306a36Sopenharmony_ci * this implies there is not an entry in the 289462306a36Sopenharmony_ci * reserve map. 289562306a36Sopenharmony_ci * 289662306a36Sopenharmony_ci * For shared mappings, no entry in the map indicates 289762306a36Sopenharmony_ci * no reservation. We are done. 289862306a36Sopenharmony_ci */ 289962306a36Sopenharmony_ci if (!(vma->vm_flags & VM_MAYSHARE)) 290062306a36Sopenharmony_ci /* 290162306a36Sopenharmony_ci * For private mappings, no entry indicates 290262306a36Sopenharmony_ci * a reservation is present. Since we can 290362306a36Sopenharmony_ci * not add an entry, set hugetlb_restore_reserve 290462306a36Sopenharmony_ci * on the folio so reserve count will be 290562306a36Sopenharmony_ci * incremented when freed. This reserve will 290662306a36Sopenharmony_ci * be consumed on a subsequent allocation. 290762306a36Sopenharmony_ci */ 290862306a36Sopenharmony_ci folio_set_hugetlb_restore_reserve(folio); 290962306a36Sopenharmony_ci } else 291062306a36Sopenharmony_ci /* 291162306a36Sopenharmony_ci * No reservation present, do nothing 291262306a36Sopenharmony_ci */ 291362306a36Sopenharmony_ci vma_end_reservation(h, vma, address); 291462306a36Sopenharmony_ci } 291562306a36Sopenharmony_ci} 291662306a36Sopenharmony_ci 291762306a36Sopenharmony_ci/* 291862306a36Sopenharmony_ci * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve 291962306a36Sopenharmony_ci * the old one 292062306a36Sopenharmony_ci * @h: struct hstate old page belongs to 292162306a36Sopenharmony_ci * @old_folio: Old folio to dissolve 292262306a36Sopenharmony_ci * @list: List to isolate the page in case we need to 292362306a36Sopenharmony_ci * Returns 0 on success, otherwise negated error. 292462306a36Sopenharmony_ci */ 292562306a36Sopenharmony_cistatic int alloc_and_dissolve_hugetlb_folio(struct hstate *h, 292662306a36Sopenharmony_ci struct folio *old_folio, struct list_head *list) 292762306a36Sopenharmony_ci{ 292862306a36Sopenharmony_ci gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 292962306a36Sopenharmony_ci int nid = folio_nid(old_folio); 293062306a36Sopenharmony_ci struct folio *new_folio; 293162306a36Sopenharmony_ci int ret = 0; 293262306a36Sopenharmony_ci 293362306a36Sopenharmony_ci /* 293462306a36Sopenharmony_ci * Before dissolving the folio, we need to allocate a new one for the 293562306a36Sopenharmony_ci * pool to remain stable. Here, we allocate the folio and 'prep' it 293662306a36Sopenharmony_ci * by doing everything but actually updating counters and adding to 293762306a36Sopenharmony_ci * the pool. This simplifies and let us do most of the processing 293862306a36Sopenharmony_ci * under the lock. 
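	 * Once hugetlb_lock is retaken below, old_folio is handled according
	 * to its state: already freed from under us, still referenced (try to
	 * isolate it instead), freed but not yet on the free list (retry), or
	 * a genuine free huge page that can be replaced by new_folio.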
293962306a36Sopenharmony_ci */ 294062306a36Sopenharmony_ci new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); 294162306a36Sopenharmony_ci if (!new_folio) 294262306a36Sopenharmony_ci return -ENOMEM; 294362306a36Sopenharmony_ci __prep_new_hugetlb_folio(h, new_folio); 294462306a36Sopenharmony_ci 294562306a36Sopenharmony_ciretry: 294662306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 294762306a36Sopenharmony_ci if (!folio_test_hugetlb(old_folio)) { 294862306a36Sopenharmony_ci /* 294962306a36Sopenharmony_ci * Freed from under us. Drop new_folio too. 295062306a36Sopenharmony_ci */ 295162306a36Sopenharmony_ci goto free_new; 295262306a36Sopenharmony_ci } else if (folio_ref_count(old_folio)) { 295362306a36Sopenharmony_ci bool isolated; 295462306a36Sopenharmony_ci 295562306a36Sopenharmony_ci /* 295662306a36Sopenharmony_ci * Someone has grabbed the folio, try to isolate it here. 295762306a36Sopenharmony_ci * Fail with -EBUSY if not possible. 295862306a36Sopenharmony_ci */ 295962306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 296062306a36Sopenharmony_ci isolated = isolate_hugetlb(old_folio, list); 296162306a36Sopenharmony_ci ret = isolated ? 0 : -EBUSY; 296262306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 296362306a36Sopenharmony_ci goto free_new; 296462306a36Sopenharmony_ci } else if (!folio_test_hugetlb_freed(old_folio)) { 296562306a36Sopenharmony_ci /* 296662306a36Sopenharmony_ci * Folio's refcount is 0 but it has not been enqueued in the 296762306a36Sopenharmony_ci * freelist yet. Race window is small, so we can succeed here if 296862306a36Sopenharmony_ci * we retry. 296962306a36Sopenharmony_ci */ 297062306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 297162306a36Sopenharmony_ci cond_resched(); 297262306a36Sopenharmony_ci goto retry; 297362306a36Sopenharmony_ci } else { 297462306a36Sopenharmony_ci /* 297562306a36Sopenharmony_ci * Ok, old_folio is still a genuine free hugepage. Remove it from 297662306a36Sopenharmony_ci * the freelist and decrease the counters. These will be 297762306a36Sopenharmony_ci * incremented again when calling __prep_account_new_huge_page() 297862306a36Sopenharmony_ci * and enqueue_hugetlb_folio() for new_folio. The counters will 297962306a36Sopenharmony_ci * remain stable since this happens under the lock. 298062306a36Sopenharmony_ci */ 298162306a36Sopenharmony_ci remove_hugetlb_folio(h, old_folio, false); 298262306a36Sopenharmony_ci 298362306a36Sopenharmony_ci /* 298462306a36Sopenharmony_ci * Ref count on new_folio is already zero as it was dropped 298562306a36Sopenharmony_ci * earlier. It can be directly added to the pool free list. 298662306a36Sopenharmony_ci */ 298762306a36Sopenharmony_ci __prep_account_new_huge_page(h, nid); 298862306a36Sopenharmony_ci enqueue_hugetlb_folio(h, new_folio); 298962306a36Sopenharmony_ci 299062306a36Sopenharmony_ci /* 299162306a36Sopenharmony_ci * Folio has been replaced, we can safely free the old one. 
299262306a36Sopenharmony_ci */ 299362306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 299462306a36Sopenharmony_ci update_and_free_hugetlb_folio(h, old_folio, false); 299562306a36Sopenharmony_ci } 299662306a36Sopenharmony_ci 299762306a36Sopenharmony_ci return ret; 299862306a36Sopenharmony_ci 299962306a36Sopenharmony_cifree_new: 300062306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 300162306a36Sopenharmony_ci /* Folio has a zero ref count, but needs a ref to be freed */ 300262306a36Sopenharmony_ci folio_ref_unfreeze(new_folio, 1); 300362306a36Sopenharmony_ci update_and_free_hugetlb_folio(h, new_folio, false); 300462306a36Sopenharmony_ci 300562306a36Sopenharmony_ci return ret; 300662306a36Sopenharmony_ci} 300762306a36Sopenharmony_ci 300862306a36Sopenharmony_ciint isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) 300962306a36Sopenharmony_ci{ 301062306a36Sopenharmony_ci struct hstate *h; 301162306a36Sopenharmony_ci struct folio *folio = page_folio(page); 301262306a36Sopenharmony_ci int ret = -EBUSY; 301362306a36Sopenharmony_ci 301462306a36Sopenharmony_ci /* 301562306a36Sopenharmony_ci * The page might have been dissolved from under our feet, so make sure 301662306a36Sopenharmony_ci * to carefully check the state under the lock. 301762306a36Sopenharmony_ci * Return success when racing as if we dissolved the page ourselves. 301862306a36Sopenharmony_ci */ 301962306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 302062306a36Sopenharmony_ci if (folio_test_hugetlb(folio)) { 302162306a36Sopenharmony_ci h = folio_hstate(folio); 302262306a36Sopenharmony_ci } else { 302362306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 302462306a36Sopenharmony_ci return 0; 302562306a36Sopenharmony_ci } 302662306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 302762306a36Sopenharmony_ci 302862306a36Sopenharmony_ci /* 302962306a36Sopenharmony_ci * Fence off gigantic pages as there is a cyclic dependency between 303062306a36Sopenharmony_ci * alloc_contig_range and them. Return -ENOMEM as this has the effect 303162306a36Sopenharmony_ci * of bailing out right away without further retrying. 
303262306a36Sopenharmony_ci */ 303362306a36Sopenharmony_ci if (hstate_is_gigantic(h)) 303462306a36Sopenharmony_ci return -ENOMEM; 303562306a36Sopenharmony_ci 303662306a36Sopenharmony_ci if (folio_ref_count(folio) && isolate_hugetlb(folio, list)) 303762306a36Sopenharmony_ci ret = 0; 303862306a36Sopenharmony_ci else if (!folio_ref_count(folio)) 303962306a36Sopenharmony_ci ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); 304062306a36Sopenharmony_ci 304162306a36Sopenharmony_ci return ret; 304262306a36Sopenharmony_ci} 304362306a36Sopenharmony_ci 304462306a36Sopenharmony_cistruct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, 304562306a36Sopenharmony_ci unsigned long addr, int avoid_reserve) 304662306a36Sopenharmony_ci{ 304762306a36Sopenharmony_ci struct hugepage_subpool *spool = subpool_vma(vma); 304862306a36Sopenharmony_ci struct hstate *h = hstate_vma(vma); 304962306a36Sopenharmony_ci struct folio *folio; 305062306a36Sopenharmony_ci long map_chg, map_commit; 305162306a36Sopenharmony_ci long gbl_chg; 305262306a36Sopenharmony_ci int ret, idx; 305362306a36Sopenharmony_ci struct hugetlb_cgroup *h_cg = NULL; 305462306a36Sopenharmony_ci bool deferred_reserve; 305562306a36Sopenharmony_ci 305662306a36Sopenharmony_ci idx = hstate_index(h); 305762306a36Sopenharmony_ci /* 305862306a36Sopenharmony_ci * Examine the region/reserve map to determine if the process 305962306a36Sopenharmony_ci * has a reservation for the page to be allocated. A return 306062306a36Sopenharmony_ci * code of zero indicates a reservation exists (no change). 306162306a36Sopenharmony_ci */ 306262306a36Sopenharmony_ci map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); 306362306a36Sopenharmony_ci if (map_chg < 0) 306462306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 306562306a36Sopenharmony_ci 306662306a36Sopenharmony_ci /* 306762306a36Sopenharmony_ci * Processes that did not create the mapping will have no 306862306a36Sopenharmony_ci * reserves as indicated by the region/reserve map. Check 306962306a36Sopenharmony_ci * that the allocation will not exceed the subpool limit. 307062306a36Sopenharmony_ci * Allocations for MAP_NORESERVE mappings also need to be 307162306a36Sopenharmony_ci * checked against any subpool limit. 307262306a36Sopenharmony_ci */ 307362306a36Sopenharmony_ci if (map_chg || avoid_reserve) { 307462306a36Sopenharmony_ci gbl_chg = hugepage_subpool_get_pages(spool, 1); 307562306a36Sopenharmony_ci if (gbl_chg < 0) { 307662306a36Sopenharmony_ci vma_end_reservation(h, vma, addr); 307762306a36Sopenharmony_ci return ERR_PTR(-ENOSPC); 307862306a36Sopenharmony_ci } 307962306a36Sopenharmony_ci 308062306a36Sopenharmony_ci /* 308162306a36Sopenharmony_ci * Even though there was no reservation in the region/reserve 308262306a36Sopenharmony_ci * map, there could be reservations associated with the 308362306a36Sopenharmony_ci * subpool that can be used. This would be indicated if the 308462306a36Sopenharmony_ci * return value of hugepage_subpool_get_pages() is zero. 308562306a36Sopenharmony_ci * However, if avoid_reserve is specified we still avoid even 308662306a36Sopenharmony_ci * the subpool reservations. 308762306a36Sopenharmony_ci */ 308862306a36Sopenharmony_ci if (avoid_reserve) 308962306a36Sopenharmony_ci gbl_chg = 1; 309062306a36Sopenharmony_ci } 309162306a36Sopenharmony_ci 309262306a36Sopenharmony_ci /* If this allocation is not consuming a reservation, charge it now. 
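	 * Here "not consuming a reservation" means either no reserve map entry
	 * existed for this page (map_chg != 0) or the caller asked to bypass
	 * reserves (avoid_reserve), so the reservation cgroup charge is taken
	 * up front.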
309362306a36Sopenharmony_ci */ 309462306a36Sopenharmony_ci deferred_reserve = map_chg || avoid_reserve; 309562306a36Sopenharmony_ci if (deferred_reserve) { 309662306a36Sopenharmony_ci ret = hugetlb_cgroup_charge_cgroup_rsvd( 309762306a36Sopenharmony_ci idx, pages_per_huge_page(h), &h_cg); 309862306a36Sopenharmony_ci if (ret) 309962306a36Sopenharmony_ci goto out_subpool_put; 310062306a36Sopenharmony_ci } 310162306a36Sopenharmony_ci 310262306a36Sopenharmony_ci ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 310362306a36Sopenharmony_ci if (ret) 310462306a36Sopenharmony_ci goto out_uncharge_cgroup_reservation; 310562306a36Sopenharmony_ci 310662306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 310762306a36Sopenharmony_ci /* 310862306a36Sopenharmony_ci * glb_chg is passed to indicate whether or not a page must be taken 310962306a36Sopenharmony_ci * from the global free pool (global change). gbl_chg == 0 indicates 311062306a36Sopenharmony_ci * a reservation exists for the allocation. 311162306a36Sopenharmony_ci */ 311262306a36Sopenharmony_ci folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); 311362306a36Sopenharmony_ci if (!folio) { 311462306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 311562306a36Sopenharmony_ci folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); 311662306a36Sopenharmony_ci if (!folio) 311762306a36Sopenharmony_ci goto out_uncharge_cgroup; 311862306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 311962306a36Sopenharmony_ci if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { 312062306a36Sopenharmony_ci folio_set_hugetlb_restore_reserve(folio); 312162306a36Sopenharmony_ci h->resv_huge_pages--; 312262306a36Sopenharmony_ci } 312362306a36Sopenharmony_ci list_add(&folio->lru, &h->hugepage_activelist); 312462306a36Sopenharmony_ci folio_ref_unfreeze(folio, 1); 312562306a36Sopenharmony_ci /* Fall through */ 312662306a36Sopenharmony_ci } 312762306a36Sopenharmony_ci 312862306a36Sopenharmony_ci hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); 312962306a36Sopenharmony_ci /* If allocation is not consuming a reservation, also store the 313062306a36Sopenharmony_ci * hugetlb_cgroup pointer on the page. 313162306a36Sopenharmony_ci */ 313262306a36Sopenharmony_ci if (deferred_reserve) { 313362306a36Sopenharmony_ci hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), 313462306a36Sopenharmony_ci h_cg, folio); 313562306a36Sopenharmony_ci } 313662306a36Sopenharmony_ci 313762306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 313862306a36Sopenharmony_ci 313962306a36Sopenharmony_ci hugetlb_set_folio_subpool(folio, spool); 314062306a36Sopenharmony_ci 314162306a36Sopenharmony_ci map_commit = vma_commit_reservation(h, vma, addr); 314262306a36Sopenharmony_ci if (unlikely(map_chg > map_commit)) { 314362306a36Sopenharmony_ci /* 314462306a36Sopenharmony_ci * The page was added to the reservation map between 314562306a36Sopenharmony_ci * vma_needs_reservation and vma_commit_reservation. 314662306a36Sopenharmony_ci * This indicates a race with hugetlb_reserve_pages. 314762306a36Sopenharmony_ci * Adjust for the subpool count incremented above AND 314862306a36Sopenharmony_ci * in hugetlb_reserve_pages for the same page. Also, 314962306a36Sopenharmony_ci * the reservation count added in hugetlb_reserve_pages 315062306a36Sopenharmony_ci * no longer applies. 
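		 * The net effect: the subpool page taken above is returned,
		 * the global reserve is adjusted by whatever the subpool hands
		 * back, and the reservation cgroup charge (if it was taken
		 * above) is dropped again.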
315162306a36Sopenharmony_ci */ 315262306a36Sopenharmony_ci long rsv_adjust; 315362306a36Sopenharmony_ci 315462306a36Sopenharmony_ci rsv_adjust = hugepage_subpool_put_pages(spool, 1); 315562306a36Sopenharmony_ci hugetlb_acct_memory(h, -rsv_adjust); 315662306a36Sopenharmony_ci if (deferred_reserve) 315762306a36Sopenharmony_ci hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), 315862306a36Sopenharmony_ci pages_per_huge_page(h), folio); 315962306a36Sopenharmony_ci } 316062306a36Sopenharmony_ci return folio; 316162306a36Sopenharmony_ci 316262306a36Sopenharmony_ciout_uncharge_cgroup: 316362306a36Sopenharmony_ci hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); 316462306a36Sopenharmony_ciout_uncharge_cgroup_reservation: 316562306a36Sopenharmony_ci if (deferred_reserve) 316662306a36Sopenharmony_ci hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), 316762306a36Sopenharmony_ci h_cg); 316862306a36Sopenharmony_ciout_subpool_put: 316962306a36Sopenharmony_ci if (map_chg || avoid_reserve) 317062306a36Sopenharmony_ci hugepage_subpool_put_pages(spool, 1); 317162306a36Sopenharmony_ci vma_end_reservation(h, vma, addr); 317262306a36Sopenharmony_ci return ERR_PTR(-ENOSPC); 317362306a36Sopenharmony_ci} 317462306a36Sopenharmony_ci 317562306a36Sopenharmony_ciint alloc_bootmem_huge_page(struct hstate *h, int nid) 317662306a36Sopenharmony_ci __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); 317762306a36Sopenharmony_ciint __alloc_bootmem_huge_page(struct hstate *h, int nid) 317862306a36Sopenharmony_ci{ 317962306a36Sopenharmony_ci struct huge_bootmem_page *m = NULL; /* initialize for clang */ 318062306a36Sopenharmony_ci int nr_nodes, node; 318162306a36Sopenharmony_ci 318262306a36Sopenharmony_ci /* do node specific alloc */ 318362306a36Sopenharmony_ci if (nid != NUMA_NO_NODE) { 318462306a36Sopenharmony_ci m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), 318562306a36Sopenharmony_ci 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); 318662306a36Sopenharmony_ci if (!m) 318762306a36Sopenharmony_ci return 0; 318862306a36Sopenharmony_ci goto found; 318962306a36Sopenharmony_ci } 319062306a36Sopenharmony_ci /* allocate from next node when distributing huge pages */ 319162306a36Sopenharmony_ci for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 319262306a36Sopenharmony_ci m = memblock_alloc_try_nid_raw( 319362306a36Sopenharmony_ci huge_page_size(h), huge_page_size(h), 319462306a36Sopenharmony_ci 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); 319562306a36Sopenharmony_ci /* 319662306a36Sopenharmony_ci * Use the beginning of the huge page to store the 319762306a36Sopenharmony_ci * huge_bootmem_page struct (until gather_bootmem 319862306a36Sopenharmony_ci * puts them into the mem_map). 319962306a36Sopenharmony_ci */ 320062306a36Sopenharmony_ci if (!m) 320162306a36Sopenharmony_ci return 0; 320262306a36Sopenharmony_ci goto found; 320362306a36Sopenharmony_ci } 320462306a36Sopenharmony_ci 320562306a36Sopenharmony_cifound: 320662306a36Sopenharmony_ci /* Put them into a private list first because mem_map is not up yet */ 320762306a36Sopenharmony_ci INIT_LIST_HEAD(&m->list); 320862306a36Sopenharmony_ci list_add(&m->list, &huge_boot_pages); 320962306a36Sopenharmony_ci m->hstate = h; 321062306a36Sopenharmony_ci return 1; 321162306a36Sopenharmony_ci} 321262306a36Sopenharmony_ci 321362306a36Sopenharmony_ci/* 321462306a36Sopenharmony_ci * Put bootmem huge pages into the standard lists after mem_map is up. 
321562306a36Sopenharmony_ci * Note: This only applies to gigantic (order > MAX_ORDER) pages. 321662306a36Sopenharmony_ci */ 321762306a36Sopenharmony_cistatic void __init gather_bootmem_prealloc(void) 321862306a36Sopenharmony_ci{ 321962306a36Sopenharmony_ci struct huge_bootmem_page *m; 322062306a36Sopenharmony_ci 322162306a36Sopenharmony_ci list_for_each_entry(m, &huge_boot_pages, list) { 322262306a36Sopenharmony_ci struct page *page = virt_to_page(m); 322362306a36Sopenharmony_ci struct folio *folio = page_folio(page); 322462306a36Sopenharmony_ci struct hstate *h = m->hstate; 322562306a36Sopenharmony_ci 322662306a36Sopenharmony_ci VM_BUG_ON(!hstate_is_gigantic(h)); 322762306a36Sopenharmony_ci WARN_ON(folio_ref_count(folio) != 1); 322862306a36Sopenharmony_ci if (prep_compound_gigantic_folio(folio, huge_page_order(h))) { 322962306a36Sopenharmony_ci WARN_ON(folio_test_reserved(folio)); 323062306a36Sopenharmony_ci prep_new_hugetlb_folio(h, folio, folio_nid(folio)); 323162306a36Sopenharmony_ci free_huge_folio(folio); /* add to the hugepage allocator */ 323262306a36Sopenharmony_ci } else { 323362306a36Sopenharmony_ci /* VERY unlikely inflated ref count on a tail page */ 323462306a36Sopenharmony_ci free_gigantic_folio(folio, huge_page_order(h)); 323562306a36Sopenharmony_ci } 323662306a36Sopenharmony_ci 323762306a36Sopenharmony_ci /* 323862306a36Sopenharmony_ci * We need to restore the 'stolen' pages to totalram_pages 323962306a36Sopenharmony_ci * in order to fix confusing memory reports from free(1) and 324062306a36Sopenharmony_ci * other side-effects, like CommitLimit going negative. 324162306a36Sopenharmony_ci */ 324262306a36Sopenharmony_ci adjust_managed_page_count(page, pages_per_huge_page(h)); 324362306a36Sopenharmony_ci cond_resched(); 324462306a36Sopenharmony_ci } 324562306a36Sopenharmony_ci} 324662306a36Sopenharmony_cistatic void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) 324762306a36Sopenharmony_ci{ 324862306a36Sopenharmony_ci unsigned long i; 324962306a36Sopenharmony_ci char buf[32]; 325062306a36Sopenharmony_ci 325162306a36Sopenharmony_ci for (i = 0; i < h->max_huge_pages_node[nid]; ++i) { 325262306a36Sopenharmony_ci if (hstate_is_gigantic(h)) { 325362306a36Sopenharmony_ci if (!alloc_bootmem_huge_page(h, nid)) 325462306a36Sopenharmony_ci break; 325562306a36Sopenharmony_ci } else { 325662306a36Sopenharmony_ci struct folio *folio; 325762306a36Sopenharmony_ci gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 325862306a36Sopenharmony_ci 325962306a36Sopenharmony_ci folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, 326062306a36Sopenharmony_ci &node_states[N_MEMORY], NULL); 326162306a36Sopenharmony_ci if (!folio) 326262306a36Sopenharmony_ci break; 326362306a36Sopenharmony_ci free_huge_folio(folio); /* free it into the hugepage allocator */ 326462306a36Sopenharmony_ci } 326562306a36Sopenharmony_ci cond_resched(); 326662306a36Sopenharmony_ci } 326762306a36Sopenharmony_ci if (i == h->max_huge_pages_node[nid]) 326862306a36Sopenharmony_ci return; 326962306a36Sopenharmony_ci 327062306a36Sopenharmony_ci string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 327162306a36Sopenharmony_ci pr_warn("HugeTLB: allocating %u of page size %s failed node%d. 
Only allocated %lu hugepages.\n", 327262306a36Sopenharmony_ci h->max_huge_pages_node[nid], buf, nid, i); 327362306a36Sopenharmony_ci h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); 327462306a36Sopenharmony_ci h->max_huge_pages_node[nid] = i; 327562306a36Sopenharmony_ci} 327662306a36Sopenharmony_ci 327762306a36Sopenharmony_cistatic void __init hugetlb_hstate_alloc_pages(struct hstate *h) 327862306a36Sopenharmony_ci{ 327962306a36Sopenharmony_ci unsigned long i; 328062306a36Sopenharmony_ci nodemask_t *node_alloc_noretry; 328162306a36Sopenharmony_ci bool node_specific_alloc = false; 328262306a36Sopenharmony_ci 328362306a36Sopenharmony_ci /* skip gigantic hugepages allocation if hugetlb_cma enabled */ 328462306a36Sopenharmony_ci if (hstate_is_gigantic(h) && hugetlb_cma_size) { 328562306a36Sopenharmony_ci pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); 328662306a36Sopenharmony_ci return; 328762306a36Sopenharmony_ci } 328862306a36Sopenharmony_ci 328962306a36Sopenharmony_ci /* do node specific alloc */ 329062306a36Sopenharmony_ci for_each_online_node(i) { 329162306a36Sopenharmony_ci if (h->max_huge_pages_node[i] > 0) { 329262306a36Sopenharmony_ci hugetlb_hstate_alloc_pages_onenode(h, i); 329362306a36Sopenharmony_ci node_specific_alloc = true; 329462306a36Sopenharmony_ci } 329562306a36Sopenharmony_ci } 329662306a36Sopenharmony_ci 329762306a36Sopenharmony_ci if (node_specific_alloc) 329862306a36Sopenharmony_ci return; 329962306a36Sopenharmony_ci 330062306a36Sopenharmony_ci /* below will do all node balanced alloc */ 330162306a36Sopenharmony_ci if (!hstate_is_gigantic(h)) { 330262306a36Sopenharmony_ci /* 330362306a36Sopenharmony_ci * Bit mask controlling how hard we retry per-node allocations. 330462306a36Sopenharmony_ci * Ignore errors as lower level routines can deal with 330562306a36Sopenharmony_ci * node_alloc_noretry == NULL. If this kmalloc fails at boot 330662306a36Sopenharmony_ci * time, we are likely in bigger trouble. 330762306a36Sopenharmony_ci */ 330862306a36Sopenharmony_ci node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), 330962306a36Sopenharmony_ci GFP_KERNEL); 331062306a36Sopenharmony_ci } else { 331162306a36Sopenharmony_ci /* allocations done at boot time */ 331262306a36Sopenharmony_ci node_alloc_noretry = NULL; 331362306a36Sopenharmony_ci } 331462306a36Sopenharmony_ci 331562306a36Sopenharmony_ci /* bit mask controlling how hard we retry per-node allocations */ 331662306a36Sopenharmony_ci if (node_alloc_noretry) 331762306a36Sopenharmony_ci nodes_clear(*node_alloc_noretry); 331862306a36Sopenharmony_ci 331962306a36Sopenharmony_ci for (i = 0; i < h->max_huge_pages; ++i) { 332062306a36Sopenharmony_ci if (hstate_is_gigantic(h)) { 332162306a36Sopenharmony_ci if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) 332262306a36Sopenharmony_ci break; 332362306a36Sopenharmony_ci } else if (!alloc_pool_huge_page(h, 332462306a36Sopenharmony_ci &node_states[N_MEMORY], 332562306a36Sopenharmony_ci node_alloc_noretry)) 332662306a36Sopenharmony_ci break; 332762306a36Sopenharmony_ci cond_resched(); 332862306a36Sopenharmony_ci } 332962306a36Sopenharmony_ci if (i < h->max_huge_pages) { 333062306a36Sopenharmony_ci char buf[32]; 333162306a36Sopenharmony_ci 333262306a36Sopenharmony_ci string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 333362306a36Sopenharmony_ci pr_warn("HugeTLB: allocating %lu of page size %s failed. 
Only allocated %lu hugepages.\n", 333462306a36Sopenharmony_ci h->max_huge_pages, buf, i); 333562306a36Sopenharmony_ci h->max_huge_pages = i; 333662306a36Sopenharmony_ci } 333762306a36Sopenharmony_ci kfree(node_alloc_noretry); 333862306a36Sopenharmony_ci} 333962306a36Sopenharmony_ci 334062306a36Sopenharmony_cistatic void __init hugetlb_init_hstates(void) 334162306a36Sopenharmony_ci{ 334262306a36Sopenharmony_ci struct hstate *h, *h2; 334362306a36Sopenharmony_ci 334462306a36Sopenharmony_ci for_each_hstate(h) { 334562306a36Sopenharmony_ci /* oversize hugepages were init'ed in early boot */ 334662306a36Sopenharmony_ci if (!hstate_is_gigantic(h)) 334762306a36Sopenharmony_ci hugetlb_hstate_alloc_pages(h); 334862306a36Sopenharmony_ci 334962306a36Sopenharmony_ci /* 335062306a36Sopenharmony_ci * Set demote order for each hstate. Note that 335162306a36Sopenharmony_ci * h->demote_order is initially 0. 335262306a36Sopenharmony_ci * - We can not demote gigantic pages if runtime freeing 335362306a36Sopenharmony_ci * is not supported, so skip this. 335462306a36Sopenharmony_ci * - If CMA allocation is possible, we can not demote 335562306a36Sopenharmony_ci * HUGETLB_PAGE_ORDER or smaller size pages. 335662306a36Sopenharmony_ci */ 335762306a36Sopenharmony_ci if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 335862306a36Sopenharmony_ci continue; 335962306a36Sopenharmony_ci if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER) 336062306a36Sopenharmony_ci continue; 336162306a36Sopenharmony_ci for_each_hstate(h2) { 336262306a36Sopenharmony_ci if (h2 == h) 336362306a36Sopenharmony_ci continue; 336462306a36Sopenharmony_ci if (h2->order < h->order && 336562306a36Sopenharmony_ci h2->order > h->demote_order) 336662306a36Sopenharmony_ci h->demote_order = h2->order; 336762306a36Sopenharmony_ci } 336862306a36Sopenharmony_ci } 336962306a36Sopenharmony_ci} 337062306a36Sopenharmony_ci 337162306a36Sopenharmony_cistatic void __init report_hugepages(void) 337262306a36Sopenharmony_ci{ 337362306a36Sopenharmony_ci struct hstate *h; 337462306a36Sopenharmony_ci 337562306a36Sopenharmony_ci for_each_hstate(h) { 337662306a36Sopenharmony_ci char buf[32]; 337762306a36Sopenharmony_ci 337862306a36Sopenharmony_ci string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 337962306a36Sopenharmony_ci pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", 338062306a36Sopenharmony_ci buf, h->free_huge_pages); 338162306a36Sopenharmony_ci pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", 338262306a36Sopenharmony_ci hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); 338362306a36Sopenharmony_ci } 338462306a36Sopenharmony_ci} 338562306a36Sopenharmony_ci 338662306a36Sopenharmony_ci#ifdef CONFIG_HIGHMEM 338762306a36Sopenharmony_cistatic void try_to_free_low(struct hstate *h, unsigned long count, 338862306a36Sopenharmony_ci nodemask_t *nodes_allowed) 338962306a36Sopenharmony_ci{ 339062306a36Sopenharmony_ci int i; 339162306a36Sopenharmony_ci LIST_HEAD(page_list); 339262306a36Sopenharmony_ci 339362306a36Sopenharmony_ci lockdep_assert_held(&hugetlb_lock); 339462306a36Sopenharmony_ci if (hstate_is_gigantic(h)) 339562306a36Sopenharmony_ci return; 339662306a36Sopenharmony_ci 339762306a36Sopenharmony_ci /* 339862306a36Sopenharmony_ci * Collect pages to be freed on a list, and free after dropping lock 339962306a36Sopenharmony_ci */ 340062306a36Sopenharmony_ci for_each_node_mask(i, *nodes_allowed) { 340162306a36Sopenharmony_ci struct page *page, *next; 340262306a36Sopenharmony_ci struct list_head *freel = 
&h->hugepage_freelists[i]; 340362306a36Sopenharmony_ci list_for_each_entry_safe(page, next, freel, lru) { 340462306a36Sopenharmony_ci if (count >= h->nr_huge_pages) 340562306a36Sopenharmony_ci goto out; 340662306a36Sopenharmony_ci if (PageHighMem(page)) 340762306a36Sopenharmony_ci continue; 340862306a36Sopenharmony_ci remove_hugetlb_folio(h, page_folio(page), false); 340962306a36Sopenharmony_ci list_add(&page->lru, &page_list); 341062306a36Sopenharmony_ci } 341162306a36Sopenharmony_ci } 341262306a36Sopenharmony_ci 341362306a36Sopenharmony_ciout: 341462306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 341562306a36Sopenharmony_ci update_and_free_pages_bulk(h, &page_list); 341662306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 341762306a36Sopenharmony_ci} 341862306a36Sopenharmony_ci#else 341962306a36Sopenharmony_cistatic inline void try_to_free_low(struct hstate *h, unsigned long count, 342062306a36Sopenharmony_ci nodemask_t *nodes_allowed) 342162306a36Sopenharmony_ci{ 342262306a36Sopenharmony_ci} 342362306a36Sopenharmony_ci#endif 342462306a36Sopenharmony_ci 342562306a36Sopenharmony_ci/* 342662306a36Sopenharmony_ci * Increment or decrement surplus_huge_pages. Keep node-specific counters 342762306a36Sopenharmony_ci * balanced by operating on them in a round-robin fashion. 342862306a36Sopenharmony_ci * Returns 1 if an adjustment was made. 342962306a36Sopenharmony_ci */ 343062306a36Sopenharmony_cistatic int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 343162306a36Sopenharmony_ci int delta) 343262306a36Sopenharmony_ci{ 343362306a36Sopenharmony_ci int nr_nodes, node; 343462306a36Sopenharmony_ci 343562306a36Sopenharmony_ci lockdep_assert_held(&hugetlb_lock); 343662306a36Sopenharmony_ci VM_BUG_ON(delta != -1 && delta != 1); 343762306a36Sopenharmony_ci 343862306a36Sopenharmony_ci if (delta < 0) { 343962306a36Sopenharmony_ci for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 344062306a36Sopenharmony_ci if (h->surplus_huge_pages_node[node]) 344162306a36Sopenharmony_ci goto found; 344262306a36Sopenharmony_ci } 344362306a36Sopenharmony_ci } else { 344462306a36Sopenharmony_ci for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 344562306a36Sopenharmony_ci if (h->surplus_huge_pages_node[node] < 344662306a36Sopenharmony_ci h->nr_huge_pages_node[node]) 344762306a36Sopenharmony_ci goto found; 344862306a36Sopenharmony_ci } 344962306a36Sopenharmony_ci } 345062306a36Sopenharmony_ci return 0; 345162306a36Sopenharmony_ci 345262306a36Sopenharmony_cifound: 345362306a36Sopenharmony_ci h->surplus_huge_pages += delta; 345462306a36Sopenharmony_ci h->surplus_huge_pages_node[node] += delta; 345562306a36Sopenharmony_ci return 1; 345662306a36Sopenharmony_ci} 345762306a36Sopenharmony_ci 345862306a36Sopenharmony_ci#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 345962306a36Sopenharmony_cistatic int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, 346062306a36Sopenharmony_ci nodemask_t *nodes_allowed) 346162306a36Sopenharmony_ci{ 346262306a36Sopenharmony_ci unsigned long min_count, ret; 346362306a36Sopenharmony_ci struct page *page; 346462306a36Sopenharmony_ci LIST_HEAD(page_list); 346562306a36Sopenharmony_ci NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); 346662306a36Sopenharmony_ci 346762306a36Sopenharmony_ci /* 346862306a36Sopenharmony_ci * Bit mask controlling how hard we retry per-node allocations. 
346962306a36Sopenharmony_ci * If we cannot allocate the bit mask, do not attempt to allocate
347062306a36Sopenharmony_ci * the requested huge pages.
347162306a36Sopenharmony_ci */
347262306a36Sopenharmony_ci if (node_alloc_noretry)
347362306a36Sopenharmony_ci nodes_clear(*node_alloc_noretry);
347462306a36Sopenharmony_ci else
347562306a36Sopenharmony_ci return -ENOMEM;
347662306a36Sopenharmony_ci
347762306a36Sopenharmony_ci /*
347862306a36Sopenharmony_ci * resize_lock mutex prevents concurrent adjustments to number of
347962306a36Sopenharmony_ci * pages in hstate via the proc/sysfs interfaces.
348062306a36Sopenharmony_ci */
348162306a36Sopenharmony_ci mutex_lock(&h->resize_lock);
348262306a36Sopenharmony_ci flush_free_hpage_work(h);
348362306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock);
348462306a36Sopenharmony_ci
348562306a36Sopenharmony_ci /*
348662306a36Sopenharmony_ci * Check for a node specific request.
348762306a36Sopenharmony_ci * Changing node specific huge page count may require a corresponding
348862306a36Sopenharmony_ci * change to the global count. In any case, the passed node mask
348962306a36Sopenharmony_ci * (nodes_allowed) will restrict alloc/free to the specified node.
349062306a36Sopenharmony_ci */
349162306a36Sopenharmony_ci if (nid != NUMA_NO_NODE) {
349262306a36Sopenharmony_ci unsigned long old_count = count;
349362306a36Sopenharmony_ci
349462306a36Sopenharmony_ci count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
349562306a36Sopenharmony_ci /*
349662306a36Sopenharmony_ci * User may have specified a large count value which caused the
349762306a36Sopenharmony_ci * above calculation to overflow. In this case, they wanted
349862306a36Sopenharmony_ci * to allocate as many huge pages as possible. Set count to
349962306a36Sopenharmony_ci * largest possible value to align with their intention.
350062306a36Sopenharmony_ci */
350162306a36Sopenharmony_ci if (count < old_count)
350262306a36Sopenharmony_ci count = ULONG_MAX;
350362306a36Sopenharmony_ci }
350462306a36Sopenharmony_ci
350562306a36Sopenharmony_ci /*
350662306a36Sopenharmony_ci * Runtime allocation of gigantic pages depends on the capability for
350762306a36Sopenharmony_ci * large page range allocation.
350862306a36Sopenharmony_ci * If the system does not provide this feature, return an error when
350962306a36Sopenharmony_ci * the user tries to allocate gigantic pages but let the user free the
351062306a36Sopenharmony_ci * boot-time allocated gigantic pages.
351162306a36Sopenharmony_ci */
351262306a36Sopenharmony_ci if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
351362306a36Sopenharmony_ci if (count > persistent_huge_pages(h)) {
351462306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock);
351562306a36Sopenharmony_ci mutex_unlock(&h->resize_lock);
351662306a36Sopenharmony_ci NODEMASK_FREE(node_alloc_noretry);
351762306a36Sopenharmony_ci return -EINVAL;
351862306a36Sopenharmony_ci }
351962306a36Sopenharmony_ci /* Fall through to decrease pool */
352062306a36Sopenharmony_ci }
352162306a36Sopenharmony_ci
352262306a36Sopenharmony_ci /*
352362306a36Sopenharmony_ci * Increase the pool size
352462306a36Sopenharmony_ci * First take pages out of surplus state. Then make up the
352562306a36Sopenharmony_ci * remaining difference by allocating fresh huge pages.
352662306a36Sopenharmony_ci *
352762306a36Sopenharmony_ci * We might race with alloc_surplus_hugetlb_folio() here and be unable
352862306a36Sopenharmony_ci * to convert a surplus huge page to a normal huge page.
That is 352962306a36Sopenharmony_ci * not critical, though, it just means the overall size of the 353062306a36Sopenharmony_ci * pool might be one hugepage larger than it needs to be, but 353162306a36Sopenharmony_ci * within all the constraints specified by the sysctls. 353262306a36Sopenharmony_ci */ 353362306a36Sopenharmony_ci while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 353462306a36Sopenharmony_ci if (!adjust_pool_surplus(h, nodes_allowed, -1)) 353562306a36Sopenharmony_ci break; 353662306a36Sopenharmony_ci } 353762306a36Sopenharmony_ci 353862306a36Sopenharmony_ci while (count > persistent_huge_pages(h)) { 353962306a36Sopenharmony_ci /* 354062306a36Sopenharmony_ci * If this allocation races such that we no longer need the 354162306a36Sopenharmony_ci * page, free_huge_folio will handle it by freeing the page 354262306a36Sopenharmony_ci * and reducing the surplus. 354362306a36Sopenharmony_ci */ 354462306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 354562306a36Sopenharmony_ci 354662306a36Sopenharmony_ci /* yield cpu to avoid soft lockup */ 354762306a36Sopenharmony_ci cond_resched(); 354862306a36Sopenharmony_ci 354962306a36Sopenharmony_ci ret = alloc_pool_huge_page(h, nodes_allowed, 355062306a36Sopenharmony_ci node_alloc_noretry); 355162306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 355262306a36Sopenharmony_ci if (!ret) 355362306a36Sopenharmony_ci goto out; 355462306a36Sopenharmony_ci 355562306a36Sopenharmony_ci /* Bail for signals. Probably ctrl-c from user */ 355662306a36Sopenharmony_ci if (signal_pending(current)) 355762306a36Sopenharmony_ci goto out; 355862306a36Sopenharmony_ci } 355962306a36Sopenharmony_ci 356062306a36Sopenharmony_ci /* 356162306a36Sopenharmony_ci * Decrease the pool size 356262306a36Sopenharmony_ci * First return free pages to the buddy allocator (being careful 356362306a36Sopenharmony_ci * to keep enough around to satisfy reservations). Then place 356462306a36Sopenharmony_ci * pages into surplus state as needed so the pool will shrink 356562306a36Sopenharmony_ci * to the desired size as pages become free. 356662306a36Sopenharmony_ci * 356762306a36Sopenharmony_ci * By placing pages into the surplus state independent of the 356862306a36Sopenharmony_ci * overcommit value, we are allowing the surplus pool size to 356962306a36Sopenharmony_ci * exceed overcommit. There are few sane options here. Since 357062306a36Sopenharmony_ci * alloc_surplus_hugetlb_folio() is checking the global counter, 357162306a36Sopenharmony_ci * though, we'll note that we're not allowed to exceed surplus 357262306a36Sopenharmony_ci * and won't grow the pool anywhere else. Not until one of the 357362306a36Sopenharmony_ci * sysctls are changed, or the surplus pages go out of use. 
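 * As a worked example (numbers are illustrative only): with
 * nr_huge_pages = 10, free_huge_pages = 4 and resv_huge_pages = 2,
 * min_count below becomes 2 + 10 - 4 = 8; 6 pages are in use and 2 more
 * must remain to honour reservations, so the pool is not shrunk below 8
 * even if a smaller count was requested.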
357462306a36Sopenharmony_ci */
357562306a36Sopenharmony_ci min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
357662306a36Sopenharmony_ci min_count = max(count, min_count);
357762306a36Sopenharmony_ci try_to_free_low(h, min_count, nodes_allowed);
357862306a36Sopenharmony_ci
357962306a36Sopenharmony_ci /*
358062306a36Sopenharmony_ci * Collect pages to be removed on list without dropping lock
358162306a36Sopenharmony_ci */
358262306a36Sopenharmony_ci while (min_count < persistent_huge_pages(h)) {
358362306a36Sopenharmony_ci page = remove_pool_huge_page(h, nodes_allowed, 0);
358462306a36Sopenharmony_ci if (!page)
358562306a36Sopenharmony_ci break;
358662306a36Sopenharmony_ci
358762306a36Sopenharmony_ci list_add(&page->lru, &page_list);
358862306a36Sopenharmony_ci }
358962306a36Sopenharmony_ci /* free the pages after dropping lock */
359062306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock);
359162306a36Sopenharmony_ci update_and_free_pages_bulk(h, &page_list);
359262306a36Sopenharmony_ci flush_free_hpage_work(h);
359362306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock);
359462306a36Sopenharmony_ci
359562306a36Sopenharmony_ci while (count < persistent_huge_pages(h)) {
359662306a36Sopenharmony_ci if (!adjust_pool_surplus(h, nodes_allowed, 1))
359762306a36Sopenharmony_ci break;
359862306a36Sopenharmony_ci }
359962306a36Sopenharmony_ciout:
360062306a36Sopenharmony_ci h->max_huge_pages = persistent_huge_pages(h);
360162306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock);
360262306a36Sopenharmony_ci mutex_unlock(&h->resize_lock);
360362306a36Sopenharmony_ci
360462306a36Sopenharmony_ci NODEMASK_FREE(node_alloc_noretry);
360562306a36Sopenharmony_ci
360662306a36Sopenharmony_ci return 0;
360762306a36Sopenharmony_ci}
360862306a36Sopenharmony_ci
360962306a36Sopenharmony_cistatic int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
361062306a36Sopenharmony_ci{
361162306a36Sopenharmony_ci int i, nid = folio_nid(folio);
361262306a36Sopenharmony_ci struct hstate *target_hstate;
361362306a36Sopenharmony_ci struct page *subpage;
361462306a36Sopenharmony_ci struct folio *inner_folio;
361562306a36Sopenharmony_ci int rc = 0;
361662306a36Sopenharmony_ci
361762306a36Sopenharmony_ci target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
361862306a36Sopenharmony_ci
361962306a36Sopenharmony_ci remove_hugetlb_folio_for_demote(h, folio, false);
362062306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock);
362162306a36Sopenharmony_ci
362262306a36Sopenharmony_ci rc = hugetlb_vmemmap_restore(h, &folio->page);
362362306a36Sopenharmony_ci if (rc) {
362462306a36Sopenharmony_ci /* Allocation of vmemmap failed, we cannot demote the folio */
362562306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock);
362662306a36Sopenharmony_ci folio_ref_unfreeze(folio, 1);
362762306a36Sopenharmony_ci add_hugetlb_folio(h, folio, false);
362862306a36Sopenharmony_ci return rc;
362962306a36Sopenharmony_ci }
363062306a36Sopenharmony_ci
363162306a36Sopenharmony_ci /*
363262306a36Sopenharmony_ci * Use destroy_compound_hugetlb_folio_for_demote for all huge page
363362306a36Sopenharmony_ci * sizes as it will not ref count folios.
363462306a36Sopenharmony_ci */
363562306a36Sopenharmony_ci destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h));
363662306a36Sopenharmony_ci
363762306a36Sopenharmony_ci /*
363862306a36Sopenharmony_ci * Taking target hstate mutex synchronizes with set_max_huge_pages.
363962306a36Sopenharmony_ci * Without the mutex, pages added to target hstate could be marked 364062306a36Sopenharmony_ci * as surplus. 364162306a36Sopenharmony_ci * 364262306a36Sopenharmony_ci * Note that we already hold h->resize_lock. To prevent deadlock, 364362306a36Sopenharmony_ci * use the convention of always taking larger size hstate mutex first. 364462306a36Sopenharmony_ci */ 364562306a36Sopenharmony_ci mutex_lock(&target_hstate->resize_lock); 364662306a36Sopenharmony_ci for (i = 0; i < pages_per_huge_page(h); 364762306a36Sopenharmony_ci i += pages_per_huge_page(target_hstate)) { 364862306a36Sopenharmony_ci subpage = folio_page(folio, i); 364962306a36Sopenharmony_ci inner_folio = page_folio(subpage); 365062306a36Sopenharmony_ci if (hstate_is_gigantic(target_hstate)) 365162306a36Sopenharmony_ci prep_compound_gigantic_folio_for_demote(inner_folio, 365262306a36Sopenharmony_ci target_hstate->order); 365362306a36Sopenharmony_ci else 365462306a36Sopenharmony_ci prep_compound_page(subpage, target_hstate->order); 365562306a36Sopenharmony_ci folio_change_private(inner_folio, NULL); 365662306a36Sopenharmony_ci prep_new_hugetlb_folio(target_hstate, inner_folio, nid); 365762306a36Sopenharmony_ci free_huge_folio(inner_folio); 365862306a36Sopenharmony_ci } 365962306a36Sopenharmony_ci mutex_unlock(&target_hstate->resize_lock); 366062306a36Sopenharmony_ci 366162306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 366262306a36Sopenharmony_ci 366362306a36Sopenharmony_ci /* 366462306a36Sopenharmony_ci * Not absolutely necessary, but for consistency update max_huge_pages 366562306a36Sopenharmony_ci * based on pool changes for the demoted page. 366662306a36Sopenharmony_ci */ 366762306a36Sopenharmony_ci h->max_huge_pages--; 366862306a36Sopenharmony_ci target_hstate->max_huge_pages += 366962306a36Sopenharmony_ci pages_per_huge_page(h) / pages_per_huge_page(target_hstate); 367062306a36Sopenharmony_ci 367162306a36Sopenharmony_ci return rc; 367262306a36Sopenharmony_ci} 367362306a36Sopenharmony_ci 367462306a36Sopenharmony_cistatic int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) 367562306a36Sopenharmony_ci __must_hold(&hugetlb_lock) 367662306a36Sopenharmony_ci{ 367762306a36Sopenharmony_ci int nr_nodes, node; 367862306a36Sopenharmony_ci struct folio *folio; 367962306a36Sopenharmony_ci 368062306a36Sopenharmony_ci lockdep_assert_held(&hugetlb_lock); 368162306a36Sopenharmony_ci 368262306a36Sopenharmony_ci /* We should never get here if no demote order */ 368362306a36Sopenharmony_ci if (!h->demote_order) { 368462306a36Sopenharmony_ci pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); 368562306a36Sopenharmony_ci return -EINVAL; /* internal error */ 368662306a36Sopenharmony_ci } 368762306a36Sopenharmony_ci 368862306a36Sopenharmony_ci for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 368962306a36Sopenharmony_ci list_for_each_entry(folio, &h->hugepage_freelists[node], lru) { 369062306a36Sopenharmony_ci if (folio_test_hwpoison(folio)) 369162306a36Sopenharmony_ci continue; 369262306a36Sopenharmony_ci return demote_free_hugetlb_folio(h, folio); 369362306a36Sopenharmony_ci } 369462306a36Sopenharmony_ci } 369562306a36Sopenharmony_ci 369662306a36Sopenharmony_ci /* 369762306a36Sopenharmony_ci * Only way to get here is if all pages on free lists are poisoned. 369862306a36Sopenharmony_ci * Return -EBUSY so that caller will not retry. 
369962306a36Sopenharmony_ci */ 370062306a36Sopenharmony_ci return -EBUSY; 370162306a36Sopenharmony_ci} 370262306a36Sopenharmony_ci 370362306a36Sopenharmony_ci#define HSTATE_ATTR_RO(_name) \ 370462306a36Sopenharmony_ci static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 370562306a36Sopenharmony_ci 370662306a36Sopenharmony_ci#define HSTATE_ATTR_WO(_name) \ 370762306a36Sopenharmony_ci static struct kobj_attribute _name##_attr = __ATTR_WO(_name) 370862306a36Sopenharmony_ci 370962306a36Sopenharmony_ci#define HSTATE_ATTR(_name) \ 371062306a36Sopenharmony_ci static struct kobj_attribute _name##_attr = __ATTR_RW(_name) 371162306a36Sopenharmony_ci 371262306a36Sopenharmony_cistatic struct kobject *hugepages_kobj; 371362306a36Sopenharmony_cistatic struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 371462306a36Sopenharmony_ci 371562306a36Sopenharmony_cistatic struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); 371662306a36Sopenharmony_ci 371762306a36Sopenharmony_cistatic struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) 371862306a36Sopenharmony_ci{ 371962306a36Sopenharmony_ci int i; 372062306a36Sopenharmony_ci 372162306a36Sopenharmony_ci for (i = 0; i < HUGE_MAX_HSTATE; i++) 372262306a36Sopenharmony_ci if (hstate_kobjs[i] == kobj) { 372362306a36Sopenharmony_ci if (nidp) 372462306a36Sopenharmony_ci *nidp = NUMA_NO_NODE; 372562306a36Sopenharmony_ci return &hstates[i]; 372662306a36Sopenharmony_ci } 372762306a36Sopenharmony_ci 372862306a36Sopenharmony_ci return kobj_to_node_hstate(kobj, nidp); 372962306a36Sopenharmony_ci} 373062306a36Sopenharmony_ci 373162306a36Sopenharmony_cistatic ssize_t nr_hugepages_show_common(struct kobject *kobj, 373262306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 373362306a36Sopenharmony_ci{ 373462306a36Sopenharmony_ci struct hstate *h; 373562306a36Sopenharmony_ci unsigned long nr_huge_pages; 373662306a36Sopenharmony_ci int nid; 373762306a36Sopenharmony_ci 373862306a36Sopenharmony_ci h = kobj_to_hstate(kobj, &nid); 373962306a36Sopenharmony_ci if (nid == NUMA_NO_NODE) 374062306a36Sopenharmony_ci nr_huge_pages = h->nr_huge_pages; 374162306a36Sopenharmony_ci else 374262306a36Sopenharmony_ci nr_huge_pages = h->nr_huge_pages_node[nid]; 374362306a36Sopenharmony_ci 374462306a36Sopenharmony_ci return sysfs_emit(buf, "%lu\n", nr_huge_pages); 374562306a36Sopenharmony_ci} 374662306a36Sopenharmony_ci 374762306a36Sopenharmony_cistatic ssize_t __nr_hugepages_store_common(bool obey_mempolicy, 374862306a36Sopenharmony_ci struct hstate *h, int nid, 374962306a36Sopenharmony_ci unsigned long count, size_t len) 375062306a36Sopenharmony_ci{ 375162306a36Sopenharmony_ci int err; 375262306a36Sopenharmony_ci nodemask_t nodes_allowed, *n_mask; 375362306a36Sopenharmony_ci 375462306a36Sopenharmony_ci if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 375562306a36Sopenharmony_ci return -EINVAL; 375662306a36Sopenharmony_ci 375762306a36Sopenharmony_ci if (nid == NUMA_NO_NODE) { 375862306a36Sopenharmony_ci /* 375962306a36Sopenharmony_ci * global hstate attribute 376062306a36Sopenharmony_ci */ 376162306a36Sopenharmony_ci if (!(obey_mempolicy && 376262306a36Sopenharmony_ci init_nodemask_of_mempolicy(&nodes_allowed))) 376362306a36Sopenharmony_ci n_mask = &node_states[N_MEMORY]; 376462306a36Sopenharmony_ci else 376562306a36Sopenharmony_ci n_mask = &nodes_allowed; 376662306a36Sopenharmony_ci } else { 376762306a36Sopenharmony_ci /* 376862306a36Sopenharmony_ci * Node specific request. 
count adjustment happens in 376962306a36Sopenharmony_ci * set_max_huge_pages() after acquiring hugetlb_lock. 377062306a36Sopenharmony_ci */ 377162306a36Sopenharmony_ci init_nodemask_of_node(&nodes_allowed, nid); 377262306a36Sopenharmony_ci n_mask = &nodes_allowed; 377362306a36Sopenharmony_ci } 377462306a36Sopenharmony_ci 377562306a36Sopenharmony_ci err = set_max_huge_pages(h, count, nid, n_mask); 377662306a36Sopenharmony_ci 377762306a36Sopenharmony_ci return err ? err : len; 377862306a36Sopenharmony_ci} 377962306a36Sopenharmony_ci 378062306a36Sopenharmony_cistatic ssize_t nr_hugepages_store_common(bool obey_mempolicy, 378162306a36Sopenharmony_ci struct kobject *kobj, const char *buf, 378262306a36Sopenharmony_ci size_t len) 378362306a36Sopenharmony_ci{ 378462306a36Sopenharmony_ci struct hstate *h; 378562306a36Sopenharmony_ci unsigned long count; 378662306a36Sopenharmony_ci int nid; 378762306a36Sopenharmony_ci int err; 378862306a36Sopenharmony_ci 378962306a36Sopenharmony_ci err = kstrtoul(buf, 10, &count); 379062306a36Sopenharmony_ci if (err) 379162306a36Sopenharmony_ci return err; 379262306a36Sopenharmony_ci 379362306a36Sopenharmony_ci h = kobj_to_hstate(kobj, &nid); 379462306a36Sopenharmony_ci return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); 379562306a36Sopenharmony_ci} 379662306a36Sopenharmony_ci 379762306a36Sopenharmony_cistatic ssize_t nr_hugepages_show(struct kobject *kobj, 379862306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 379962306a36Sopenharmony_ci{ 380062306a36Sopenharmony_ci return nr_hugepages_show_common(kobj, attr, buf); 380162306a36Sopenharmony_ci} 380262306a36Sopenharmony_ci 380362306a36Sopenharmony_cistatic ssize_t nr_hugepages_store(struct kobject *kobj, 380462306a36Sopenharmony_ci struct kobj_attribute *attr, const char *buf, size_t len) 380562306a36Sopenharmony_ci{ 380662306a36Sopenharmony_ci return nr_hugepages_store_common(false, kobj, buf, len); 380762306a36Sopenharmony_ci} 380862306a36Sopenharmony_ciHSTATE_ATTR(nr_hugepages); 380962306a36Sopenharmony_ci 381062306a36Sopenharmony_ci#ifdef CONFIG_NUMA 381162306a36Sopenharmony_ci 381262306a36Sopenharmony_ci/* 381362306a36Sopenharmony_ci * hstate attribute for optionally mempolicy-based constraint on persistent 381462306a36Sopenharmony_ci * huge page alloc/free. 
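 * A sketch of the intended use (node, size and count are illustrative,
 * and the hugepages-2048kB directory assumes a 2 MB hstate):
 *   numactl -m 0 sh -c 'echo 512 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages_mempolicy'
 * allocates or frees pages only on the nodes allowed by the caller's
 * mempolicy, whereas a write to plain nr_hugepages is spread over all
 * nodes with memory.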
381562306a36Sopenharmony_ci */ 381662306a36Sopenharmony_cistatic ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, 381762306a36Sopenharmony_ci struct kobj_attribute *attr, 381862306a36Sopenharmony_ci char *buf) 381962306a36Sopenharmony_ci{ 382062306a36Sopenharmony_ci return nr_hugepages_show_common(kobj, attr, buf); 382162306a36Sopenharmony_ci} 382262306a36Sopenharmony_ci 382362306a36Sopenharmony_cistatic ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, 382462306a36Sopenharmony_ci struct kobj_attribute *attr, const char *buf, size_t len) 382562306a36Sopenharmony_ci{ 382662306a36Sopenharmony_ci return nr_hugepages_store_common(true, kobj, buf, len); 382762306a36Sopenharmony_ci} 382862306a36Sopenharmony_ciHSTATE_ATTR(nr_hugepages_mempolicy); 382962306a36Sopenharmony_ci#endif 383062306a36Sopenharmony_ci 383162306a36Sopenharmony_ci 383262306a36Sopenharmony_cistatic ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 383362306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 383462306a36Sopenharmony_ci{ 383562306a36Sopenharmony_ci struct hstate *h = kobj_to_hstate(kobj, NULL); 383662306a36Sopenharmony_ci return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages); 383762306a36Sopenharmony_ci} 383862306a36Sopenharmony_ci 383962306a36Sopenharmony_cistatic ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 384062306a36Sopenharmony_ci struct kobj_attribute *attr, const char *buf, size_t count) 384162306a36Sopenharmony_ci{ 384262306a36Sopenharmony_ci int err; 384362306a36Sopenharmony_ci unsigned long input; 384462306a36Sopenharmony_ci struct hstate *h = kobj_to_hstate(kobj, NULL); 384562306a36Sopenharmony_ci 384662306a36Sopenharmony_ci if (hstate_is_gigantic(h)) 384762306a36Sopenharmony_ci return -EINVAL; 384862306a36Sopenharmony_ci 384962306a36Sopenharmony_ci err = kstrtoul(buf, 10, &input); 385062306a36Sopenharmony_ci if (err) 385162306a36Sopenharmony_ci return err; 385262306a36Sopenharmony_ci 385362306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 385462306a36Sopenharmony_ci h->nr_overcommit_huge_pages = input; 385562306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 385662306a36Sopenharmony_ci 385762306a36Sopenharmony_ci return count; 385862306a36Sopenharmony_ci} 385962306a36Sopenharmony_ciHSTATE_ATTR(nr_overcommit_hugepages); 386062306a36Sopenharmony_ci 386162306a36Sopenharmony_cistatic ssize_t free_hugepages_show(struct kobject *kobj, 386262306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 386362306a36Sopenharmony_ci{ 386462306a36Sopenharmony_ci struct hstate *h; 386562306a36Sopenharmony_ci unsigned long free_huge_pages; 386662306a36Sopenharmony_ci int nid; 386762306a36Sopenharmony_ci 386862306a36Sopenharmony_ci h = kobj_to_hstate(kobj, &nid); 386962306a36Sopenharmony_ci if (nid == NUMA_NO_NODE) 387062306a36Sopenharmony_ci free_huge_pages = h->free_huge_pages; 387162306a36Sopenharmony_ci else 387262306a36Sopenharmony_ci free_huge_pages = h->free_huge_pages_node[nid]; 387362306a36Sopenharmony_ci 387462306a36Sopenharmony_ci return sysfs_emit(buf, "%lu\n", free_huge_pages); 387562306a36Sopenharmony_ci} 387662306a36Sopenharmony_ciHSTATE_ATTR_RO(free_hugepages); 387762306a36Sopenharmony_ci 387862306a36Sopenharmony_cistatic ssize_t resv_hugepages_show(struct kobject *kobj, 387962306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 388062306a36Sopenharmony_ci{ 388162306a36Sopenharmony_ci struct hstate *h = kobj_to_hstate(kobj, NULL); 388262306a36Sopenharmony_ci return sysfs_emit(buf, "%lu\n", h->resv_huge_pages); 
388362306a36Sopenharmony_ci}
388462306a36Sopenharmony_ciHSTATE_ATTR_RO(resv_hugepages);
388562306a36Sopenharmony_ci
388662306a36Sopenharmony_cistatic ssize_t surplus_hugepages_show(struct kobject *kobj,
388762306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf)
388862306a36Sopenharmony_ci{
388962306a36Sopenharmony_ci struct hstate *h;
389062306a36Sopenharmony_ci unsigned long surplus_huge_pages;
389162306a36Sopenharmony_ci int nid;
389262306a36Sopenharmony_ci
389362306a36Sopenharmony_ci h = kobj_to_hstate(kobj, &nid);
389462306a36Sopenharmony_ci if (nid == NUMA_NO_NODE)
389562306a36Sopenharmony_ci surplus_huge_pages = h->surplus_huge_pages;
389662306a36Sopenharmony_ci else
389762306a36Sopenharmony_ci surplus_huge_pages = h->surplus_huge_pages_node[nid];
389862306a36Sopenharmony_ci
389962306a36Sopenharmony_ci return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
390062306a36Sopenharmony_ci}
390162306a36Sopenharmony_ciHSTATE_ATTR_RO(surplus_hugepages);
390262306a36Sopenharmony_ci
390362306a36Sopenharmony_cistatic ssize_t demote_store(struct kobject *kobj,
390462306a36Sopenharmony_ci struct kobj_attribute *attr, const char *buf, size_t len)
390562306a36Sopenharmony_ci{
390662306a36Sopenharmony_ci unsigned long nr_demote;
390762306a36Sopenharmony_ci unsigned long nr_available;
390862306a36Sopenharmony_ci nodemask_t nodes_allowed, *n_mask;
390962306a36Sopenharmony_ci struct hstate *h;
391062306a36Sopenharmony_ci int err;
391162306a36Sopenharmony_ci int nid;
391262306a36Sopenharmony_ci
391362306a36Sopenharmony_ci err = kstrtoul(buf, 10, &nr_demote);
391462306a36Sopenharmony_ci if (err)
391562306a36Sopenharmony_ci return err;
391662306a36Sopenharmony_ci h = kobj_to_hstate(kobj, &nid);
391762306a36Sopenharmony_ci
391862306a36Sopenharmony_ci if (nid != NUMA_NO_NODE) {
391962306a36Sopenharmony_ci init_nodemask_of_node(&nodes_allowed, nid);
392062306a36Sopenharmony_ci n_mask = &nodes_allowed;
392162306a36Sopenharmony_ci } else {
392262306a36Sopenharmony_ci n_mask = &node_states[N_MEMORY];
392362306a36Sopenharmony_ci }
392462306a36Sopenharmony_ci
392562306a36Sopenharmony_ci /* Synchronize with other sysfs operations modifying huge pages */
392662306a36Sopenharmony_ci mutex_lock(&h->resize_lock);
392762306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock);
392862306a36Sopenharmony_ci
392962306a36Sopenharmony_ci while (nr_demote) {
393062306a36Sopenharmony_ci /*
393162306a36Sopenharmony_ci * Check for available pages to demote each time through the
393262306a36Sopenharmony_ci * loop as demote_pool_huge_page will drop hugetlb_lock.
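 * (For reference, an illustrative way this loop is reached from user
 * space, assuming a 1 GB hstate being demoted to 2 MB pages:
 *   echo 2048kB > /sys/kernel/mm/hugepages/hugepages-1048576kB/demote_size
 *   echo 4 > /sys/kernel/mm/hugepages/hugepages-1048576kB/demote
 * where each unit written to 'demote' is one page of this hstate.)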
393362306a36Sopenharmony_ci */ 393462306a36Sopenharmony_ci if (nid != NUMA_NO_NODE) 393562306a36Sopenharmony_ci nr_available = h->free_huge_pages_node[nid]; 393662306a36Sopenharmony_ci else 393762306a36Sopenharmony_ci nr_available = h->free_huge_pages; 393862306a36Sopenharmony_ci nr_available -= h->resv_huge_pages; 393962306a36Sopenharmony_ci if (!nr_available) 394062306a36Sopenharmony_ci break; 394162306a36Sopenharmony_ci 394262306a36Sopenharmony_ci err = demote_pool_huge_page(h, n_mask); 394362306a36Sopenharmony_ci if (err) 394462306a36Sopenharmony_ci break; 394562306a36Sopenharmony_ci 394662306a36Sopenharmony_ci nr_demote--; 394762306a36Sopenharmony_ci } 394862306a36Sopenharmony_ci 394962306a36Sopenharmony_ci spin_unlock_irq(&hugetlb_lock); 395062306a36Sopenharmony_ci mutex_unlock(&h->resize_lock); 395162306a36Sopenharmony_ci 395262306a36Sopenharmony_ci if (err) 395362306a36Sopenharmony_ci return err; 395462306a36Sopenharmony_ci return len; 395562306a36Sopenharmony_ci} 395662306a36Sopenharmony_ciHSTATE_ATTR_WO(demote); 395762306a36Sopenharmony_ci 395862306a36Sopenharmony_cistatic ssize_t demote_size_show(struct kobject *kobj, 395962306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 396062306a36Sopenharmony_ci{ 396162306a36Sopenharmony_ci struct hstate *h = kobj_to_hstate(kobj, NULL); 396262306a36Sopenharmony_ci unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; 396362306a36Sopenharmony_ci 396462306a36Sopenharmony_ci return sysfs_emit(buf, "%lukB\n", demote_size); 396562306a36Sopenharmony_ci} 396662306a36Sopenharmony_ci 396762306a36Sopenharmony_cistatic ssize_t demote_size_store(struct kobject *kobj, 396862306a36Sopenharmony_ci struct kobj_attribute *attr, 396962306a36Sopenharmony_ci const char *buf, size_t count) 397062306a36Sopenharmony_ci{ 397162306a36Sopenharmony_ci struct hstate *h, *demote_hstate; 397262306a36Sopenharmony_ci unsigned long demote_size; 397362306a36Sopenharmony_ci unsigned int demote_order; 397462306a36Sopenharmony_ci 397562306a36Sopenharmony_ci demote_size = (unsigned long)memparse(buf, NULL); 397662306a36Sopenharmony_ci 397762306a36Sopenharmony_ci demote_hstate = size_to_hstate(demote_size); 397862306a36Sopenharmony_ci if (!demote_hstate) 397962306a36Sopenharmony_ci return -EINVAL; 398062306a36Sopenharmony_ci demote_order = demote_hstate->order; 398162306a36Sopenharmony_ci if (demote_order < HUGETLB_PAGE_ORDER) 398262306a36Sopenharmony_ci return -EINVAL; 398362306a36Sopenharmony_ci 398462306a36Sopenharmony_ci /* demote order must be smaller than hstate order */ 398562306a36Sopenharmony_ci h = kobj_to_hstate(kobj, NULL); 398662306a36Sopenharmony_ci if (demote_order >= h->order) 398762306a36Sopenharmony_ci return -EINVAL; 398862306a36Sopenharmony_ci 398962306a36Sopenharmony_ci /* resize_lock synchronizes access to demote size and writes */ 399062306a36Sopenharmony_ci mutex_lock(&h->resize_lock); 399162306a36Sopenharmony_ci h->demote_order = demote_order; 399262306a36Sopenharmony_ci mutex_unlock(&h->resize_lock); 399362306a36Sopenharmony_ci 399462306a36Sopenharmony_ci return count; 399562306a36Sopenharmony_ci} 399662306a36Sopenharmony_ciHSTATE_ATTR(demote_size); 399762306a36Sopenharmony_ci 399862306a36Sopenharmony_cistatic struct attribute *hstate_attrs[] = { 399962306a36Sopenharmony_ci &nr_hugepages_attr.attr, 400062306a36Sopenharmony_ci &nr_overcommit_hugepages_attr.attr, 400162306a36Sopenharmony_ci &free_hugepages_attr.attr, 400262306a36Sopenharmony_ci &resv_hugepages_attr.attr, 400362306a36Sopenharmony_ci &surplus_hugepages_attr.attr, 
400462306a36Sopenharmony_ci#ifdef CONFIG_NUMA 400562306a36Sopenharmony_ci &nr_hugepages_mempolicy_attr.attr, 400662306a36Sopenharmony_ci#endif 400762306a36Sopenharmony_ci NULL, 400862306a36Sopenharmony_ci}; 400962306a36Sopenharmony_ci 401062306a36Sopenharmony_cistatic const struct attribute_group hstate_attr_group = { 401162306a36Sopenharmony_ci .attrs = hstate_attrs, 401262306a36Sopenharmony_ci}; 401362306a36Sopenharmony_ci 401462306a36Sopenharmony_cistatic struct attribute *hstate_demote_attrs[] = { 401562306a36Sopenharmony_ci &demote_size_attr.attr, 401662306a36Sopenharmony_ci &demote_attr.attr, 401762306a36Sopenharmony_ci NULL, 401862306a36Sopenharmony_ci}; 401962306a36Sopenharmony_ci 402062306a36Sopenharmony_cistatic const struct attribute_group hstate_demote_attr_group = { 402162306a36Sopenharmony_ci .attrs = hstate_demote_attrs, 402262306a36Sopenharmony_ci}; 402362306a36Sopenharmony_ci 402462306a36Sopenharmony_cistatic int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 402562306a36Sopenharmony_ci struct kobject **hstate_kobjs, 402662306a36Sopenharmony_ci const struct attribute_group *hstate_attr_group) 402762306a36Sopenharmony_ci{ 402862306a36Sopenharmony_ci int retval; 402962306a36Sopenharmony_ci int hi = hstate_index(h); 403062306a36Sopenharmony_ci 403162306a36Sopenharmony_ci hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 403262306a36Sopenharmony_ci if (!hstate_kobjs[hi]) 403362306a36Sopenharmony_ci return -ENOMEM; 403462306a36Sopenharmony_ci 403562306a36Sopenharmony_ci retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); 403662306a36Sopenharmony_ci if (retval) { 403762306a36Sopenharmony_ci kobject_put(hstate_kobjs[hi]); 403862306a36Sopenharmony_ci hstate_kobjs[hi] = NULL; 403962306a36Sopenharmony_ci return retval; 404062306a36Sopenharmony_ci } 404162306a36Sopenharmony_ci 404262306a36Sopenharmony_ci if (h->demote_order) { 404362306a36Sopenharmony_ci retval = sysfs_create_group(hstate_kobjs[hi], 404462306a36Sopenharmony_ci &hstate_demote_attr_group); 404562306a36Sopenharmony_ci if (retval) { 404662306a36Sopenharmony_ci pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); 404762306a36Sopenharmony_ci sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group); 404862306a36Sopenharmony_ci kobject_put(hstate_kobjs[hi]); 404962306a36Sopenharmony_ci hstate_kobjs[hi] = NULL; 405062306a36Sopenharmony_ci return retval; 405162306a36Sopenharmony_ci } 405262306a36Sopenharmony_ci } 405362306a36Sopenharmony_ci 405462306a36Sopenharmony_ci return 0; 405562306a36Sopenharmony_ci} 405662306a36Sopenharmony_ci 405762306a36Sopenharmony_ci#ifdef CONFIG_NUMA 405862306a36Sopenharmony_cistatic bool hugetlb_sysfs_initialized __ro_after_init; 405962306a36Sopenharmony_ci 406062306a36Sopenharmony_ci/* 406162306a36Sopenharmony_ci * node_hstate/s - associate per node hstate attributes, via their kobjects, 406262306a36Sopenharmony_ci * with node devices in node_devices[] using a parallel array. The array 406362306a36Sopenharmony_ci * index of a node device or _hstate == node id. 406462306a36Sopenharmony_ci * This is here to avoid any static dependency of the node device driver, in 406562306a36Sopenharmony_ci * the base kernel, on the hugetlb module. 
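 * The per node attributes registered here surface under paths such as
 * (node id and page size below are illustrative):
 *   /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
 *   /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages
 *   /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages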
406662306a36Sopenharmony_ci */ 406762306a36Sopenharmony_cistruct node_hstate { 406862306a36Sopenharmony_ci struct kobject *hugepages_kobj; 406962306a36Sopenharmony_ci struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 407062306a36Sopenharmony_ci}; 407162306a36Sopenharmony_cistatic struct node_hstate node_hstates[MAX_NUMNODES]; 407262306a36Sopenharmony_ci 407362306a36Sopenharmony_ci/* 407462306a36Sopenharmony_ci * A subset of global hstate attributes for node devices 407562306a36Sopenharmony_ci */ 407662306a36Sopenharmony_cistatic struct attribute *per_node_hstate_attrs[] = { 407762306a36Sopenharmony_ci &nr_hugepages_attr.attr, 407862306a36Sopenharmony_ci &free_hugepages_attr.attr, 407962306a36Sopenharmony_ci &surplus_hugepages_attr.attr, 408062306a36Sopenharmony_ci NULL, 408162306a36Sopenharmony_ci}; 408262306a36Sopenharmony_ci 408362306a36Sopenharmony_cistatic const struct attribute_group per_node_hstate_attr_group = { 408462306a36Sopenharmony_ci .attrs = per_node_hstate_attrs, 408562306a36Sopenharmony_ci}; 408662306a36Sopenharmony_ci 408762306a36Sopenharmony_ci/* 408862306a36Sopenharmony_ci * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. 408962306a36Sopenharmony_ci * Returns node id via non-NULL nidp. 409062306a36Sopenharmony_ci */ 409162306a36Sopenharmony_cistatic struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 409262306a36Sopenharmony_ci{ 409362306a36Sopenharmony_ci int nid; 409462306a36Sopenharmony_ci 409562306a36Sopenharmony_ci for (nid = 0; nid < nr_node_ids; nid++) { 409662306a36Sopenharmony_ci struct node_hstate *nhs = &node_hstates[nid]; 409762306a36Sopenharmony_ci int i; 409862306a36Sopenharmony_ci for (i = 0; i < HUGE_MAX_HSTATE; i++) 409962306a36Sopenharmony_ci if (nhs->hstate_kobjs[i] == kobj) { 410062306a36Sopenharmony_ci if (nidp) 410162306a36Sopenharmony_ci *nidp = nid; 410262306a36Sopenharmony_ci return &hstates[i]; 410362306a36Sopenharmony_ci } 410462306a36Sopenharmony_ci } 410562306a36Sopenharmony_ci 410662306a36Sopenharmony_ci BUG(); 410762306a36Sopenharmony_ci return NULL; 410862306a36Sopenharmony_ci} 410962306a36Sopenharmony_ci 411062306a36Sopenharmony_ci/* 411162306a36Sopenharmony_ci * Unregister hstate attributes from a single node device. 411262306a36Sopenharmony_ci * No-op if no hstate attributes attached. 
411362306a36Sopenharmony_ci */ 411462306a36Sopenharmony_civoid hugetlb_unregister_node(struct node *node) 411562306a36Sopenharmony_ci{ 411662306a36Sopenharmony_ci struct hstate *h; 411762306a36Sopenharmony_ci struct node_hstate *nhs = &node_hstates[node->dev.id]; 411862306a36Sopenharmony_ci 411962306a36Sopenharmony_ci if (!nhs->hugepages_kobj) 412062306a36Sopenharmony_ci return; /* no hstate attributes */ 412162306a36Sopenharmony_ci 412262306a36Sopenharmony_ci for_each_hstate(h) { 412362306a36Sopenharmony_ci int idx = hstate_index(h); 412462306a36Sopenharmony_ci struct kobject *hstate_kobj = nhs->hstate_kobjs[idx]; 412562306a36Sopenharmony_ci 412662306a36Sopenharmony_ci if (!hstate_kobj) 412762306a36Sopenharmony_ci continue; 412862306a36Sopenharmony_ci if (h->demote_order) 412962306a36Sopenharmony_ci sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group); 413062306a36Sopenharmony_ci sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group); 413162306a36Sopenharmony_ci kobject_put(hstate_kobj); 413262306a36Sopenharmony_ci nhs->hstate_kobjs[idx] = NULL; 413362306a36Sopenharmony_ci } 413462306a36Sopenharmony_ci 413562306a36Sopenharmony_ci kobject_put(nhs->hugepages_kobj); 413662306a36Sopenharmony_ci nhs->hugepages_kobj = NULL; 413762306a36Sopenharmony_ci} 413862306a36Sopenharmony_ci 413962306a36Sopenharmony_ci 414062306a36Sopenharmony_ci/* 414162306a36Sopenharmony_ci * Register hstate attributes for a single node device. 414262306a36Sopenharmony_ci * No-op if attributes already registered. 414362306a36Sopenharmony_ci */ 414462306a36Sopenharmony_civoid hugetlb_register_node(struct node *node) 414562306a36Sopenharmony_ci{ 414662306a36Sopenharmony_ci struct hstate *h; 414762306a36Sopenharmony_ci struct node_hstate *nhs = &node_hstates[node->dev.id]; 414862306a36Sopenharmony_ci int err; 414962306a36Sopenharmony_ci 415062306a36Sopenharmony_ci if (!hugetlb_sysfs_initialized) 415162306a36Sopenharmony_ci return; 415262306a36Sopenharmony_ci 415362306a36Sopenharmony_ci if (nhs->hugepages_kobj) 415462306a36Sopenharmony_ci return; /* already allocated */ 415562306a36Sopenharmony_ci 415662306a36Sopenharmony_ci nhs->hugepages_kobj = kobject_create_and_add("hugepages", 415762306a36Sopenharmony_ci &node->dev.kobj); 415862306a36Sopenharmony_ci if (!nhs->hugepages_kobj) 415962306a36Sopenharmony_ci return; 416062306a36Sopenharmony_ci 416162306a36Sopenharmony_ci for_each_hstate(h) { 416262306a36Sopenharmony_ci err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, 416362306a36Sopenharmony_ci nhs->hstate_kobjs, 416462306a36Sopenharmony_ci &per_node_hstate_attr_group); 416562306a36Sopenharmony_ci if (err) { 416662306a36Sopenharmony_ci pr_err("HugeTLB: Unable to add hstate %s for node %d\n", 416762306a36Sopenharmony_ci h->name, node->dev.id); 416862306a36Sopenharmony_ci hugetlb_unregister_node(node); 416962306a36Sopenharmony_ci break; 417062306a36Sopenharmony_ci } 417162306a36Sopenharmony_ci } 417262306a36Sopenharmony_ci} 417362306a36Sopenharmony_ci 417462306a36Sopenharmony_ci/* 417562306a36Sopenharmony_ci * hugetlb init time: register hstate attributes for all registered node 417662306a36Sopenharmony_ci * devices of nodes that have memory. All on-line nodes should have 417762306a36Sopenharmony_ci * registered their associated device by this time. 
417862306a36Sopenharmony_ci */ 417962306a36Sopenharmony_cistatic void __init hugetlb_register_all_nodes(void) 418062306a36Sopenharmony_ci{ 418162306a36Sopenharmony_ci int nid; 418262306a36Sopenharmony_ci 418362306a36Sopenharmony_ci for_each_online_node(nid) 418462306a36Sopenharmony_ci hugetlb_register_node(node_devices[nid]); 418562306a36Sopenharmony_ci} 418662306a36Sopenharmony_ci#else /* !CONFIG_NUMA */ 418762306a36Sopenharmony_ci 418862306a36Sopenharmony_cistatic struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 418962306a36Sopenharmony_ci{ 419062306a36Sopenharmony_ci BUG(); 419162306a36Sopenharmony_ci if (nidp) 419262306a36Sopenharmony_ci *nidp = -1; 419362306a36Sopenharmony_ci return NULL; 419462306a36Sopenharmony_ci} 419562306a36Sopenharmony_ci 419662306a36Sopenharmony_cistatic void hugetlb_register_all_nodes(void) { } 419762306a36Sopenharmony_ci 419862306a36Sopenharmony_ci#endif 419962306a36Sopenharmony_ci 420062306a36Sopenharmony_ci#ifdef CONFIG_CMA 420162306a36Sopenharmony_cistatic void __init hugetlb_cma_check(void); 420262306a36Sopenharmony_ci#else 420362306a36Sopenharmony_cistatic inline __init void hugetlb_cma_check(void) 420462306a36Sopenharmony_ci{ 420562306a36Sopenharmony_ci} 420662306a36Sopenharmony_ci#endif 420762306a36Sopenharmony_ci 420862306a36Sopenharmony_cistatic void __init hugetlb_sysfs_init(void) 420962306a36Sopenharmony_ci{ 421062306a36Sopenharmony_ci struct hstate *h; 421162306a36Sopenharmony_ci int err; 421262306a36Sopenharmony_ci 421362306a36Sopenharmony_ci hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); 421462306a36Sopenharmony_ci if (!hugepages_kobj) 421562306a36Sopenharmony_ci return; 421662306a36Sopenharmony_ci 421762306a36Sopenharmony_ci for_each_hstate(h) { 421862306a36Sopenharmony_ci err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 421962306a36Sopenharmony_ci hstate_kobjs, &hstate_attr_group); 422062306a36Sopenharmony_ci if (err) 422162306a36Sopenharmony_ci pr_err("HugeTLB: Unable to add hstate %s", h->name); 422262306a36Sopenharmony_ci } 422362306a36Sopenharmony_ci 422462306a36Sopenharmony_ci#ifdef CONFIG_NUMA 422562306a36Sopenharmony_ci hugetlb_sysfs_initialized = true; 422662306a36Sopenharmony_ci#endif 422762306a36Sopenharmony_ci hugetlb_register_all_nodes(); 422862306a36Sopenharmony_ci} 422962306a36Sopenharmony_ci 423062306a36Sopenharmony_ci#ifdef CONFIG_SYSCTL 423162306a36Sopenharmony_cistatic void hugetlb_sysctl_init(void); 423262306a36Sopenharmony_ci#else 423362306a36Sopenharmony_cistatic inline void hugetlb_sysctl_init(void) { } 423462306a36Sopenharmony_ci#endif 423562306a36Sopenharmony_ci 423662306a36Sopenharmony_cistatic int __init hugetlb_init(void) 423762306a36Sopenharmony_ci{ 423862306a36Sopenharmony_ci int i; 423962306a36Sopenharmony_ci 424062306a36Sopenharmony_ci BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < 424162306a36Sopenharmony_ci __NR_HPAGEFLAGS); 424262306a36Sopenharmony_ci 424362306a36Sopenharmony_ci if (!hugepages_supported()) { 424462306a36Sopenharmony_ci if (hugetlb_max_hstate || default_hstate_max_huge_pages) 424562306a36Sopenharmony_ci pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); 424662306a36Sopenharmony_ci return 0; 424762306a36Sopenharmony_ci } 424862306a36Sopenharmony_ci 424962306a36Sopenharmony_ci /* 425062306a36Sopenharmony_ci * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some 425162306a36Sopenharmony_ci * architectures depend on setup being done here. 
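 * (As a concrete illustration: on x86_64 with 4 KiB base pages,
 * HPAGE_SIZE is 2 MiB and HUGETLB_PAGE_ORDER is 9, so the hstate added
 * below is reported as hugepages-2048kB; other architectures differ.)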
425262306a36Sopenharmony_ci */ 425362306a36Sopenharmony_ci hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 425462306a36Sopenharmony_ci if (!parsed_default_hugepagesz) { 425562306a36Sopenharmony_ci /* 425662306a36Sopenharmony_ci * If we did not parse a default huge page size, set 425762306a36Sopenharmony_ci * default_hstate_idx to HPAGE_SIZE hstate. And, if the 425862306a36Sopenharmony_ci * number of huge pages for this default size was implicitly 425962306a36Sopenharmony_ci * specified, set that here as well. 426062306a36Sopenharmony_ci * Note that the implicit setting will overwrite an explicit 426162306a36Sopenharmony_ci * setting. A warning will be printed in this case. 426262306a36Sopenharmony_ci */ 426362306a36Sopenharmony_ci default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); 426462306a36Sopenharmony_ci if (default_hstate_max_huge_pages) { 426562306a36Sopenharmony_ci if (default_hstate.max_huge_pages) { 426662306a36Sopenharmony_ci char buf[32]; 426762306a36Sopenharmony_ci 426862306a36Sopenharmony_ci string_get_size(huge_page_size(&default_hstate), 426962306a36Sopenharmony_ci 1, STRING_UNITS_2, buf, 32); 427062306a36Sopenharmony_ci pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", 427162306a36Sopenharmony_ci default_hstate.max_huge_pages, buf); 427262306a36Sopenharmony_ci pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", 427362306a36Sopenharmony_ci default_hstate_max_huge_pages); 427462306a36Sopenharmony_ci } 427562306a36Sopenharmony_ci default_hstate.max_huge_pages = 427662306a36Sopenharmony_ci default_hstate_max_huge_pages; 427762306a36Sopenharmony_ci 427862306a36Sopenharmony_ci for_each_online_node(i) 427962306a36Sopenharmony_ci default_hstate.max_huge_pages_node[i] = 428062306a36Sopenharmony_ci default_hugepages_in_node[i]; 428162306a36Sopenharmony_ci } 428262306a36Sopenharmony_ci } 428362306a36Sopenharmony_ci 428462306a36Sopenharmony_ci hugetlb_cma_check(); 428562306a36Sopenharmony_ci hugetlb_init_hstates(); 428662306a36Sopenharmony_ci gather_bootmem_prealloc(); 428762306a36Sopenharmony_ci report_hugepages(); 428862306a36Sopenharmony_ci 428962306a36Sopenharmony_ci hugetlb_sysfs_init(); 429062306a36Sopenharmony_ci hugetlb_cgroup_file_init(); 429162306a36Sopenharmony_ci hugetlb_sysctl_init(); 429262306a36Sopenharmony_ci 429362306a36Sopenharmony_ci#ifdef CONFIG_SMP 429462306a36Sopenharmony_ci num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); 429562306a36Sopenharmony_ci#else 429662306a36Sopenharmony_ci num_fault_mutexes = 1; 429762306a36Sopenharmony_ci#endif 429862306a36Sopenharmony_ci hugetlb_fault_mutex_table = 429962306a36Sopenharmony_ci kmalloc_array(num_fault_mutexes, sizeof(struct mutex), 430062306a36Sopenharmony_ci GFP_KERNEL); 430162306a36Sopenharmony_ci BUG_ON(!hugetlb_fault_mutex_table); 430262306a36Sopenharmony_ci 430362306a36Sopenharmony_ci for (i = 0; i < num_fault_mutexes; i++) 430462306a36Sopenharmony_ci mutex_init(&hugetlb_fault_mutex_table[i]); 430562306a36Sopenharmony_ci return 0; 430662306a36Sopenharmony_ci} 430762306a36Sopenharmony_cisubsys_initcall(hugetlb_init); 430862306a36Sopenharmony_ci 430962306a36Sopenharmony_ci/* Overwritten by architectures with more huge page sizes */ 431062306a36Sopenharmony_cibool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) 431162306a36Sopenharmony_ci{ 431262306a36Sopenharmony_ci return size == HPAGE_SIZE; 431362306a36Sopenharmony_ci} 431462306a36Sopenharmony_ci 431562306a36Sopenharmony_civoid __init hugetlb_add_hstate(unsigned int order) 
{
	struct hstate *h;
	unsigned long i;

	if (size_to_hstate(PAGE_SIZE << order)) {
		return;
	}
	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
	BUG_ON(order == 0);
	h = &hstates[hugetlb_max_hstate++];
	mutex_init(&h->resize_lock);
	h->order = order;
	h->mask = ~(huge_page_size(h) - 1);
	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
	INIT_LIST_HEAD(&h->hugepage_activelist);
	h->next_nid_to_alloc = first_memory_node;
	h->next_nid_to_free = first_memory_node;
	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
					huge_page_size(h)/SZ_1K);

	parsed_hstate = h;
}

bool __init __weak hugetlb_node_alloc_supported(void)
{
	return true;
}

static void __init hugepages_clear_pages_in_node(void)
{
	if (!hugetlb_max_hstate) {
		default_hstate_max_huge_pages = 0;
		memset(default_hugepages_in_node, 0,
			sizeof(default_hugepages_in_node));
	} else {
		parsed_hstate->max_huge_pages = 0;
		memset(parsed_hstate->max_huge_pages_node, 0,
			sizeof(parsed_hstate->max_huge_pages_node));
	}
}

/*
 * hugepages command line processing
 * hugepages normally follows a valid hugepagesz or default_hugepagesz
 * specification. If not, ignore the hugepages value. hugepages can also
 * be the first huge page command line option in which case it implicitly
 * specifies the number of huge pages for the default size.
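 * Illustrative examples: "hugepages=512" alone requests 512 default-size
 * pages; "hugepagesz=1G hugepages=4" requests four 1 GiB pages; the
 * per-node form "hugepages=0:2,1:2" spreads the pages across nodes 0 and 1.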
436462306a36Sopenharmony_ci */ 436562306a36Sopenharmony_cistatic int __init hugepages_setup(char *s) 436662306a36Sopenharmony_ci{ 436762306a36Sopenharmony_ci unsigned long *mhp; 436862306a36Sopenharmony_ci static unsigned long *last_mhp; 436962306a36Sopenharmony_ci int node = NUMA_NO_NODE; 437062306a36Sopenharmony_ci int count; 437162306a36Sopenharmony_ci unsigned long tmp; 437262306a36Sopenharmony_ci char *p = s; 437362306a36Sopenharmony_ci 437462306a36Sopenharmony_ci if (!parsed_valid_hugepagesz) { 437562306a36Sopenharmony_ci pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); 437662306a36Sopenharmony_ci parsed_valid_hugepagesz = true; 437762306a36Sopenharmony_ci return 1; 437862306a36Sopenharmony_ci } 437962306a36Sopenharmony_ci 438062306a36Sopenharmony_ci /* 438162306a36Sopenharmony_ci * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter 438262306a36Sopenharmony_ci * yet, so this hugepages= parameter goes to the "default hstate". 438362306a36Sopenharmony_ci * Otherwise, it goes with the previously parsed hugepagesz or 438462306a36Sopenharmony_ci * default_hugepagesz. 438562306a36Sopenharmony_ci */ 438662306a36Sopenharmony_ci else if (!hugetlb_max_hstate) 438762306a36Sopenharmony_ci mhp = &default_hstate_max_huge_pages; 438862306a36Sopenharmony_ci else 438962306a36Sopenharmony_ci mhp = &parsed_hstate->max_huge_pages; 439062306a36Sopenharmony_ci 439162306a36Sopenharmony_ci if (mhp == last_mhp) { 439262306a36Sopenharmony_ci pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); 439362306a36Sopenharmony_ci return 1; 439462306a36Sopenharmony_ci } 439562306a36Sopenharmony_ci 439662306a36Sopenharmony_ci while (*p) { 439762306a36Sopenharmony_ci count = 0; 439862306a36Sopenharmony_ci if (sscanf(p, "%lu%n", &tmp, &count) != 1) 439962306a36Sopenharmony_ci goto invalid; 440062306a36Sopenharmony_ci /* Parameter is node format */ 440162306a36Sopenharmony_ci if (p[count] == ':') { 440262306a36Sopenharmony_ci if (!hugetlb_node_alloc_supported()) { 440362306a36Sopenharmony_ci pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); 440462306a36Sopenharmony_ci return 1; 440562306a36Sopenharmony_ci } 440662306a36Sopenharmony_ci if (tmp >= MAX_NUMNODES || !node_online(tmp)) 440762306a36Sopenharmony_ci goto invalid; 440862306a36Sopenharmony_ci node = array_index_nospec(tmp, MAX_NUMNODES); 440962306a36Sopenharmony_ci p += count + 1; 441062306a36Sopenharmony_ci /* Parse hugepages */ 441162306a36Sopenharmony_ci if (sscanf(p, "%lu%n", &tmp, &count) != 1) 441262306a36Sopenharmony_ci goto invalid; 441362306a36Sopenharmony_ci if (!hugetlb_max_hstate) 441462306a36Sopenharmony_ci default_hugepages_in_node[node] = tmp; 441562306a36Sopenharmony_ci else 441662306a36Sopenharmony_ci parsed_hstate->max_huge_pages_node[node] = tmp; 441762306a36Sopenharmony_ci *mhp += tmp; 441862306a36Sopenharmony_ci /* Go to parse next node*/ 441962306a36Sopenharmony_ci if (p[count] == ',') 442062306a36Sopenharmony_ci p += count + 1; 442162306a36Sopenharmony_ci else 442262306a36Sopenharmony_ci break; 442362306a36Sopenharmony_ci } else { 442462306a36Sopenharmony_ci if (p != s) 442562306a36Sopenharmony_ci goto invalid; 442662306a36Sopenharmony_ci *mhp = tmp; 442762306a36Sopenharmony_ci break; 442862306a36Sopenharmony_ci } 442962306a36Sopenharmony_ci } 443062306a36Sopenharmony_ci 443162306a36Sopenharmony_ci /* 443262306a36Sopenharmony_ci * Global state is always initialized later in hugetlb_init. 
443362306a36Sopenharmony_ci * But we need to allocate gigantic hstates here early to still 443462306a36Sopenharmony_ci * use the bootmem allocator. 443562306a36Sopenharmony_ci */ 443662306a36Sopenharmony_ci if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate)) 443762306a36Sopenharmony_ci hugetlb_hstate_alloc_pages(parsed_hstate); 443862306a36Sopenharmony_ci 443962306a36Sopenharmony_ci last_mhp = mhp; 444062306a36Sopenharmony_ci 444162306a36Sopenharmony_ci return 1; 444262306a36Sopenharmony_ci 444362306a36Sopenharmony_ciinvalid: 444462306a36Sopenharmony_ci pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); 444562306a36Sopenharmony_ci hugepages_clear_pages_in_node(); 444662306a36Sopenharmony_ci return 1; 444762306a36Sopenharmony_ci} 444862306a36Sopenharmony_ci__setup("hugepages=", hugepages_setup); 444962306a36Sopenharmony_ci 445062306a36Sopenharmony_ci/* 445162306a36Sopenharmony_ci * hugepagesz command line processing 445262306a36Sopenharmony_ci * A specific huge page size can only be specified once with hugepagesz. 445362306a36Sopenharmony_ci * hugepagesz is followed by hugepages on the command line. The global 445462306a36Sopenharmony_ci * variable 'parsed_valid_hugepagesz' is used to determine if prior 445562306a36Sopenharmony_ci * hugepagesz argument was valid. 445662306a36Sopenharmony_ci */ 445762306a36Sopenharmony_cistatic int __init hugepagesz_setup(char *s) 445862306a36Sopenharmony_ci{ 445962306a36Sopenharmony_ci unsigned long size; 446062306a36Sopenharmony_ci struct hstate *h; 446162306a36Sopenharmony_ci 446262306a36Sopenharmony_ci parsed_valid_hugepagesz = false; 446362306a36Sopenharmony_ci size = (unsigned long)memparse(s, NULL); 446462306a36Sopenharmony_ci 446562306a36Sopenharmony_ci if (!arch_hugetlb_valid_size(size)) { 446662306a36Sopenharmony_ci pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); 446762306a36Sopenharmony_ci return 1; 446862306a36Sopenharmony_ci } 446962306a36Sopenharmony_ci 447062306a36Sopenharmony_ci h = size_to_hstate(size); 447162306a36Sopenharmony_ci if (h) { 447262306a36Sopenharmony_ci /* 447362306a36Sopenharmony_ci * hstate for this size already exists. This is normally 447462306a36Sopenharmony_ci * an error, but is allowed if the existing hstate is the 447562306a36Sopenharmony_ci * default hstate. More specifically, it is only allowed if 447662306a36Sopenharmony_ci * the number of huge pages for the default hstate was not 447762306a36Sopenharmony_ci * previously specified. 447862306a36Sopenharmony_ci */ 447962306a36Sopenharmony_ci if (!parsed_default_hugepagesz || h != &default_hstate || 448062306a36Sopenharmony_ci default_hstate.max_huge_pages) { 448162306a36Sopenharmony_ci pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); 448262306a36Sopenharmony_ci return 1; 448362306a36Sopenharmony_ci } 448462306a36Sopenharmony_ci 448562306a36Sopenharmony_ci /* 448662306a36Sopenharmony_ci * No need to call hugetlb_add_hstate() as hstate already 448762306a36Sopenharmony_ci * exists. But, do set parsed_hstate so that a following 448862306a36Sopenharmony_ci * hugepages= parameter will be applied to this hstate. 
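		 * For instance (illustrative), with
		 * "default_hugepagesz=2M hugepagesz=2M hugepages=512" this
		 * path lets the trailing hugepages=512 apply to the default
		 * 2M hstate.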
448962306a36Sopenharmony_ci */ 449062306a36Sopenharmony_ci parsed_hstate = h; 449162306a36Sopenharmony_ci parsed_valid_hugepagesz = true; 449262306a36Sopenharmony_ci return 1; 449362306a36Sopenharmony_ci } 449462306a36Sopenharmony_ci 449562306a36Sopenharmony_ci hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 449662306a36Sopenharmony_ci parsed_valid_hugepagesz = true; 449762306a36Sopenharmony_ci return 1; 449862306a36Sopenharmony_ci} 449962306a36Sopenharmony_ci__setup("hugepagesz=", hugepagesz_setup); 450062306a36Sopenharmony_ci 450162306a36Sopenharmony_ci/* 450262306a36Sopenharmony_ci * default_hugepagesz command line input 450362306a36Sopenharmony_ci * Only one instance of default_hugepagesz allowed on command line. 450462306a36Sopenharmony_ci */ 450562306a36Sopenharmony_cistatic int __init default_hugepagesz_setup(char *s) 450662306a36Sopenharmony_ci{ 450762306a36Sopenharmony_ci unsigned long size; 450862306a36Sopenharmony_ci int i; 450962306a36Sopenharmony_ci 451062306a36Sopenharmony_ci parsed_valid_hugepagesz = false; 451162306a36Sopenharmony_ci if (parsed_default_hugepagesz) { 451262306a36Sopenharmony_ci pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); 451362306a36Sopenharmony_ci return 1; 451462306a36Sopenharmony_ci } 451562306a36Sopenharmony_ci 451662306a36Sopenharmony_ci size = (unsigned long)memparse(s, NULL); 451762306a36Sopenharmony_ci 451862306a36Sopenharmony_ci if (!arch_hugetlb_valid_size(size)) { 451962306a36Sopenharmony_ci pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); 452062306a36Sopenharmony_ci return 1; 452162306a36Sopenharmony_ci } 452262306a36Sopenharmony_ci 452362306a36Sopenharmony_ci hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 452462306a36Sopenharmony_ci parsed_valid_hugepagesz = true; 452562306a36Sopenharmony_ci parsed_default_hugepagesz = true; 452662306a36Sopenharmony_ci default_hstate_idx = hstate_index(size_to_hstate(size)); 452762306a36Sopenharmony_ci 452862306a36Sopenharmony_ci /* 452962306a36Sopenharmony_ci * The number of default huge pages (for this size) could have been 453062306a36Sopenharmony_ci * specified as the first hugetlb parameter: hugepages=X. If so, 453162306a36Sopenharmony_ci * then default_hstate_max_huge_pages is set. If the default huge 453262306a36Sopenharmony_ci * page size is gigantic (> MAX_ORDER), then the pages must be 453362306a36Sopenharmony_ci * allocated here from bootmem allocator. 
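	 * For instance (illustrative), with "hugepages=2 default_hugepagesz=1G"
	 * the implicit count of 2 is applied to the 1 GiB hstate below, and
	 * since 1 GiB pages are gigantic they are reserved from bootmem here
	 * rather than from the buddy allocator later in hugetlb_init().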
453462306a36Sopenharmony_ci */ 453562306a36Sopenharmony_ci if (default_hstate_max_huge_pages) { 453662306a36Sopenharmony_ci default_hstate.max_huge_pages = default_hstate_max_huge_pages; 453762306a36Sopenharmony_ci for_each_online_node(i) 453862306a36Sopenharmony_ci default_hstate.max_huge_pages_node[i] = 453962306a36Sopenharmony_ci default_hugepages_in_node[i]; 454062306a36Sopenharmony_ci if (hstate_is_gigantic(&default_hstate)) 454162306a36Sopenharmony_ci hugetlb_hstate_alloc_pages(&default_hstate); 454262306a36Sopenharmony_ci default_hstate_max_huge_pages = 0; 454362306a36Sopenharmony_ci } 454462306a36Sopenharmony_ci 454562306a36Sopenharmony_ci return 1; 454662306a36Sopenharmony_ci} 454762306a36Sopenharmony_ci__setup("default_hugepagesz=", default_hugepagesz_setup); 454862306a36Sopenharmony_ci 454962306a36Sopenharmony_cistatic nodemask_t *policy_mbind_nodemask(gfp_t gfp) 455062306a36Sopenharmony_ci{ 455162306a36Sopenharmony_ci#ifdef CONFIG_NUMA 455262306a36Sopenharmony_ci struct mempolicy *mpol = get_task_policy(current); 455362306a36Sopenharmony_ci 455462306a36Sopenharmony_ci /* 455562306a36Sopenharmony_ci * Only enforce MPOL_BIND policy which overlaps with cpuset policy 455662306a36Sopenharmony_ci * (from policy_nodemask) specifically for hugetlb case 455762306a36Sopenharmony_ci */ 455862306a36Sopenharmony_ci if (mpol->mode == MPOL_BIND && 455962306a36Sopenharmony_ci (apply_policy_zone(mpol, gfp_zone(gfp)) && 456062306a36Sopenharmony_ci cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) 456162306a36Sopenharmony_ci return &mpol->nodes; 456262306a36Sopenharmony_ci#endif 456362306a36Sopenharmony_ci return NULL; 456462306a36Sopenharmony_ci} 456562306a36Sopenharmony_ci 456662306a36Sopenharmony_cistatic unsigned int allowed_mems_nr(struct hstate *h) 456762306a36Sopenharmony_ci{ 456862306a36Sopenharmony_ci int node; 456962306a36Sopenharmony_ci unsigned int nr = 0; 457062306a36Sopenharmony_ci nodemask_t *mbind_nodemask; 457162306a36Sopenharmony_ci unsigned int *array = h->free_huge_pages_node; 457262306a36Sopenharmony_ci gfp_t gfp_mask = htlb_alloc_mask(h); 457362306a36Sopenharmony_ci 457462306a36Sopenharmony_ci mbind_nodemask = policy_mbind_nodemask(gfp_mask); 457562306a36Sopenharmony_ci for_each_node_mask(node, cpuset_current_mems_allowed) { 457662306a36Sopenharmony_ci if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) 457762306a36Sopenharmony_ci nr += array[node]; 457862306a36Sopenharmony_ci } 457962306a36Sopenharmony_ci 458062306a36Sopenharmony_ci return nr; 458162306a36Sopenharmony_ci} 458262306a36Sopenharmony_ci 458362306a36Sopenharmony_ci#ifdef CONFIG_SYSCTL 458462306a36Sopenharmony_cistatic int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, 458562306a36Sopenharmony_ci void *buffer, size_t *length, 458662306a36Sopenharmony_ci loff_t *ppos, unsigned long *out) 458762306a36Sopenharmony_ci{ 458862306a36Sopenharmony_ci struct ctl_table dup_table; 458962306a36Sopenharmony_ci 459062306a36Sopenharmony_ci /* 459162306a36Sopenharmony_ci * In order to avoid races with __do_proc_doulongvec_minmax(), we 459262306a36Sopenharmony_ci * can duplicate the @table and alter the duplicate of it. 
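	 * (Without the local copy, two concurrent writers would both point
	 * the shared table->data at their own on-stack variable; duplicating
	 * the table keeps ->data private to this call.)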
459362306a36Sopenharmony_ci */ 459462306a36Sopenharmony_ci dup_table = *table; 459562306a36Sopenharmony_ci dup_table.data = out; 459662306a36Sopenharmony_ci 459762306a36Sopenharmony_ci return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); 459862306a36Sopenharmony_ci} 459962306a36Sopenharmony_ci 460062306a36Sopenharmony_cistatic int hugetlb_sysctl_handler_common(bool obey_mempolicy, 460162306a36Sopenharmony_ci struct ctl_table *table, int write, 460262306a36Sopenharmony_ci void *buffer, size_t *length, loff_t *ppos) 460362306a36Sopenharmony_ci{ 460462306a36Sopenharmony_ci struct hstate *h = &default_hstate; 460562306a36Sopenharmony_ci unsigned long tmp = h->max_huge_pages; 460662306a36Sopenharmony_ci int ret; 460762306a36Sopenharmony_ci 460862306a36Sopenharmony_ci if (!hugepages_supported()) 460962306a36Sopenharmony_ci return -EOPNOTSUPP; 461062306a36Sopenharmony_ci 461162306a36Sopenharmony_ci ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, 461262306a36Sopenharmony_ci &tmp); 461362306a36Sopenharmony_ci if (ret) 461462306a36Sopenharmony_ci goto out; 461562306a36Sopenharmony_ci 461662306a36Sopenharmony_ci if (write) 461762306a36Sopenharmony_ci ret = __nr_hugepages_store_common(obey_mempolicy, h, 461862306a36Sopenharmony_ci NUMA_NO_NODE, tmp, *length); 461962306a36Sopenharmony_ciout: 462062306a36Sopenharmony_ci return ret; 462162306a36Sopenharmony_ci} 462262306a36Sopenharmony_ci 462362306a36Sopenharmony_cistatic int hugetlb_sysctl_handler(struct ctl_table *table, int write, 462462306a36Sopenharmony_ci void *buffer, size_t *length, loff_t *ppos) 462562306a36Sopenharmony_ci{ 462662306a36Sopenharmony_ci 462762306a36Sopenharmony_ci return hugetlb_sysctl_handler_common(false, table, write, 462862306a36Sopenharmony_ci buffer, length, ppos); 462962306a36Sopenharmony_ci} 463062306a36Sopenharmony_ci 463162306a36Sopenharmony_ci#ifdef CONFIG_NUMA 463262306a36Sopenharmony_cistatic int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, 463362306a36Sopenharmony_ci void *buffer, size_t *length, loff_t *ppos) 463462306a36Sopenharmony_ci{ 463562306a36Sopenharmony_ci return hugetlb_sysctl_handler_common(true, table, write, 463662306a36Sopenharmony_ci buffer, length, ppos); 463762306a36Sopenharmony_ci} 463862306a36Sopenharmony_ci#endif /* CONFIG_NUMA */ 463962306a36Sopenharmony_ci 464062306a36Sopenharmony_cistatic int hugetlb_overcommit_handler(struct ctl_table *table, int write, 464162306a36Sopenharmony_ci void *buffer, size_t *length, loff_t *ppos) 464262306a36Sopenharmony_ci{ 464362306a36Sopenharmony_ci struct hstate *h = &default_hstate; 464462306a36Sopenharmony_ci unsigned long tmp; 464562306a36Sopenharmony_ci int ret; 464662306a36Sopenharmony_ci 464762306a36Sopenharmony_ci if (!hugepages_supported()) 464862306a36Sopenharmony_ci return -EOPNOTSUPP; 464962306a36Sopenharmony_ci 465062306a36Sopenharmony_ci tmp = h->nr_overcommit_huge_pages; 465162306a36Sopenharmony_ci 465262306a36Sopenharmony_ci if (write && hstate_is_gigantic(h)) 465362306a36Sopenharmony_ci return -EINVAL; 465462306a36Sopenharmony_ci 465562306a36Sopenharmony_ci ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, 465662306a36Sopenharmony_ci &tmp); 465762306a36Sopenharmony_ci if (ret) 465862306a36Sopenharmony_ci goto out; 465962306a36Sopenharmony_ci 466062306a36Sopenharmony_ci if (write) { 466162306a36Sopenharmony_ci spin_lock_irq(&hugetlb_lock); 466262306a36Sopenharmony_ci h->nr_overcommit_huge_pages = tmp; 466362306a36Sopenharmony_ci 
spin_unlock_irq(&hugetlb_lock); 466462306a36Sopenharmony_ci } 466562306a36Sopenharmony_ciout: 466662306a36Sopenharmony_ci return ret; 466762306a36Sopenharmony_ci} 466862306a36Sopenharmony_ci 466962306a36Sopenharmony_cistatic struct ctl_table hugetlb_table[] = { 467062306a36Sopenharmony_ci { 467162306a36Sopenharmony_ci .procname = "nr_hugepages", 467262306a36Sopenharmony_ci .data = NULL, 467362306a36Sopenharmony_ci .maxlen = sizeof(unsigned long), 467462306a36Sopenharmony_ci .mode = 0644, 467562306a36Sopenharmony_ci .proc_handler = hugetlb_sysctl_handler, 467662306a36Sopenharmony_ci }, 467762306a36Sopenharmony_ci#ifdef CONFIG_NUMA 467862306a36Sopenharmony_ci { 467962306a36Sopenharmony_ci .procname = "nr_hugepages_mempolicy", 468062306a36Sopenharmony_ci .data = NULL, 468162306a36Sopenharmony_ci .maxlen = sizeof(unsigned long), 468262306a36Sopenharmony_ci .mode = 0644, 468362306a36Sopenharmony_ci .proc_handler = &hugetlb_mempolicy_sysctl_handler, 468462306a36Sopenharmony_ci }, 468562306a36Sopenharmony_ci#endif 468662306a36Sopenharmony_ci { 468762306a36Sopenharmony_ci .procname = "hugetlb_shm_group", 468862306a36Sopenharmony_ci .data = &sysctl_hugetlb_shm_group, 468962306a36Sopenharmony_ci .maxlen = sizeof(gid_t), 469062306a36Sopenharmony_ci .mode = 0644, 469162306a36Sopenharmony_ci .proc_handler = proc_dointvec, 469262306a36Sopenharmony_ci }, 469362306a36Sopenharmony_ci { 469462306a36Sopenharmony_ci .procname = "nr_overcommit_hugepages", 469562306a36Sopenharmony_ci .data = NULL, 469662306a36Sopenharmony_ci .maxlen = sizeof(unsigned long), 469762306a36Sopenharmony_ci .mode = 0644, 469862306a36Sopenharmony_ci .proc_handler = hugetlb_overcommit_handler, 469962306a36Sopenharmony_ci }, 470062306a36Sopenharmony_ci { } 470162306a36Sopenharmony_ci}; 470262306a36Sopenharmony_ci 470362306a36Sopenharmony_cistatic void hugetlb_sysctl_init(void) 470462306a36Sopenharmony_ci{ 470562306a36Sopenharmony_ci register_sysctl_init("vm", hugetlb_table); 470662306a36Sopenharmony_ci} 470762306a36Sopenharmony_ci#endif /* CONFIG_SYSCTL */ 470862306a36Sopenharmony_ci 470962306a36Sopenharmony_civoid hugetlb_report_meminfo(struct seq_file *m) 471062306a36Sopenharmony_ci{ 471162306a36Sopenharmony_ci struct hstate *h; 471262306a36Sopenharmony_ci unsigned long total = 0; 471362306a36Sopenharmony_ci 471462306a36Sopenharmony_ci if (!hugepages_supported()) 471562306a36Sopenharmony_ci return; 471662306a36Sopenharmony_ci 471762306a36Sopenharmony_ci for_each_hstate(h) { 471862306a36Sopenharmony_ci unsigned long count = h->nr_huge_pages; 471962306a36Sopenharmony_ci 472062306a36Sopenharmony_ci total += huge_page_size(h) * count; 472162306a36Sopenharmony_ci 472262306a36Sopenharmony_ci if (h == &default_hstate) 472362306a36Sopenharmony_ci seq_printf(m, 472462306a36Sopenharmony_ci "HugePages_Total: %5lu\n" 472562306a36Sopenharmony_ci "HugePages_Free: %5lu\n" 472662306a36Sopenharmony_ci "HugePages_Rsvd: %5lu\n" 472762306a36Sopenharmony_ci "HugePages_Surp: %5lu\n" 472862306a36Sopenharmony_ci "Hugepagesize: %8lu kB\n", 472962306a36Sopenharmony_ci count, 473062306a36Sopenharmony_ci h->free_huge_pages, 473162306a36Sopenharmony_ci h->resv_huge_pages, 473262306a36Sopenharmony_ci h->surplus_huge_pages, 473362306a36Sopenharmony_ci huge_page_size(h) / SZ_1K); 473462306a36Sopenharmony_ci } 473562306a36Sopenharmony_ci 473662306a36Sopenharmony_ci seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K); 473762306a36Sopenharmony_ci} 473862306a36Sopenharmony_ci 473962306a36Sopenharmony_ciint hugetlb_report_node_meminfo(char *buf, int len, int nid) 
{
	struct hstate *h = &default_hstate;

	if (!hugepages_supported())
		return 0;

	return sysfs_emit_at(buf, len,
			     "Node %d HugePages_Total: %5u\n"
			     "Node %d HugePages_Free: %5u\n"
			     "Node %d HugePages_Surp: %5u\n",
			     nid, h->nr_huge_pages_node[nid],
			     nid, h->free_huge_pages_node[nid],
			     nid, h->surplus_huge_pages_node[nid]);
}

void hugetlb_show_meminfo_node(int nid)
{
	struct hstate *h;

	if (!hugepages_supported())
		return;

	for_each_hstate(h)
		printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
			nid,
			h->nr_huge_pages_node[nid],
			h->free_huge_pages_node[nid],
			h->surplus_huge_pages_node[nid],
			huge_page_size(h) / SZ_1K);
}

void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
{
	seq_printf(m, "HugetlbPages:\t%8lu kB\n",
		   K(atomic_long_read(&mm->hugetlb_usage)));
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	struct hstate *h;
	unsigned long nr_total_pages = 0;

	for_each_hstate(h)
		nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
	return nr_total_pages;
}

static int hugetlb_acct_memory(struct hstate *h, long delta)
{
	int ret = -ENOMEM;

	if (!delta)
		return 0;

	spin_lock_irq(&hugetlb_lock);
	/*
	 * When cpuset is configured, it breaks the strict hugetlb page
	 * reservation as the accounting is done on a global variable. Such
	 * reservation is completely rubbish in the presence of cpuset because
	 * the reservation is not checked against page availability for the
	 * current cpuset. An application can still potentially be OOM'ed by
	 * the kernel for lack of a free hugetlb page in the cpuset that the
	 * task is in.
	 * Attempting to enforce strict accounting with cpuset is almost
	 * impossible (or too ugly) because cpuset is too fluid: a task or
	 * memory node can be dynamically moved between cpusets.
	 *
	 * The change of semantics for shared hugetlb mappings with cpuset is
	 * undesirable. However, in order to preserve some of the semantics,
	 * we fall back to checking against the current free page availability
	 * as a best attempt and hopefully to minimize the impact of changing
	 * semantics that cpuset has.
	 *
	 * Apart from cpuset, we also have the memory policy mechanism, which
	 * likewise determines from which node the kernel will allocate memory
	 * in a NUMA system. So, similar to cpuset, we should also consider
	 * the memory policy of the current task. The same reasoning as above
	 * applies.
	 */
	if (delta > 0) {
		if (gather_surplus_pages(h, delta) < 0)
			goto out;

		if (delta > allowed_mems_nr(h)) {
			return_unused_surplus_pages(h, delta);
			goto out;
		}
	}

	ret = 0;
	if (delta < 0)
		return_unused_surplus_pages(h, (unsigned long) -delta);

out:
	spin_unlock_irq(&hugetlb_lock);
	return ret;
}

static void hugetlb_vm_op_open(struct vm_area_struct *vma)
{
	struct resv_map *resv = vma_resv_map(vma);

	/*
	 * HPAGE_RESV_OWNER indicates a private mapping.
	 * This new VMA should share its sibling's reservation map if present.
	 * The VMA will only ever have a valid reservation map pointer where
	 * it is being copied for another still existing VMA. As that VMA
	 * has a reference to the reservation map it cannot disappear until
	 * after this open call completes. It is therefore safe to take a
	 * new reference here without additional locking.
	 */
	if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
		kref_get(&resv->refs);
	}

	/*
	 * The vma_lock structure for sharable mappings is vma specific.
	 * Clear the old pointer (if copied via vm_area_dup) and allocate
	 * a new structure. Before clearing, make sure the vma_lock is not
	 * for this vma.
486162306a36Sopenharmony_ci */ 486262306a36Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE) { 486362306a36Sopenharmony_ci struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; 486462306a36Sopenharmony_ci 486562306a36Sopenharmony_ci if (vma_lock) { 486662306a36Sopenharmony_ci if (vma_lock->vma != vma) { 486762306a36Sopenharmony_ci vma->vm_private_data = NULL; 486862306a36Sopenharmony_ci hugetlb_vma_lock_alloc(vma); 486962306a36Sopenharmony_ci } else 487062306a36Sopenharmony_ci pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__); 487162306a36Sopenharmony_ci } else 487262306a36Sopenharmony_ci hugetlb_vma_lock_alloc(vma); 487362306a36Sopenharmony_ci } 487462306a36Sopenharmony_ci} 487562306a36Sopenharmony_ci 487662306a36Sopenharmony_cistatic void hugetlb_vm_op_close(struct vm_area_struct *vma) 487762306a36Sopenharmony_ci{ 487862306a36Sopenharmony_ci struct hstate *h = hstate_vma(vma); 487962306a36Sopenharmony_ci struct resv_map *resv; 488062306a36Sopenharmony_ci struct hugepage_subpool *spool = subpool_vma(vma); 488162306a36Sopenharmony_ci unsigned long reserve, start, end; 488262306a36Sopenharmony_ci long gbl_reserve; 488362306a36Sopenharmony_ci 488462306a36Sopenharmony_ci hugetlb_vma_lock_free(vma); 488562306a36Sopenharmony_ci 488662306a36Sopenharmony_ci resv = vma_resv_map(vma); 488762306a36Sopenharmony_ci if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 488862306a36Sopenharmony_ci return; 488962306a36Sopenharmony_ci 489062306a36Sopenharmony_ci start = vma_hugecache_offset(h, vma, vma->vm_start); 489162306a36Sopenharmony_ci end = vma_hugecache_offset(h, vma, vma->vm_end); 489262306a36Sopenharmony_ci 489362306a36Sopenharmony_ci reserve = (end - start) - region_count(resv, start, end); 489462306a36Sopenharmony_ci hugetlb_cgroup_uncharge_counter(resv, start, end); 489562306a36Sopenharmony_ci if (reserve) { 489662306a36Sopenharmony_ci /* 489762306a36Sopenharmony_ci * Decrement reserve counts. The global reserve count may be 489862306a36Sopenharmony_ci * adjusted if the subpool has a minimum size. 489962306a36Sopenharmony_ci */ 490062306a36Sopenharmony_ci gbl_reserve = hugepage_subpool_put_pages(spool, reserve); 490162306a36Sopenharmony_ci hugetlb_acct_memory(h, -gbl_reserve); 490262306a36Sopenharmony_ci } 490362306a36Sopenharmony_ci 490462306a36Sopenharmony_ci kref_put(&resv->refs, resv_map_release); 490562306a36Sopenharmony_ci} 490662306a36Sopenharmony_ci 490762306a36Sopenharmony_cistatic int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) 490862306a36Sopenharmony_ci{ 490962306a36Sopenharmony_ci if (addr & ~(huge_page_mask(hstate_vma(vma)))) 491062306a36Sopenharmony_ci return -EINVAL; 491162306a36Sopenharmony_ci 491262306a36Sopenharmony_ci /* 491362306a36Sopenharmony_ci * PMD sharing is only possible for PUD_SIZE-aligned address ranges 491462306a36Sopenharmony_ci * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this 491562306a36Sopenharmony_ci * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. 491662306a36Sopenharmony_ci */ 491762306a36Sopenharmony_ci if (addr & ~PUD_MASK) { 491862306a36Sopenharmony_ci /* 491962306a36Sopenharmony_ci * hugetlb_vm_op_split is called right before we attempt to 492062306a36Sopenharmony_ci * split the VMA. We will need to unshare PMDs in the old and 492162306a36Sopenharmony_ci * new VMAs, so let's unshare before we split. 
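		 * (e.g. on x86-64 PUD_SIZE is 1 GiB, so a split at a 2 MiB
		 * aligned but not 1 GiB aligned address first unshares the
		 * PMDs in the surrounding [addr & PUD_MASK, +PUD_SIZE)
		 * interval.)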
		 */
		unsigned long floor = addr & PUD_MASK;
		unsigned long ceil = floor + PUD_SIZE;

		if (floor >= vma->vm_start && ceil <= vma->vm_end)
			hugetlb_unshare_pmds(vma, floor, ceil);
	}

	return 0;
}

static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
{
	return huge_page_size(hstate_vma(vma));
}

/*
 * We cannot handle pagefaults against hugetlb pages at all. They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
{
	BUG();
	return 0;
}

/*
 * When a new function is introduced to vm_operations_struct and added
 * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
 * This is because, under the System V memory model, mappings created via
 * shmget/shmat with "huge page" specified are backed by hugetlbfs files,
 * and their original vm_ops are overwritten with shm_vm_ops.
495662306a36Sopenharmony_ci */ 495762306a36Sopenharmony_ciconst struct vm_operations_struct hugetlb_vm_ops = { 495862306a36Sopenharmony_ci .fault = hugetlb_vm_op_fault, 495962306a36Sopenharmony_ci .open = hugetlb_vm_op_open, 496062306a36Sopenharmony_ci .close = hugetlb_vm_op_close, 496162306a36Sopenharmony_ci .may_split = hugetlb_vm_op_split, 496262306a36Sopenharmony_ci .pagesize = hugetlb_vm_op_pagesize, 496362306a36Sopenharmony_ci}; 496462306a36Sopenharmony_ci 496562306a36Sopenharmony_cistatic pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 496662306a36Sopenharmony_ci int writable) 496762306a36Sopenharmony_ci{ 496862306a36Sopenharmony_ci pte_t entry; 496962306a36Sopenharmony_ci unsigned int shift = huge_page_shift(hstate_vma(vma)); 497062306a36Sopenharmony_ci 497162306a36Sopenharmony_ci if (writable) { 497262306a36Sopenharmony_ci entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, 497362306a36Sopenharmony_ci vma->vm_page_prot))); 497462306a36Sopenharmony_ci } else { 497562306a36Sopenharmony_ci entry = huge_pte_wrprotect(mk_huge_pte(page, 497662306a36Sopenharmony_ci vma->vm_page_prot)); 497762306a36Sopenharmony_ci } 497862306a36Sopenharmony_ci entry = pte_mkyoung(entry); 497962306a36Sopenharmony_ci entry = arch_make_huge_pte(entry, shift, vma->vm_flags); 498062306a36Sopenharmony_ci 498162306a36Sopenharmony_ci return entry; 498262306a36Sopenharmony_ci} 498362306a36Sopenharmony_ci 498462306a36Sopenharmony_cistatic void set_huge_ptep_writable(struct vm_area_struct *vma, 498562306a36Sopenharmony_ci unsigned long address, pte_t *ptep) 498662306a36Sopenharmony_ci{ 498762306a36Sopenharmony_ci pte_t entry; 498862306a36Sopenharmony_ci 498962306a36Sopenharmony_ci entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); 499062306a36Sopenharmony_ci if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) 499162306a36Sopenharmony_ci update_mmu_cache(vma, address, ptep); 499262306a36Sopenharmony_ci} 499362306a36Sopenharmony_ci 499462306a36Sopenharmony_cibool is_hugetlb_entry_migration(pte_t pte) 499562306a36Sopenharmony_ci{ 499662306a36Sopenharmony_ci swp_entry_t swp; 499762306a36Sopenharmony_ci 499862306a36Sopenharmony_ci if (huge_pte_none(pte) || pte_present(pte)) 499962306a36Sopenharmony_ci return false; 500062306a36Sopenharmony_ci swp = pte_to_swp_entry(pte); 500162306a36Sopenharmony_ci if (is_migration_entry(swp)) 500262306a36Sopenharmony_ci return true; 500362306a36Sopenharmony_ci else 500462306a36Sopenharmony_ci return false; 500562306a36Sopenharmony_ci} 500662306a36Sopenharmony_ci 500762306a36Sopenharmony_cistatic bool is_hugetlb_entry_hwpoisoned(pte_t pte) 500862306a36Sopenharmony_ci{ 500962306a36Sopenharmony_ci swp_entry_t swp; 501062306a36Sopenharmony_ci 501162306a36Sopenharmony_ci if (huge_pte_none(pte) || pte_present(pte)) 501262306a36Sopenharmony_ci return false; 501362306a36Sopenharmony_ci swp = pte_to_swp_entry(pte); 501462306a36Sopenharmony_ci if (is_hwpoison_entry(swp)) 501562306a36Sopenharmony_ci return true; 501662306a36Sopenharmony_ci else 501762306a36Sopenharmony_ci return false; 501862306a36Sopenharmony_ci} 501962306a36Sopenharmony_ci 502062306a36Sopenharmony_cistatic void 502162306a36Sopenharmony_cihugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, 502262306a36Sopenharmony_ci struct folio *new_folio, pte_t old, unsigned long sz) 502362306a36Sopenharmony_ci{ 502462306a36Sopenharmony_ci pte_t newpte = make_huge_pte(vma, &new_folio->page, 1); 502562306a36Sopenharmony_ci 502662306a36Sopenharmony_ci 
__folio_mark_uptodate(new_folio); 502762306a36Sopenharmony_ci hugepage_add_new_anon_rmap(new_folio, vma, addr); 502862306a36Sopenharmony_ci if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old)) 502962306a36Sopenharmony_ci newpte = huge_pte_mkuffd_wp(newpte); 503062306a36Sopenharmony_ci set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz); 503162306a36Sopenharmony_ci hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); 503262306a36Sopenharmony_ci folio_set_hugetlb_migratable(new_folio); 503362306a36Sopenharmony_ci} 503462306a36Sopenharmony_ci 503562306a36Sopenharmony_ciint copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 503662306a36Sopenharmony_ci struct vm_area_struct *dst_vma, 503762306a36Sopenharmony_ci struct vm_area_struct *src_vma) 503862306a36Sopenharmony_ci{ 503962306a36Sopenharmony_ci pte_t *src_pte, *dst_pte, entry; 504062306a36Sopenharmony_ci struct folio *pte_folio; 504162306a36Sopenharmony_ci unsigned long addr; 504262306a36Sopenharmony_ci bool cow = is_cow_mapping(src_vma->vm_flags); 504362306a36Sopenharmony_ci struct hstate *h = hstate_vma(src_vma); 504462306a36Sopenharmony_ci unsigned long sz = huge_page_size(h); 504562306a36Sopenharmony_ci unsigned long npages = pages_per_huge_page(h); 504662306a36Sopenharmony_ci struct mmu_notifier_range range; 504762306a36Sopenharmony_ci unsigned long last_addr_mask; 504862306a36Sopenharmony_ci int ret = 0; 504962306a36Sopenharmony_ci 505062306a36Sopenharmony_ci if (cow) { 505162306a36Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src, 505262306a36Sopenharmony_ci src_vma->vm_start, 505362306a36Sopenharmony_ci src_vma->vm_end); 505462306a36Sopenharmony_ci mmu_notifier_invalidate_range_start(&range); 505562306a36Sopenharmony_ci vma_assert_write_locked(src_vma); 505662306a36Sopenharmony_ci raw_write_seqcount_begin(&src->write_protect_seq); 505762306a36Sopenharmony_ci } else { 505862306a36Sopenharmony_ci /* 505962306a36Sopenharmony_ci * For shared mappings the vma lock must be held before 506062306a36Sopenharmony_ci * calling hugetlb_walk() in the src vma. Otherwise, the 506162306a36Sopenharmony_ci * returned ptep could go away if part of a shared pmd and 506262306a36Sopenharmony_ci * another thread calls huge_pmd_unshare. 506362306a36Sopenharmony_ci */ 506462306a36Sopenharmony_ci hugetlb_vma_lock_read(src_vma); 506562306a36Sopenharmony_ci } 506662306a36Sopenharmony_ci 506762306a36Sopenharmony_ci last_addr_mask = hugetlb_mask_last_page(h); 506862306a36Sopenharmony_ci for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { 506962306a36Sopenharmony_ci spinlock_t *src_ptl, *dst_ptl; 507062306a36Sopenharmony_ci src_pte = hugetlb_walk(src_vma, addr, sz); 507162306a36Sopenharmony_ci if (!src_pte) { 507262306a36Sopenharmony_ci addr |= last_addr_mask; 507362306a36Sopenharmony_ci continue; 507462306a36Sopenharmony_ci } 507562306a36Sopenharmony_ci dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); 507662306a36Sopenharmony_ci if (!dst_pte) { 507762306a36Sopenharmony_ci ret = -ENOMEM; 507862306a36Sopenharmony_ci break; 507962306a36Sopenharmony_ci } 508062306a36Sopenharmony_ci 508162306a36Sopenharmony_ci /* 508262306a36Sopenharmony_ci * If the pagetables are shared don't copy or take references. 508362306a36Sopenharmony_ci * 508462306a36Sopenharmony_ci * dst_pte == src_pte is the common case of src/dest sharing. 508562306a36Sopenharmony_ci * However, src could have 'unshared' and dst shares with 508662306a36Sopenharmony_ci * another vma. 
So page_count of ptep page is checked instead 508762306a36Sopenharmony_ci * to reliably determine whether pte is shared. 508862306a36Sopenharmony_ci */ 508962306a36Sopenharmony_ci if (page_count(virt_to_page(dst_pte)) > 1) { 509062306a36Sopenharmony_ci addr |= last_addr_mask; 509162306a36Sopenharmony_ci continue; 509262306a36Sopenharmony_ci } 509362306a36Sopenharmony_ci 509462306a36Sopenharmony_ci dst_ptl = huge_pte_lock(h, dst, dst_pte); 509562306a36Sopenharmony_ci src_ptl = huge_pte_lockptr(h, src, src_pte); 509662306a36Sopenharmony_ci spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 509762306a36Sopenharmony_ci entry = huge_ptep_get(src_pte); 509862306a36Sopenharmony_ciagain: 509962306a36Sopenharmony_ci if (huge_pte_none(entry)) { 510062306a36Sopenharmony_ci /* 510162306a36Sopenharmony_ci * Skip if src entry none. 510262306a36Sopenharmony_ci */ 510362306a36Sopenharmony_ci ; 510462306a36Sopenharmony_ci } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { 510562306a36Sopenharmony_ci if (!userfaultfd_wp(dst_vma)) 510662306a36Sopenharmony_ci entry = huge_pte_clear_uffd_wp(entry); 510762306a36Sopenharmony_ci set_huge_pte_at(dst, addr, dst_pte, entry, sz); 510862306a36Sopenharmony_ci } else if (unlikely(is_hugetlb_entry_migration(entry))) { 510962306a36Sopenharmony_ci swp_entry_t swp_entry = pte_to_swp_entry(entry); 511062306a36Sopenharmony_ci bool uffd_wp = pte_swp_uffd_wp(entry); 511162306a36Sopenharmony_ci 511262306a36Sopenharmony_ci if (!is_readable_migration_entry(swp_entry) && cow) { 511362306a36Sopenharmony_ci /* 511462306a36Sopenharmony_ci * COW mappings require pages in both 511562306a36Sopenharmony_ci * parent and child to be set to read. 511662306a36Sopenharmony_ci */ 511762306a36Sopenharmony_ci swp_entry = make_readable_migration_entry( 511862306a36Sopenharmony_ci swp_offset(swp_entry)); 511962306a36Sopenharmony_ci entry = swp_entry_to_pte(swp_entry); 512062306a36Sopenharmony_ci if (userfaultfd_wp(src_vma) && uffd_wp) 512162306a36Sopenharmony_ci entry = pte_swp_mkuffd_wp(entry); 512262306a36Sopenharmony_ci set_huge_pte_at(src, addr, src_pte, entry, sz); 512362306a36Sopenharmony_ci } 512462306a36Sopenharmony_ci if (!userfaultfd_wp(dst_vma)) 512562306a36Sopenharmony_ci entry = huge_pte_clear_uffd_wp(entry); 512662306a36Sopenharmony_ci set_huge_pte_at(dst, addr, dst_pte, entry, sz); 512762306a36Sopenharmony_ci } else if (unlikely(is_pte_marker(entry))) { 512862306a36Sopenharmony_ci pte_marker marker = copy_pte_marker( 512962306a36Sopenharmony_ci pte_to_swp_entry(entry), dst_vma); 513062306a36Sopenharmony_ci 513162306a36Sopenharmony_ci if (marker) 513262306a36Sopenharmony_ci set_huge_pte_at(dst, addr, dst_pte, 513362306a36Sopenharmony_ci make_pte_marker(marker), sz); 513462306a36Sopenharmony_ci } else { 513562306a36Sopenharmony_ci entry = huge_ptep_get(src_pte); 513662306a36Sopenharmony_ci pte_folio = page_folio(pte_page(entry)); 513762306a36Sopenharmony_ci folio_get(pte_folio); 513862306a36Sopenharmony_ci 513962306a36Sopenharmony_ci /* 514062306a36Sopenharmony_ci * Failing to duplicate the anon rmap is a rare case 514162306a36Sopenharmony_ci * where we see pinned hugetlb pages while they're 514262306a36Sopenharmony_ci * prone to COW. We need to do the COW earlier during 514362306a36Sopenharmony_ci * fork. 514462306a36Sopenharmony_ci * 514562306a36Sopenharmony_ci * When pre-allocating the page or copying data, we 514662306a36Sopenharmony_ci * need to be without the pgtable locks since we could 514762306a36Sopenharmony_ci * sleep during the process. 
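			 * (In other words, page_try_dup_anon_rmap() failing
			 * below means the page may be pinned, so the child
			 * gets its own copy at fork time instead of at a
			 * later COW fault.)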
514862306a36Sopenharmony_ci */ 514962306a36Sopenharmony_ci if (!folio_test_anon(pte_folio)) { 515062306a36Sopenharmony_ci page_dup_file_rmap(&pte_folio->page, true); 515162306a36Sopenharmony_ci } else if (page_try_dup_anon_rmap(&pte_folio->page, 515262306a36Sopenharmony_ci true, src_vma)) { 515362306a36Sopenharmony_ci pte_t src_pte_old = entry; 515462306a36Sopenharmony_ci struct folio *new_folio; 515562306a36Sopenharmony_ci 515662306a36Sopenharmony_ci spin_unlock(src_ptl); 515762306a36Sopenharmony_ci spin_unlock(dst_ptl); 515862306a36Sopenharmony_ci /* Do not use reserve as it's private owned */ 515962306a36Sopenharmony_ci new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); 516062306a36Sopenharmony_ci if (IS_ERR(new_folio)) { 516162306a36Sopenharmony_ci folio_put(pte_folio); 516262306a36Sopenharmony_ci ret = PTR_ERR(new_folio); 516362306a36Sopenharmony_ci break; 516462306a36Sopenharmony_ci } 516562306a36Sopenharmony_ci ret = copy_user_large_folio(new_folio, 516662306a36Sopenharmony_ci pte_folio, 516762306a36Sopenharmony_ci addr, dst_vma); 516862306a36Sopenharmony_ci folio_put(pte_folio); 516962306a36Sopenharmony_ci if (ret) { 517062306a36Sopenharmony_ci folio_put(new_folio); 517162306a36Sopenharmony_ci break; 517262306a36Sopenharmony_ci } 517362306a36Sopenharmony_ci 517462306a36Sopenharmony_ci /* Install the new hugetlb folio if src pte stable */ 517562306a36Sopenharmony_ci dst_ptl = huge_pte_lock(h, dst, dst_pte); 517662306a36Sopenharmony_ci src_ptl = huge_pte_lockptr(h, src, src_pte); 517762306a36Sopenharmony_ci spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 517862306a36Sopenharmony_ci entry = huge_ptep_get(src_pte); 517962306a36Sopenharmony_ci if (!pte_same(src_pte_old, entry)) { 518062306a36Sopenharmony_ci restore_reserve_on_error(h, dst_vma, addr, 518162306a36Sopenharmony_ci new_folio); 518262306a36Sopenharmony_ci folio_put(new_folio); 518362306a36Sopenharmony_ci /* huge_ptep of dst_pte won't change as in child */ 518462306a36Sopenharmony_ci goto again; 518562306a36Sopenharmony_ci } 518662306a36Sopenharmony_ci hugetlb_install_folio(dst_vma, dst_pte, addr, 518762306a36Sopenharmony_ci new_folio, src_pte_old, sz); 518862306a36Sopenharmony_ci spin_unlock(src_ptl); 518962306a36Sopenharmony_ci spin_unlock(dst_ptl); 519062306a36Sopenharmony_ci continue; 519162306a36Sopenharmony_ci } 519262306a36Sopenharmony_ci 519362306a36Sopenharmony_ci if (cow) { 519462306a36Sopenharmony_ci /* 519562306a36Sopenharmony_ci * No need to notify as we are downgrading page 519662306a36Sopenharmony_ci * table protection not changing it to point 519762306a36Sopenharmony_ci * to a new page. 
519862306a36Sopenharmony_ci * 519962306a36Sopenharmony_ci * See Documentation/mm/mmu_notifier.rst 520062306a36Sopenharmony_ci */ 520162306a36Sopenharmony_ci huge_ptep_set_wrprotect(src, addr, src_pte); 520262306a36Sopenharmony_ci entry = huge_pte_wrprotect(entry); 520362306a36Sopenharmony_ci } 520462306a36Sopenharmony_ci 520562306a36Sopenharmony_ci if (!userfaultfd_wp(dst_vma)) 520662306a36Sopenharmony_ci entry = huge_pte_clear_uffd_wp(entry); 520762306a36Sopenharmony_ci 520862306a36Sopenharmony_ci set_huge_pte_at(dst, addr, dst_pte, entry, sz); 520962306a36Sopenharmony_ci hugetlb_count_add(npages, dst); 521062306a36Sopenharmony_ci } 521162306a36Sopenharmony_ci spin_unlock(src_ptl); 521262306a36Sopenharmony_ci spin_unlock(dst_ptl); 521362306a36Sopenharmony_ci } 521462306a36Sopenharmony_ci 521562306a36Sopenharmony_ci if (cow) { 521662306a36Sopenharmony_ci raw_write_seqcount_end(&src->write_protect_seq); 521762306a36Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 521862306a36Sopenharmony_ci } else { 521962306a36Sopenharmony_ci hugetlb_vma_unlock_read(src_vma); 522062306a36Sopenharmony_ci } 522162306a36Sopenharmony_ci 522262306a36Sopenharmony_ci return ret; 522362306a36Sopenharmony_ci} 522462306a36Sopenharmony_ci 522562306a36Sopenharmony_cistatic void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, 522662306a36Sopenharmony_ci unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte, 522762306a36Sopenharmony_ci unsigned long sz) 522862306a36Sopenharmony_ci{ 522962306a36Sopenharmony_ci struct hstate *h = hstate_vma(vma); 523062306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 523162306a36Sopenharmony_ci spinlock_t *src_ptl, *dst_ptl; 523262306a36Sopenharmony_ci pte_t pte; 523362306a36Sopenharmony_ci 523462306a36Sopenharmony_ci dst_ptl = huge_pte_lock(h, mm, dst_pte); 523562306a36Sopenharmony_ci src_ptl = huge_pte_lockptr(h, mm, src_pte); 523662306a36Sopenharmony_ci 523762306a36Sopenharmony_ci /* 523862306a36Sopenharmony_ci * We don't have to worry about the ordering of src and dst ptlocks 523962306a36Sopenharmony_ci * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock. 
524062306a36Sopenharmony_ci */ 524162306a36Sopenharmony_ci if (src_ptl != dst_ptl) 524262306a36Sopenharmony_ci spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 524362306a36Sopenharmony_ci 524462306a36Sopenharmony_ci pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); 524562306a36Sopenharmony_ci set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); 524662306a36Sopenharmony_ci 524762306a36Sopenharmony_ci if (src_ptl != dst_ptl) 524862306a36Sopenharmony_ci spin_unlock(src_ptl); 524962306a36Sopenharmony_ci spin_unlock(dst_ptl); 525062306a36Sopenharmony_ci} 525162306a36Sopenharmony_ci 525262306a36Sopenharmony_ciint move_hugetlb_page_tables(struct vm_area_struct *vma, 525362306a36Sopenharmony_ci struct vm_area_struct *new_vma, 525462306a36Sopenharmony_ci unsigned long old_addr, unsigned long new_addr, 525562306a36Sopenharmony_ci unsigned long len) 525662306a36Sopenharmony_ci{ 525762306a36Sopenharmony_ci struct hstate *h = hstate_vma(vma); 525862306a36Sopenharmony_ci struct address_space *mapping = vma->vm_file->f_mapping; 525962306a36Sopenharmony_ci unsigned long sz = huge_page_size(h); 526062306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 526162306a36Sopenharmony_ci unsigned long old_end = old_addr + len; 526262306a36Sopenharmony_ci unsigned long last_addr_mask; 526362306a36Sopenharmony_ci pte_t *src_pte, *dst_pte; 526462306a36Sopenharmony_ci struct mmu_notifier_range range; 526562306a36Sopenharmony_ci bool shared_pmd = false; 526662306a36Sopenharmony_ci 526762306a36Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, 526862306a36Sopenharmony_ci old_end); 526962306a36Sopenharmony_ci adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 527062306a36Sopenharmony_ci /* 527162306a36Sopenharmony_ci * In case of shared PMDs, we should cover the maximum possible 527262306a36Sopenharmony_ci * range. 
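	 * (adjust_range_if_pmd_sharing_possible() above may have widened
	 * [old_addr, old_end) out to PUD_SIZE boundaries for this reason,
	 * which is also why the shared_pmd case below flushes the whole
	 * widened range.)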
527362306a36Sopenharmony_ci */ 527462306a36Sopenharmony_ci flush_cache_range(vma, range.start, range.end); 527562306a36Sopenharmony_ci 527662306a36Sopenharmony_ci mmu_notifier_invalidate_range_start(&range); 527762306a36Sopenharmony_ci last_addr_mask = hugetlb_mask_last_page(h); 527862306a36Sopenharmony_ci /* Prevent race with file truncation */ 527962306a36Sopenharmony_ci hugetlb_vma_lock_write(vma); 528062306a36Sopenharmony_ci i_mmap_lock_write(mapping); 528162306a36Sopenharmony_ci for (; old_addr < old_end; old_addr += sz, new_addr += sz) { 528262306a36Sopenharmony_ci src_pte = hugetlb_walk(vma, old_addr, sz); 528362306a36Sopenharmony_ci if (!src_pte) { 528462306a36Sopenharmony_ci old_addr |= last_addr_mask; 528562306a36Sopenharmony_ci new_addr |= last_addr_mask; 528662306a36Sopenharmony_ci continue; 528762306a36Sopenharmony_ci } 528862306a36Sopenharmony_ci if (huge_pte_none(huge_ptep_get(src_pte))) 528962306a36Sopenharmony_ci continue; 529062306a36Sopenharmony_ci 529162306a36Sopenharmony_ci if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { 529262306a36Sopenharmony_ci shared_pmd = true; 529362306a36Sopenharmony_ci old_addr |= last_addr_mask; 529462306a36Sopenharmony_ci new_addr |= last_addr_mask; 529562306a36Sopenharmony_ci continue; 529662306a36Sopenharmony_ci } 529762306a36Sopenharmony_ci 529862306a36Sopenharmony_ci dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz); 529962306a36Sopenharmony_ci if (!dst_pte) 530062306a36Sopenharmony_ci break; 530162306a36Sopenharmony_ci 530262306a36Sopenharmony_ci move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz); 530362306a36Sopenharmony_ci } 530462306a36Sopenharmony_ci 530562306a36Sopenharmony_ci if (shared_pmd) 530662306a36Sopenharmony_ci flush_hugetlb_tlb_range(vma, range.start, range.end); 530762306a36Sopenharmony_ci else 530862306a36Sopenharmony_ci flush_hugetlb_tlb_range(vma, old_end - len, old_end); 530962306a36Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 531062306a36Sopenharmony_ci i_mmap_unlock_write(mapping); 531162306a36Sopenharmony_ci hugetlb_vma_unlock_write(vma); 531262306a36Sopenharmony_ci 531362306a36Sopenharmony_ci return len + old_addr - old_end; 531462306a36Sopenharmony_ci} 531562306a36Sopenharmony_ci 531662306a36Sopenharmony_civoid __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 531762306a36Sopenharmony_ci unsigned long start, unsigned long end, 531862306a36Sopenharmony_ci struct page *ref_page, zap_flags_t zap_flags) 531962306a36Sopenharmony_ci{ 532062306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 532162306a36Sopenharmony_ci unsigned long address; 532262306a36Sopenharmony_ci pte_t *ptep; 532362306a36Sopenharmony_ci pte_t pte; 532462306a36Sopenharmony_ci spinlock_t *ptl; 532562306a36Sopenharmony_ci struct page *page; 532662306a36Sopenharmony_ci struct hstate *h = hstate_vma(vma); 532762306a36Sopenharmony_ci unsigned long sz = huge_page_size(h); 532862306a36Sopenharmony_ci unsigned long last_addr_mask; 532962306a36Sopenharmony_ci bool force_flush = false; 533062306a36Sopenharmony_ci 533162306a36Sopenharmony_ci WARN_ON(!is_vm_hugetlb_page(vma)); 533262306a36Sopenharmony_ci BUG_ON(start & ~huge_page_mask(h)); 533362306a36Sopenharmony_ci BUG_ON(end & ~huge_page_mask(h)); 533462306a36Sopenharmony_ci 533562306a36Sopenharmony_ci /* 533662306a36Sopenharmony_ci * This is a hugetlb vma, all the pte entries should point 533762306a36Sopenharmony_ci * to huge page. 
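	 * (Hence the walk below advances in huge_page_size(h) steps and
	 * tlb_change_page_size() is told the huge page size up front.)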
533862306a36Sopenharmony_ci */ 533962306a36Sopenharmony_ci tlb_change_page_size(tlb, sz); 534062306a36Sopenharmony_ci tlb_start_vma(tlb, vma); 534162306a36Sopenharmony_ci 534262306a36Sopenharmony_ci last_addr_mask = hugetlb_mask_last_page(h); 534362306a36Sopenharmony_ci address = start; 534462306a36Sopenharmony_ci for (; address < end; address += sz) { 534562306a36Sopenharmony_ci ptep = hugetlb_walk(vma, address, sz); 534662306a36Sopenharmony_ci if (!ptep) { 534762306a36Sopenharmony_ci address |= last_addr_mask; 534862306a36Sopenharmony_ci continue; 534962306a36Sopenharmony_ci } 535062306a36Sopenharmony_ci 535162306a36Sopenharmony_ci ptl = huge_pte_lock(h, mm, ptep); 535262306a36Sopenharmony_ci if (huge_pmd_unshare(mm, vma, address, ptep)) { 535362306a36Sopenharmony_ci spin_unlock(ptl); 535462306a36Sopenharmony_ci tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); 535562306a36Sopenharmony_ci force_flush = true; 535662306a36Sopenharmony_ci address |= last_addr_mask; 535762306a36Sopenharmony_ci continue; 535862306a36Sopenharmony_ci } 535962306a36Sopenharmony_ci 536062306a36Sopenharmony_ci pte = huge_ptep_get(ptep); 536162306a36Sopenharmony_ci if (huge_pte_none(pte)) { 536262306a36Sopenharmony_ci spin_unlock(ptl); 536362306a36Sopenharmony_ci continue; 536462306a36Sopenharmony_ci } 536562306a36Sopenharmony_ci 536662306a36Sopenharmony_ci /* 536762306a36Sopenharmony_ci * Migrating hugepage or HWPoisoned hugepage is already 536862306a36Sopenharmony_ci * unmapped and its refcount is dropped, so just clear pte here. 536962306a36Sopenharmony_ci */ 537062306a36Sopenharmony_ci if (unlikely(!pte_present(pte))) { 537162306a36Sopenharmony_ci /* 537262306a36Sopenharmony_ci * If the pte was wr-protected by uffd-wp in any of the 537362306a36Sopenharmony_ci * swap forms, meanwhile the caller does not want to 537462306a36Sopenharmony_ci * drop the uffd-wp bit in this zap, then replace the 537562306a36Sopenharmony_ci * pte with a marker. 537662306a36Sopenharmony_ci */ 537762306a36Sopenharmony_ci if (pte_swp_uffd_wp_any(pte) && 537862306a36Sopenharmony_ci !(zap_flags & ZAP_FLAG_DROP_MARKER)) 537962306a36Sopenharmony_ci set_huge_pte_at(mm, address, ptep, 538062306a36Sopenharmony_ci make_pte_marker(PTE_MARKER_UFFD_WP), 538162306a36Sopenharmony_ci sz); 538262306a36Sopenharmony_ci else 538362306a36Sopenharmony_ci huge_pte_clear(mm, address, ptep, sz); 538462306a36Sopenharmony_ci spin_unlock(ptl); 538562306a36Sopenharmony_ci continue; 538662306a36Sopenharmony_ci } 538762306a36Sopenharmony_ci 538862306a36Sopenharmony_ci page = pte_page(pte); 538962306a36Sopenharmony_ci /* 539062306a36Sopenharmony_ci * If a reference page is supplied, it is because a specific 539162306a36Sopenharmony_ci * page is being unmapped, not a range. Ensure the page we 539262306a36Sopenharmony_ci * are about to unmap is the actual page of interest. 
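		 * (One illustrative caller: unmap_ref_private() passes the
		 * old page here during a failed hugetlb COW, so only that
		 * page is unmapped.)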
539362306a36Sopenharmony_ci */ 539462306a36Sopenharmony_ci if (ref_page) { 539562306a36Sopenharmony_ci if (page != ref_page) { 539662306a36Sopenharmony_ci spin_unlock(ptl); 539762306a36Sopenharmony_ci continue; 539862306a36Sopenharmony_ci } 539962306a36Sopenharmony_ci /* 540062306a36Sopenharmony_ci * Mark the VMA as having unmapped its page so that 540162306a36Sopenharmony_ci * future faults in this VMA will fail rather than 540262306a36Sopenharmony_ci * looking like data was lost 540362306a36Sopenharmony_ci */ 540462306a36Sopenharmony_ci set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); 540562306a36Sopenharmony_ci } 540662306a36Sopenharmony_ci 540762306a36Sopenharmony_ci pte = huge_ptep_get_and_clear(mm, address, ptep); 540862306a36Sopenharmony_ci tlb_remove_huge_tlb_entry(h, tlb, ptep, address); 540962306a36Sopenharmony_ci if (huge_pte_dirty(pte)) 541062306a36Sopenharmony_ci set_page_dirty(page); 541162306a36Sopenharmony_ci /* Leave a uffd-wp pte marker if needed */ 541262306a36Sopenharmony_ci if (huge_pte_uffd_wp(pte) && 541362306a36Sopenharmony_ci !(zap_flags & ZAP_FLAG_DROP_MARKER)) 541462306a36Sopenharmony_ci set_huge_pte_at(mm, address, ptep, 541562306a36Sopenharmony_ci make_pte_marker(PTE_MARKER_UFFD_WP), 541662306a36Sopenharmony_ci sz); 541762306a36Sopenharmony_ci hugetlb_count_sub(pages_per_huge_page(h), mm); 541862306a36Sopenharmony_ci page_remove_rmap(page, vma, true); 541962306a36Sopenharmony_ci 542062306a36Sopenharmony_ci spin_unlock(ptl); 542162306a36Sopenharmony_ci tlb_remove_page_size(tlb, page, huge_page_size(h)); 542262306a36Sopenharmony_ci /* 542362306a36Sopenharmony_ci * Bail out after unmapping reference page if supplied 542462306a36Sopenharmony_ci */ 542562306a36Sopenharmony_ci if (ref_page) 542662306a36Sopenharmony_ci break; 542762306a36Sopenharmony_ci } 542862306a36Sopenharmony_ci tlb_end_vma(tlb, vma); 542962306a36Sopenharmony_ci 543062306a36Sopenharmony_ci /* 543162306a36Sopenharmony_ci * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We 543262306a36Sopenharmony_ci * could defer the flush until now, since by holding i_mmap_rwsem we 543362306a36Sopenharmony_ci * guaranteed that the last reference would not be dropped. But we must 543462306a36Sopenharmony_ci * do the flushing before we return, as otherwise i_mmap_rwsem will be 543562306a36Sopenharmony_ci * dropped and the last reference to the shared PMDs page might be 543662306a36Sopenharmony_ci * dropped as well. 543762306a36Sopenharmony_ci * 543862306a36Sopenharmony_ci * In theory we could defer the freeing of the PMD pages as well, but 543962306a36Sopenharmony_ci * huge_pmd_unshare() relies on the exact page_count for the PMD page to 544062306a36Sopenharmony_ci * detect sharing, so we cannot defer the release of the page either. 544162306a36Sopenharmony_ci * Instead, do flush now.
544262306a36Sopenharmony_ci */ 544362306a36Sopenharmony_ci if (force_flush) 544462306a36Sopenharmony_ci tlb_flush_mmu_tlbonly(tlb); 544562306a36Sopenharmony_ci} 544662306a36Sopenharmony_ci 544762306a36Sopenharmony_civoid __hugetlb_zap_begin(struct vm_area_struct *vma, 544862306a36Sopenharmony_ci unsigned long *start, unsigned long *end) 544962306a36Sopenharmony_ci{ 545062306a36Sopenharmony_ci if (!vma->vm_file) /* hugetlbfs_file_mmap error */ 545162306a36Sopenharmony_ci return; 545262306a36Sopenharmony_ci 545362306a36Sopenharmony_ci adjust_range_if_pmd_sharing_possible(vma, start, end); 545462306a36Sopenharmony_ci hugetlb_vma_lock_write(vma); 545562306a36Sopenharmony_ci if (vma->vm_file) 545662306a36Sopenharmony_ci i_mmap_lock_write(vma->vm_file->f_mapping); 545762306a36Sopenharmony_ci} 545862306a36Sopenharmony_ci 545962306a36Sopenharmony_civoid __hugetlb_zap_end(struct vm_area_struct *vma, 546062306a36Sopenharmony_ci struct zap_details *details) 546162306a36Sopenharmony_ci{ 546262306a36Sopenharmony_ci zap_flags_t zap_flags = details ? details->zap_flags : 0; 546362306a36Sopenharmony_ci 546462306a36Sopenharmony_ci if (!vma->vm_file) /* hugetlbfs_file_mmap error */ 546562306a36Sopenharmony_ci return; 546662306a36Sopenharmony_ci 546762306a36Sopenharmony_ci if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */ 546862306a36Sopenharmony_ci /* 546962306a36Sopenharmony_ci * Unlock and free the vma lock before releasing i_mmap_rwsem. 547062306a36Sopenharmony_ci * When the vma_lock is freed, this makes the vma ineligible 547162306a36Sopenharmony_ci * for pmd sharing. And, i_mmap_rwsem is required to set up 547262306a36Sopenharmony_ci * pmd sharing. This is important as page tables for this 547362306a36Sopenharmony_ci * unmapped range will be asynchronously deleted. If the page 547462306a36Sopenharmony_ci * tables are shared, there will be issues when accessed by 547562306a36Sopenharmony_ci * someone else.
547662306a36Sopenharmony_ci */ 547762306a36Sopenharmony_ci __hugetlb_vma_unlock_write_free(vma); 547862306a36Sopenharmony_ci } else { 547962306a36Sopenharmony_ci hugetlb_vma_unlock_write(vma); 548062306a36Sopenharmony_ci } 548162306a36Sopenharmony_ci 548262306a36Sopenharmony_ci if (vma->vm_file) 548362306a36Sopenharmony_ci i_mmap_unlock_write(vma->vm_file->f_mapping); 548462306a36Sopenharmony_ci} 548562306a36Sopenharmony_ci 548662306a36Sopenharmony_civoid unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 548762306a36Sopenharmony_ci unsigned long end, struct page *ref_page, 548862306a36Sopenharmony_ci zap_flags_t zap_flags) 548962306a36Sopenharmony_ci{ 549062306a36Sopenharmony_ci struct mmu_notifier_range range; 549162306a36Sopenharmony_ci struct mmu_gather tlb; 549262306a36Sopenharmony_ci 549362306a36Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 549462306a36Sopenharmony_ci start, end); 549562306a36Sopenharmony_ci adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 549662306a36Sopenharmony_ci mmu_notifier_invalidate_range_start(&range); 549762306a36Sopenharmony_ci tlb_gather_mmu(&tlb, vma->vm_mm); 549862306a36Sopenharmony_ci 549962306a36Sopenharmony_ci __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); 550062306a36Sopenharmony_ci 550162306a36Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 550262306a36Sopenharmony_ci tlb_finish_mmu(&tlb); 550362306a36Sopenharmony_ci} 550462306a36Sopenharmony_ci 550562306a36Sopenharmony_ci/* 550662306a36Sopenharmony_ci * This is called when the original mapper is failing to COW a MAP_PRIVATE 550762306a36Sopenharmony_ci * mapping it owns the reserve page for. The intention is to unmap the page 550862306a36Sopenharmony_ci * from other VMAs and let the children be SIGKILLed if they are faulting the 550962306a36Sopenharmony_ci * same region. 551062306a36Sopenharmony_ci */ 551162306a36Sopenharmony_cistatic void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 551262306a36Sopenharmony_ci struct page *page, unsigned long address) 551362306a36Sopenharmony_ci{ 551462306a36Sopenharmony_ci struct hstate *h = hstate_vma(vma); 551562306a36Sopenharmony_ci struct vm_area_struct *iter_vma; 551662306a36Sopenharmony_ci struct address_space *mapping; 551762306a36Sopenharmony_ci pgoff_t pgoff; 551862306a36Sopenharmony_ci 551962306a36Sopenharmony_ci /* 552062306a36Sopenharmony_ci * vm_pgoff is in PAGE_SIZE units, hence the different calculation 552162306a36Sopenharmony_ci * from page cache lookup which is in HPAGE_SIZE units. 552262306a36Sopenharmony_ci */ 552362306a36Sopenharmony_ci address = address & huge_page_mask(h); 552462306a36Sopenharmony_ci pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + 552562306a36Sopenharmony_ci vma->vm_pgoff; 552662306a36Sopenharmony_ci mapping = vma->vm_file->f_mapping; 552762306a36Sopenharmony_ci 552862306a36Sopenharmony_ci /* 552962306a36Sopenharmony_ci * Take the mapping lock for the duration of the table walk. 
As 553062306a36Sopenharmony_ci * this mapping should be shared between all the VMAs, 553162306a36Sopenharmony_ci * __unmap_hugepage_range() is called as the lock is already held 553262306a36Sopenharmony_ci */ 553362306a36Sopenharmony_ci i_mmap_lock_write(mapping); 553462306a36Sopenharmony_ci vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { 553562306a36Sopenharmony_ci /* Do not unmap the current VMA */ 553662306a36Sopenharmony_ci if (iter_vma == vma) 553762306a36Sopenharmony_ci continue; 553862306a36Sopenharmony_ci 553962306a36Sopenharmony_ci /* 554062306a36Sopenharmony_ci * Shared VMAs have their own reserves and do not affect 554162306a36Sopenharmony_ci * MAP_PRIVATE accounting but it is possible that a shared 554262306a36Sopenharmony_ci * VMA is using the same page so check and skip such VMAs. 554362306a36Sopenharmony_ci */ 554462306a36Sopenharmony_ci if (iter_vma->vm_flags & VM_MAYSHARE) 554562306a36Sopenharmony_ci continue; 554662306a36Sopenharmony_ci 554762306a36Sopenharmony_ci /* 554862306a36Sopenharmony_ci * Unmap the page from other VMAs without their own reserves. 554962306a36Sopenharmony_ci * They get marked to be SIGKILLed if they fault in these 555062306a36Sopenharmony_ci * areas. This is because a future no-page fault on this VMA 555162306a36Sopenharmony_ci * could insert a zeroed page instead of the data existing 555262306a36Sopenharmony_ci * from the time of fork. This would look like data corruption 555362306a36Sopenharmony_ci */ 555462306a36Sopenharmony_ci if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 555562306a36Sopenharmony_ci unmap_hugepage_range(iter_vma, address, 555662306a36Sopenharmony_ci address + huge_page_size(h), page, 0); 555762306a36Sopenharmony_ci } 555862306a36Sopenharmony_ci i_mmap_unlock_write(mapping); 555962306a36Sopenharmony_ci} 556062306a36Sopenharmony_ci 556162306a36Sopenharmony_ci/* 556262306a36Sopenharmony_ci * hugetlb_wp() should be called with page lock of the original hugepage held. 556362306a36Sopenharmony_ci * Called with hugetlb_fault_mutex_table held and pte_page locked so we 556462306a36Sopenharmony_ci * cannot race with other handlers or page migration. 556562306a36Sopenharmony_ci * Keep the pte_same checks anyway to make transition from the mutex easier. 556662306a36Sopenharmony_ci */ 556762306a36Sopenharmony_cistatic vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, 556862306a36Sopenharmony_ci unsigned long address, pte_t *ptep, unsigned int flags, 556962306a36Sopenharmony_ci struct folio *pagecache_folio, spinlock_t *ptl) 557062306a36Sopenharmony_ci{ 557162306a36Sopenharmony_ci const bool unshare = flags & FAULT_FLAG_UNSHARE; 557262306a36Sopenharmony_ci pte_t pte = huge_ptep_get(ptep); 557362306a36Sopenharmony_ci struct hstate *h = hstate_vma(vma); 557462306a36Sopenharmony_ci struct folio *old_folio; 557562306a36Sopenharmony_ci struct folio *new_folio; 557662306a36Sopenharmony_ci int outside_reserve = 0; 557762306a36Sopenharmony_ci vm_fault_t ret = 0; 557862306a36Sopenharmony_ci unsigned long haddr = address & huge_page_mask(h); 557962306a36Sopenharmony_ci struct mmu_notifier_range range; 558062306a36Sopenharmony_ci 558162306a36Sopenharmony_ci /* 558262306a36Sopenharmony_ci * Never handle CoW for uffd-wp protected pages. It should be only 558362306a36Sopenharmony_ci * handled when the uffd-wp protection is removed. 
558462306a36Sopenharmony_ci * 558562306a36Sopenharmony_ci * Note that only the CoW optimization path (in hugetlb_no_page()) 558662306a36Sopenharmony_ci * can trigger this, because hugetlb_fault() will always resolve 558762306a36Sopenharmony_ci * uffd-wp bit first. 558862306a36Sopenharmony_ci */ 558962306a36Sopenharmony_ci if (!unshare && huge_pte_uffd_wp(pte)) 559062306a36Sopenharmony_ci return 0; 559162306a36Sopenharmony_ci 559262306a36Sopenharmony_ci /* 559362306a36Sopenharmony_ci * hugetlb does not support FOLL_FORCE-style write faults that keep the 559462306a36Sopenharmony_ci * PTE mapped R/O such as maybe_mkwrite() would do. 559562306a36Sopenharmony_ci */ 559662306a36Sopenharmony_ci if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE))) 559762306a36Sopenharmony_ci return VM_FAULT_SIGSEGV; 559862306a36Sopenharmony_ci 559962306a36Sopenharmony_ci /* Let's take out MAP_SHARED mappings first. */ 560062306a36Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE) { 560162306a36Sopenharmony_ci set_huge_ptep_writable(vma, haddr, ptep); 560262306a36Sopenharmony_ci return 0; 560362306a36Sopenharmony_ci } 560462306a36Sopenharmony_ci 560562306a36Sopenharmony_ci old_folio = page_folio(pte_page(pte)); 560662306a36Sopenharmony_ci 560762306a36Sopenharmony_ci delayacct_wpcopy_start(); 560862306a36Sopenharmony_ci 560962306a36Sopenharmony_ciretry_avoidcopy: 561062306a36Sopenharmony_ci /* 561162306a36Sopenharmony_ci * If no-one else is actually using this page, we're the exclusive 561262306a36Sopenharmony_ci * owner and can reuse this page. 561362306a36Sopenharmony_ci */ 561462306a36Sopenharmony_ci if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { 561562306a36Sopenharmony_ci if (!PageAnonExclusive(&old_folio->page)) 561662306a36Sopenharmony_ci page_move_anon_rmap(&old_folio->page, vma); 561762306a36Sopenharmony_ci if (likely(!unshare)) 561862306a36Sopenharmony_ci set_huge_ptep_writable(vma, haddr, ptep); 561962306a36Sopenharmony_ci 562062306a36Sopenharmony_ci delayacct_wpcopy_end(); 562162306a36Sopenharmony_ci return 0; 562262306a36Sopenharmony_ci } 562362306a36Sopenharmony_ci VM_BUG_ON_PAGE(folio_test_anon(old_folio) && 562462306a36Sopenharmony_ci PageAnonExclusive(&old_folio->page), &old_folio->page); 562562306a36Sopenharmony_ci 562662306a36Sopenharmony_ci /* 562762306a36Sopenharmony_ci * If the process that created a MAP_PRIVATE mapping is about to 562862306a36Sopenharmony_ci * perform a COW due to a shared page count, attempt to satisfy 562962306a36Sopenharmony_ci * the allocation without using the existing reserves. The pagecache 563062306a36Sopenharmony_ci * page is used to determine if the reserve at this address was 563162306a36Sopenharmony_ci * consumed or not. If reserves were used, a partial faulted mapping 563262306a36Sopenharmony_ci * at the time of fork() could consume its reserves on COW instead 563362306a36Sopenharmony_ci * of the full address range. 563462306a36Sopenharmony_ci */ 563562306a36Sopenharmony_ci if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && 563662306a36Sopenharmony_ci old_folio != pagecache_folio) 563762306a36Sopenharmony_ci outside_reserve = 1; 563862306a36Sopenharmony_ci 563962306a36Sopenharmony_ci folio_get(old_folio); 564062306a36Sopenharmony_ci 564162306a36Sopenharmony_ci /* 564262306a36Sopenharmony_ci * Drop page table lock as buddy allocator may be called. It will 564362306a36Sopenharmony_ci * be acquired again before returning to the caller, as expected. 
564462306a36Sopenharmony_ci */ 564562306a36Sopenharmony_ci spin_unlock(ptl); 564662306a36Sopenharmony_ci new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve); 564762306a36Sopenharmony_ci 564862306a36Sopenharmony_ci if (IS_ERR(new_folio)) { 564962306a36Sopenharmony_ci /* 565062306a36Sopenharmony_ci * If a process owning a MAP_PRIVATE mapping fails to COW, 565162306a36Sopenharmony_ci * it is due to references held by a child and an insufficient 565262306a36Sopenharmony_ci * huge page pool. To guarantee the original mapper's 565362306a36Sopenharmony_ci * reliability, unmap the page from child processes. The child 565462306a36Sopenharmony_ci * may get SIGKILLed if it later faults. 565562306a36Sopenharmony_ci */ 565662306a36Sopenharmony_ci if (outside_reserve) { 565762306a36Sopenharmony_ci struct address_space *mapping = vma->vm_file->f_mapping; 565862306a36Sopenharmony_ci pgoff_t idx; 565962306a36Sopenharmony_ci u32 hash; 566062306a36Sopenharmony_ci 566162306a36Sopenharmony_ci folio_put(old_folio); 566262306a36Sopenharmony_ci /* 566362306a36Sopenharmony_ci * Drop hugetlb_fault_mutex and vma_lock before 566462306a36Sopenharmony_ci * unmapping. unmapping needs to hold vma_lock 566562306a36Sopenharmony_ci * in write mode. Dropping vma_lock in read mode 566662306a36Sopenharmony_ci * here is OK as COW mappings do not interact with 566762306a36Sopenharmony_ci * PMD sharing. 566862306a36Sopenharmony_ci * 566962306a36Sopenharmony_ci * Reacquire both after unmap operation. 567062306a36Sopenharmony_ci */ 567162306a36Sopenharmony_ci idx = vma_hugecache_offset(h, vma, haddr); 567262306a36Sopenharmony_ci hash = hugetlb_fault_mutex_hash(mapping, idx); 567362306a36Sopenharmony_ci hugetlb_vma_unlock_read(vma); 567462306a36Sopenharmony_ci mutex_unlock(&hugetlb_fault_mutex_table[hash]); 567562306a36Sopenharmony_ci 567662306a36Sopenharmony_ci unmap_ref_private(mm, vma, &old_folio->page, haddr); 567762306a36Sopenharmony_ci 567862306a36Sopenharmony_ci mutex_lock(&hugetlb_fault_mutex_table[hash]); 567962306a36Sopenharmony_ci hugetlb_vma_lock_read(vma); 568062306a36Sopenharmony_ci spin_lock(ptl); 568162306a36Sopenharmony_ci ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); 568262306a36Sopenharmony_ci if (likely(ptep && 568362306a36Sopenharmony_ci pte_same(huge_ptep_get(ptep), pte))) 568462306a36Sopenharmony_ci goto retry_avoidcopy; 568562306a36Sopenharmony_ci /* 568662306a36Sopenharmony_ci * A race occurred while re-acquiring the page table 568762306a36Sopenharmony_ci * lock, and our job is done. 568862306a36Sopenharmony_ci */ 568962306a36Sopenharmony_ci delayacct_wpcopy_end(); 569062306a36Sopenharmony_ci return 0; 569162306a36Sopenharmony_ci } 569262306a36Sopenharmony_ci 569362306a36Sopenharmony_ci ret = vmf_error(PTR_ERR(new_folio)); 569462306a36Sopenharmony_ci goto out_release_old; 569562306a36Sopenharmony_ci } 569662306a36Sopenharmony_ci 569762306a36Sopenharmony_ci /* 569862306a36Sopenharmony_ci * When the original hugepage is a shared one, it does not have 569962306a36Sopenharmony_ci * anon_vma prepared.
570062306a36Sopenharmony_ci */ 570162306a36Sopenharmony_ci if (unlikely(anon_vma_prepare(vma))) { 570262306a36Sopenharmony_ci ret = VM_FAULT_OOM; 570362306a36Sopenharmony_ci goto out_release_all; 570462306a36Sopenharmony_ci } 570562306a36Sopenharmony_ci 570662306a36Sopenharmony_ci if (copy_user_large_folio(new_folio, old_folio, address, vma)) { 570762306a36Sopenharmony_ci ret = VM_FAULT_HWPOISON_LARGE; 570862306a36Sopenharmony_ci goto out_release_all; 570962306a36Sopenharmony_ci } 571062306a36Sopenharmony_ci __folio_mark_uptodate(new_folio); 571162306a36Sopenharmony_ci 571262306a36Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, 571362306a36Sopenharmony_ci haddr + huge_page_size(h)); 571462306a36Sopenharmony_ci mmu_notifier_invalidate_range_start(&range); 571562306a36Sopenharmony_ci 571662306a36Sopenharmony_ci /* 571762306a36Sopenharmony_ci * Retake the page table lock to check for racing updates 571862306a36Sopenharmony_ci * before the page tables are altered 571962306a36Sopenharmony_ci */ 572062306a36Sopenharmony_ci spin_lock(ptl); 572162306a36Sopenharmony_ci ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); 572262306a36Sopenharmony_ci if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { 572362306a36Sopenharmony_ci pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); 572462306a36Sopenharmony_ci 572562306a36Sopenharmony_ci /* Break COW or unshare */ 572662306a36Sopenharmony_ci huge_ptep_clear_flush(vma, haddr, ptep); 572762306a36Sopenharmony_ci page_remove_rmap(&old_folio->page, vma, true); 572862306a36Sopenharmony_ci hugepage_add_new_anon_rmap(new_folio, vma, haddr); 572962306a36Sopenharmony_ci if (huge_pte_uffd_wp(pte)) 573062306a36Sopenharmony_ci newpte = huge_pte_mkuffd_wp(newpte); 573162306a36Sopenharmony_ci set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h)); 573262306a36Sopenharmony_ci folio_set_hugetlb_migratable(new_folio); 573362306a36Sopenharmony_ci /* Make the old page be freed below */ 573462306a36Sopenharmony_ci new_folio = old_folio; 573562306a36Sopenharmony_ci } 573662306a36Sopenharmony_ci spin_unlock(ptl); 573762306a36Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 573862306a36Sopenharmony_ciout_release_all: 573962306a36Sopenharmony_ci /* 574062306a36Sopenharmony_ci * No restore in case of successful pagetable update (Break COW or 574162306a36Sopenharmony_ci * unshare) 574262306a36Sopenharmony_ci */ 574362306a36Sopenharmony_ci if (new_folio != old_folio) 574462306a36Sopenharmony_ci restore_reserve_on_error(h, vma, haddr, new_folio); 574562306a36Sopenharmony_ci folio_put(new_folio); 574662306a36Sopenharmony_ciout_release_old: 574762306a36Sopenharmony_ci folio_put(old_folio); 574862306a36Sopenharmony_ci 574962306a36Sopenharmony_ci spin_lock(ptl); /* Caller expects lock to be held */ 575062306a36Sopenharmony_ci 575162306a36Sopenharmony_ci delayacct_wpcopy_end(); 575262306a36Sopenharmony_ci return ret; 575362306a36Sopenharmony_ci} 575462306a36Sopenharmony_ci 575562306a36Sopenharmony_ci/* 575662306a36Sopenharmony_ci * Return whether there is a pagecache page to back given address within VMA. 
575762306a36Sopenharmony_ci */ 575862306a36Sopenharmony_cistatic bool hugetlbfs_pagecache_present(struct hstate *h, 575962306a36Sopenharmony_ci struct vm_area_struct *vma, unsigned long address) 576062306a36Sopenharmony_ci{ 576162306a36Sopenharmony_ci struct address_space *mapping = vma->vm_file->f_mapping; 576262306a36Sopenharmony_ci pgoff_t idx = vma_hugecache_offset(h, vma, address); 576362306a36Sopenharmony_ci struct folio *folio; 576462306a36Sopenharmony_ci 576562306a36Sopenharmony_ci folio = filemap_get_folio(mapping, idx); 576662306a36Sopenharmony_ci if (IS_ERR(folio)) 576762306a36Sopenharmony_ci return false; 576862306a36Sopenharmony_ci folio_put(folio); 576962306a36Sopenharmony_ci return true; 577062306a36Sopenharmony_ci} 577162306a36Sopenharmony_ci 577262306a36Sopenharmony_ciint hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, 577362306a36Sopenharmony_ci pgoff_t idx) 577462306a36Sopenharmony_ci{ 577562306a36Sopenharmony_ci struct inode *inode = mapping->host; 577662306a36Sopenharmony_ci struct hstate *h = hstate_inode(inode); 577762306a36Sopenharmony_ci int err; 577862306a36Sopenharmony_ci 577962306a36Sopenharmony_ci __folio_set_locked(folio); 578062306a36Sopenharmony_ci err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL); 578162306a36Sopenharmony_ci 578262306a36Sopenharmony_ci if (unlikely(err)) { 578362306a36Sopenharmony_ci __folio_clear_locked(folio); 578462306a36Sopenharmony_ci return err; 578562306a36Sopenharmony_ci } 578662306a36Sopenharmony_ci folio_clear_hugetlb_restore_reserve(folio); 578762306a36Sopenharmony_ci 578862306a36Sopenharmony_ci /* 578962306a36Sopenharmony_ci * mark folio dirty so that it will not be removed from cache/file 579062306a36Sopenharmony_ci * by non-hugetlbfs specific code paths. 579162306a36Sopenharmony_ci */ 579262306a36Sopenharmony_ci folio_mark_dirty(folio); 579362306a36Sopenharmony_ci 579462306a36Sopenharmony_ci spin_lock(&inode->i_lock); 579562306a36Sopenharmony_ci inode->i_blocks += blocks_per_huge_page(h); 579662306a36Sopenharmony_ci spin_unlock(&inode->i_lock); 579762306a36Sopenharmony_ci return 0; 579862306a36Sopenharmony_ci} 579962306a36Sopenharmony_ci 580062306a36Sopenharmony_cistatic inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, 580162306a36Sopenharmony_ci struct address_space *mapping, 580262306a36Sopenharmony_ci pgoff_t idx, 580362306a36Sopenharmony_ci unsigned int flags, 580462306a36Sopenharmony_ci unsigned long haddr, 580562306a36Sopenharmony_ci unsigned long addr, 580662306a36Sopenharmony_ci unsigned long reason) 580762306a36Sopenharmony_ci{ 580862306a36Sopenharmony_ci u32 hash; 580962306a36Sopenharmony_ci struct vm_fault vmf = { 581062306a36Sopenharmony_ci .vma = vma, 581162306a36Sopenharmony_ci .address = haddr, 581262306a36Sopenharmony_ci .real_address = addr, 581362306a36Sopenharmony_ci .flags = flags, 581462306a36Sopenharmony_ci 581562306a36Sopenharmony_ci /* 581662306a36Sopenharmony_ci * Hard to debug if it ends up being 581762306a36Sopenharmony_ci * used by a callee that assumes 581862306a36Sopenharmony_ci * something about the other 581962306a36Sopenharmony_ci * uninitialized fields... same as in 582062306a36Sopenharmony_ci * memory.c 582162306a36Sopenharmony_ci */ 582262306a36Sopenharmony_ci }; 582362306a36Sopenharmony_ci 582462306a36Sopenharmony_ci /* 582562306a36Sopenharmony_ci * vma_lock and hugetlb_fault_mutex must be dropped before handling 582662306a36Sopenharmony_ci * userfault. 
Also mmap_lock could be dropped due to handling 582762306a36Sopenharmony_ci * userfault, any vma operation should be careful from here. 582862306a36Sopenharmony_ci */ 582962306a36Sopenharmony_ci hugetlb_vma_unlock_read(vma); 583062306a36Sopenharmony_ci hash = hugetlb_fault_mutex_hash(mapping, idx); 583162306a36Sopenharmony_ci mutex_unlock(&hugetlb_fault_mutex_table[hash]); 583262306a36Sopenharmony_ci return handle_userfault(&vmf, reason); 583362306a36Sopenharmony_ci} 583462306a36Sopenharmony_ci 583562306a36Sopenharmony_ci/* 583662306a36Sopenharmony_ci * Recheck pte with pgtable lock. Returns true if pte didn't change, or 583762306a36Sopenharmony_ci * false if pte changed or is changing. 583862306a36Sopenharmony_ci */ 583962306a36Sopenharmony_cistatic bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, 584062306a36Sopenharmony_ci pte_t *ptep, pte_t old_pte) 584162306a36Sopenharmony_ci{ 584262306a36Sopenharmony_ci spinlock_t *ptl; 584362306a36Sopenharmony_ci bool same; 584462306a36Sopenharmony_ci 584562306a36Sopenharmony_ci ptl = huge_pte_lock(h, mm, ptep); 584662306a36Sopenharmony_ci same = pte_same(huge_ptep_get(ptep), old_pte); 584762306a36Sopenharmony_ci spin_unlock(ptl); 584862306a36Sopenharmony_ci 584962306a36Sopenharmony_ci return same; 585062306a36Sopenharmony_ci} 585162306a36Sopenharmony_ci 585262306a36Sopenharmony_cistatic vm_fault_t hugetlb_no_page(struct mm_struct *mm, 585362306a36Sopenharmony_ci struct vm_area_struct *vma, 585462306a36Sopenharmony_ci struct address_space *mapping, pgoff_t idx, 585562306a36Sopenharmony_ci unsigned long address, pte_t *ptep, 585662306a36Sopenharmony_ci pte_t old_pte, unsigned int flags) 585762306a36Sopenharmony_ci{ 585862306a36Sopenharmony_ci struct hstate *h = hstate_vma(vma); 585962306a36Sopenharmony_ci vm_fault_t ret = VM_FAULT_SIGBUS; 586062306a36Sopenharmony_ci int anon_rmap = 0; 586162306a36Sopenharmony_ci unsigned long size; 586262306a36Sopenharmony_ci struct folio *folio; 586362306a36Sopenharmony_ci pte_t new_pte; 586462306a36Sopenharmony_ci spinlock_t *ptl; 586562306a36Sopenharmony_ci unsigned long haddr = address & huge_page_mask(h); 586662306a36Sopenharmony_ci bool new_folio, new_pagecache_folio = false; 586762306a36Sopenharmony_ci u32 hash = hugetlb_fault_mutex_hash(mapping, idx); 586862306a36Sopenharmony_ci 586962306a36Sopenharmony_ci /* 587062306a36Sopenharmony_ci * Currently, we are forced to kill the process in the event the 587162306a36Sopenharmony_ci * original mapper has unmapped pages from the child due to a failed 587262306a36Sopenharmony_ci * COW/unsharing. Warn that such a situation has occurred as it may not 587362306a36Sopenharmony_ci * be obvious. 587462306a36Sopenharmony_ci */ 587562306a36Sopenharmony_ci if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 587662306a36Sopenharmony_ci pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", 587762306a36Sopenharmony_ci current->pid); 587862306a36Sopenharmony_ci goto out; 587962306a36Sopenharmony_ci } 588062306a36Sopenharmony_ci 588162306a36Sopenharmony_ci /* 588262306a36Sopenharmony_ci * Use page lock to guard against racing truncation 588362306a36Sopenharmony_ci * before we get page_table_lock. 
588462306a36Sopenharmony_ci */ 588562306a36Sopenharmony_ci new_folio = false; 588662306a36Sopenharmony_ci folio = filemap_lock_folio(mapping, idx); 588762306a36Sopenharmony_ci if (IS_ERR(folio)) { 588862306a36Sopenharmony_ci size = i_size_read(mapping->host) >> huge_page_shift(h); 588962306a36Sopenharmony_ci if (idx >= size) 589062306a36Sopenharmony_ci goto out; 589162306a36Sopenharmony_ci /* Check for page in userfault range */ 589262306a36Sopenharmony_ci if (userfaultfd_missing(vma)) { 589362306a36Sopenharmony_ci /* 589462306a36Sopenharmony_ci * Since hugetlb_no_page() was examining pte 589562306a36Sopenharmony_ci * without pgtable lock, we need to re-test under 589662306a36Sopenharmony_ci * lock because the pte may not be stable and could 589762306a36Sopenharmony_ci * have changed from under us. Try to detect 589862306a36Sopenharmony_ci * either changed or during-changing ptes and retry 589962306a36Sopenharmony_ci * properly when needed. 590062306a36Sopenharmony_ci * 590162306a36Sopenharmony_ci * Note that userfaultfd is actually fine with 590262306a36Sopenharmony_ci * false positives (e.g. caused by pte changed), 590362306a36Sopenharmony_ci * but not wrong logical events (e.g. caused by 590462306a36Sopenharmony_ci * reading a pte during changing). The latter can 590562306a36Sopenharmony_ci * confuse the userspace, so the strictness is very 590662306a36Sopenharmony_ci * much preferred. E.g., MISSING event should 590762306a36Sopenharmony_ci * never happen on the page after UFFDIO_COPY has 590862306a36Sopenharmony_ci * correctly installed the page and returned. 590962306a36Sopenharmony_ci */ 591062306a36Sopenharmony_ci if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { 591162306a36Sopenharmony_ci ret = 0; 591262306a36Sopenharmony_ci goto out; 591362306a36Sopenharmony_ci } 591462306a36Sopenharmony_ci 591562306a36Sopenharmony_ci return hugetlb_handle_userfault(vma, mapping, idx, flags, 591662306a36Sopenharmony_ci haddr, address, 591762306a36Sopenharmony_ci VM_UFFD_MISSING); 591862306a36Sopenharmony_ci } 591962306a36Sopenharmony_ci 592062306a36Sopenharmony_ci folio = alloc_hugetlb_folio(vma, haddr, 0); 592162306a36Sopenharmony_ci if (IS_ERR(folio)) { 592262306a36Sopenharmony_ci /* 592362306a36Sopenharmony_ci * Returning error will result in faulting task being 592462306a36Sopenharmony_ci * sent SIGBUS. The hugetlb fault mutex prevents two 592562306a36Sopenharmony_ci * tasks from racing to fault in the same page which 592662306a36Sopenharmony_ci * could result in false unable to allocate errors. 592762306a36Sopenharmony_ci * Page migration does not take the fault mutex, but 592862306a36Sopenharmony_ci * does a clear then write of pte's under page table 592962306a36Sopenharmony_ci * lock. Page fault code could race with migration, 593062306a36Sopenharmony_ci * notice the clear pte and try to allocate a page 593162306a36Sopenharmony_ci * here. Before returning error, get ptl and make 593262306a36Sopenharmony_ci * sure there really is no pte entry. 
593362306a36Sopenharmony_ci */ 593462306a36Sopenharmony_ci if (hugetlb_pte_stable(h, mm, ptep, old_pte)) 593562306a36Sopenharmony_ci ret = vmf_error(PTR_ERR(folio)); 593662306a36Sopenharmony_ci else 593762306a36Sopenharmony_ci ret = 0; 593862306a36Sopenharmony_ci goto out; 593962306a36Sopenharmony_ci } 594062306a36Sopenharmony_ci clear_huge_page(&folio->page, address, pages_per_huge_page(h)); 594162306a36Sopenharmony_ci __folio_mark_uptodate(folio); 594262306a36Sopenharmony_ci new_folio = true; 594362306a36Sopenharmony_ci 594462306a36Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE) { 594562306a36Sopenharmony_ci int err = hugetlb_add_to_page_cache(folio, mapping, idx); 594662306a36Sopenharmony_ci if (err) { 594762306a36Sopenharmony_ci /* 594862306a36Sopenharmony_ci * err can't be -EEXIST which implies someone 594962306a36Sopenharmony_ci * else consumed the reservation since hugetlb 595062306a36Sopenharmony_ci * fault mutex is held when adding a hugetlb page 595162306a36Sopenharmony_ci * to the page cache. So it's safe to call 595262306a36Sopenharmony_ci * restore_reserve_on_error() here. 595362306a36Sopenharmony_ci */ 595462306a36Sopenharmony_ci restore_reserve_on_error(h, vma, haddr, folio); 595562306a36Sopenharmony_ci folio_put(folio); 595662306a36Sopenharmony_ci goto out; 595762306a36Sopenharmony_ci } 595862306a36Sopenharmony_ci new_pagecache_folio = true; 595962306a36Sopenharmony_ci } else { 596062306a36Sopenharmony_ci folio_lock(folio); 596162306a36Sopenharmony_ci if (unlikely(anon_vma_prepare(vma))) { 596262306a36Sopenharmony_ci ret = VM_FAULT_OOM; 596362306a36Sopenharmony_ci goto backout_unlocked; 596462306a36Sopenharmony_ci } 596562306a36Sopenharmony_ci anon_rmap = 1; 596662306a36Sopenharmony_ci } 596762306a36Sopenharmony_ci } else { 596862306a36Sopenharmony_ci /* 596962306a36Sopenharmony_ci * If a memory error occurs between mmap() and fault, some processes 597062306a36Sopenharmony_ci * don't have a hwpoisoned swap entry for the errored virtual address. 597162306a36Sopenharmony_ci * So we need to block hugepage fault by PG_hwpoison bit check. 597262306a36Sopenharmony_ci */ 597362306a36Sopenharmony_ci if (unlikely(folio_test_hwpoison(folio))) { 597462306a36Sopenharmony_ci ret = VM_FAULT_HWPOISON_LARGE | 597562306a36Sopenharmony_ci VM_FAULT_SET_HINDEX(hstate_index(h)); 597662306a36Sopenharmony_ci goto backout_unlocked; 597762306a36Sopenharmony_ci } 597862306a36Sopenharmony_ci 597962306a36Sopenharmony_ci /* Check for page in userfault range. */ 598062306a36Sopenharmony_ci if (userfaultfd_minor(vma)) { 598162306a36Sopenharmony_ci folio_unlock(folio); 598262306a36Sopenharmony_ci folio_put(folio); 598362306a36Sopenharmony_ci /* See comment in userfaultfd_missing() block above */ 598462306a36Sopenharmony_ci if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { 598562306a36Sopenharmony_ci ret = 0; 598662306a36Sopenharmony_ci goto out; 598762306a36Sopenharmony_ci } 598862306a36Sopenharmony_ci return hugetlb_handle_userfault(vma, mapping, idx, flags, 598962306a36Sopenharmony_ci haddr, address, 599062306a36Sopenharmony_ci VM_UFFD_MINOR); 599162306a36Sopenharmony_ci } 599262306a36Sopenharmony_ci } 599362306a36Sopenharmony_ci 599462306a36Sopenharmony_ci /* 599562306a36Sopenharmony_ci * If we are going to COW a private mapping later, we examine the 599662306a36Sopenharmony_ci * pending reservations for this page now. This will ensure that 599762306a36Sopenharmony_ci * any allocations necessary to record that reservation occur outside 599862306a36Sopenharmony_ci * the spinlock.
599962306a36Sopenharmony_ci */ 600062306a36Sopenharmony_ci if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 600162306a36Sopenharmony_ci if (vma_needs_reservation(h, vma, haddr) < 0) { 600262306a36Sopenharmony_ci ret = VM_FAULT_OOM; 600362306a36Sopenharmony_ci goto backout_unlocked; 600462306a36Sopenharmony_ci } 600562306a36Sopenharmony_ci /* Just decrements count, does not deallocate */ 600662306a36Sopenharmony_ci vma_end_reservation(h, vma, haddr); 600762306a36Sopenharmony_ci } 600862306a36Sopenharmony_ci 600962306a36Sopenharmony_ci ptl = huge_pte_lock(h, mm, ptep); 601062306a36Sopenharmony_ci ret = 0; 601162306a36Sopenharmony_ci /* If pte changed from under us, retry */ 601262306a36Sopenharmony_ci if (!pte_same(huge_ptep_get(ptep), old_pte)) 601362306a36Sopenharmony_ci goto backout; 601462306a36Sopenharmony_ci 601562306a36Sopenharmony_ci if (anon_rmap) 601662306a36Sopenharmony_ci hugepage_add_new_anon_rmap(folio, vma, haddr); 601762306a36Sopenharmony_ci else 601862306a36Sopenharmony_ci page_dup_file_rmap(&folio->page, true); 601962306a36Sopenharmony_ci new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) 602062306a36Sopenharmony_ci && (vma->vm_flags & VM_SHARED))); 602162306a36Sopenharmony_ci /* 602262306a36Sopenharmony_ci * If this pte was previously wr-protected, keep it wr-protected even 602362306a36Sopenharmony_ci * if populated. 602462306a36Sopenharmony_ci */ 602562306a36Sopenharmony_ci if (unlikely(pte_marker_uffd_wp(old_pte))) 602662306a36Sopenharmony_ci new_pte = huge_pte_mkuffd_wp(new_pte); 602762306a36Sopenharmony_ci set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h)); 602862306a36Sopenharmony_ci 602962306a36Sopenharmony_ci hugetlb_count_add(pages_per_huge_page(h), mm); 603062306a36Sopenharmony_ci if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 603162306a36Sopenharmony_ci /* Optimization, do the COW without a second fault */ 603262306a36Sopenharmony_ci ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl); 603362306a36Sopenharmony_ci } 603462306a36Sopenharmony_ci 603562306a36Sopenharmony_ci spin_unlock(ptl); 603662306a36Sopenharmony_ci 603762306a36Sopenharmony_ci /* 603862306a36Sopenharmony_ci * Only set hugetlb_migratable in newly allocated pages. Existing pages 603962306a36Sopenharmony_ci * found in the pagecache may not have hugetlb_migratable if they have 604062306a36Sopenharmony_ci * been isolated for migration. 
604162306a36Sopenharmony_ci */ 604262306a36Sopenharmony_ci if (new_folio) 604362306a36Sopenharmony_ci folio_set_hugetlb_migratable(folio); 604462306a36Sopenharmony_ci 604562306a36Sopenharmony_ci folio_unlock(folio); 604662306a36Sopenharmony_ciout: 604762306a36Sopenharmony_ci hugetlb_vma_unlock_read(vma); 604862306a36Sopenharmony_ci mutex_unlock(&hugetlb_fault_mutex_table[hash]); 604962306a36Sopenharmony_ci return ret; 605062306a36Sopenharmony_ci 605162306a36Sopenharmony_cibackout: 605262306a36Sopenharmony_ci spin_unlock(ptl); 605362306a36Sopenharmony_cibackout_unlocked: 605462306a36Sopenharmony_ci if (new_folio && !new_pagecache_folio) 605562306a36Sopenharmony_ci restore_reserve_on_error(h, vma, haddr, folio); 605662306a36Sopenharmony_ci 605762306a36Sopenharmony_ci folio_unlock(folio); 605862306a36Sopenharmony_ci folio_put(folio); 605962306a36Sopenharmony_ci goto out; 606062306a36Sopenharmony_ci} 606162306a36Sopenharmony_ci 606262306a36Sopenharmony_ci#ifdef CONFIG_SMP 606362306a36Sopenharmony_ciu32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 606462306a36Sopenharmony_ci{ 606562306a36Sopenharmony_ci unsigned long key[2]; 606662306a36Sopenharmony_ci u32 hash; 606762306a36Sopenharmony_ci 606862306a36Sopenharmony_ci key[0] = (unsigned long) mapping; 606962306a36Sopenharmony_ci key[1] = idx; 607062306a36Sopenharmony_ci 607162306a36Sopenharmony_ci hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); 607262306a36Sopenharmony_ci 607362306a36Sopenharmony_ci return hash & (num_fault_mutexes - 1); 607462306a36Sopenharmony_ci} 607562306a36Sopenharmony_ci#else 607662306a36Sopenharmony_ci/* 607762306a36Sopenharmony_ci * For uniprocessor systems we always use a single mutex, so just 607862306a36Sopenharmony_ci * return 0 and avoid the hashing overhead. 607962306a36Sopenharmony_ci */ 608062306a36Sopenharmony_ciu32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 608162306a36Sopenharmony_ci{ 608262306a36Sopenharmony_ci return 0; 608362306a36Sopenharmony_ci} 608462306a36Sopenharmony_ci#endif 608562306a36Sopenharmony_ci 608662306a36Sopenharmony_civm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 608762306a36Sopenharmony_ci unsigned long address, unsigned int flags) 608862306a36Sopenharmony_ci{ 608962306a36Sopenharmony_ci pte_t *ptep, entry; 609062306a36Sopenharmony_ci spinlock_t *ptl; 609162306a36Sopenharmony_ci vm_fault_t ret; 609262306a36Sopenharmony_ci u32 hash; 609362306a36Sopenharmony_ci pgoff_t idx; 609462306a36Sopenharmony_ci struct folio *folio = NULL; 609562306a36Sopenharmony_ci struct folio *pagecache_folio = NULL; 609662306a36Sopenharmony_ci struct hstate *h = hstate_vma(vma); 609762306a36Sopenharmony_ci struct address_space *mapping; 609862306a36Sopenharmony_ci int need_wait_lock = 0; 609962306a36Sopenharmony_ci unsigned long haddr = address & huge_page_mask(h); 610062306a36Sopenharmony_ci 610162306a36Sopenharmony_ci /* TODO: Handle faults under the VMA lock */ 610262306a36Sopenharmony_ci if (flags & FAULT_FLAG_VMA_LOCK) { 610362306a36Sopenharmony_ci vma_end_read(vma); 610462306a36Sopenharmony_ci return VM_FAULT_RETRY; 610562306a36Sopenharmony_ci } 610662306a36Sopenharmony_ci 610762306a36Sopenharmony_ci /* 610862306a36Sopenharmony_ci * Serialize hugepage allocation and instantiation, so that we don't 610962306a36Sopenharmony_ci * get spurious allocation failures if two CPUs race to instantiate 611062306a36Sopenharmony_ci * the same page in the page cache. 
611162306a36Sopenharmony_ci */ 611262306a36Sopenharmony_ci mapping = vma->vm_file->f_mapping; 611362306a36Sopenharmony_ci idx = vma_hugecache_offset(h, vma, haddr); 611462306a36Sopenharmony_ci hash = hugetlb_fault_mutex_hash(mapping, idx); 611562306a36Sopenharmony_ci mutex_lock(&hugetlb_fault_mutex_table[hash]); 611662306a36Sopenharmony_ci 611762306a36Sopenharmony_ci /* 611862306a36Sopenharmony_ci * Acquire vma lock before calling huge_pte_alloc and hold 611962306a36Sopenharmony_ci * until finished with ptep. This prevents huge_pmd_unshare from 612062306a36Sopenharmony_ci * being called elsewhere and making the ptep no longer valid. 612162306a36Sopenharmony_ci */ 612262306a36Sopenharmony_ci hugetlb_vma_lock_read(vma); 612362306a36Sopenharmony_ci ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); 612462306a36Sopenharmony_ci if (!ptep) { 612562306a36Sopenharmony_ci hugetlb_vma_unlock_read(vma); 612662306a36Sopenharmony_ci mutex_unlock(&hugetlb_fault_mutex_table[hash]); 612762306a36Sopenharmony_ci return VM_FAULT_OOM; 612862306a36Sopenharmony_ci } 612962306a36Sopenharmony_ci 613062306a36Sopenharmony_ci entry = huge_ptep_get(ptep); 613162306a36Sopenharmony_ci if (huge_pte_none_mostly(entry)) { 613262306a36Sopenharmony_ci if (is_pte_marker(entry)) { 613362306a36Sopenharmony_ci pte_marker marker = 613462306a36Sopenharmony_ci pte_marker_get(pte_to_swp_entry(entry)); 613562306a36Sopenharmony_ci 613662306a36Sopenharmony_ci if (marker & PTE_MARKER_POISONED) { 613762306a36Sopenharmony_ci ret = VM_FAULT_HWPOISON_LARGE; 613862306a36Sopenharmony_ci goto out_mutex; 613962306a36Sopenharmony_ci } 614062306a36Sopenharmony_ci } 614162306a36Sopenharmony_ci 614262306a36Sopenharmony_ci /* 614362306a36Sopenharmony_ci * Other PTE markers should be handled the same way as none PTE. 614462306a36Sopenharmony_ci * 614562306a36Sopenharmony_ci * hugetlb_no_page will drop vma lock and hugetlb fault 614662306a36Sopenharmony_ci * mutex internally, which make us return immediately. 614762306a36Sopenharmony_ci */ 614862306a36Sopenharmony_ci return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, 614962306a36Sopenharmony_ci entry, flags); 615062306a36Sopenharmony_ci } 615162306a36Sopenharmony_ci 615262306a36Sopenharmony_ci ret = 0; 615362306a36Sopenharmony_ci 615462306a36Sopenharmony_ci /* 615562306a36Sopenharmony_ci * entry could be a migration/hwpoison entry at this point, so this 615662306a36Sopenharmony_ci * check prevents the kernel from going below assuming that we have 615762306a36Sopenharmony_ci * an active hugepage in pagecache. This goto expects the 2nd page 615862306a36Sopenharmony_ci * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will 615962306a36Sopenharmony_ci * properly handle it. 616062306a36Sopenharmony_ci */ 616162306a36Sopenharmony_ci if (!pte_present(entry)) { 616262306a36Sopenharmony_ci if (unlikely(is_hugetlb_entry_migration(entry))) { 616362306a36Sopenharmony_ci /* 616462306a36Sopenharmony_ci * Release the hugetlb fault lock now, but retain 616562306a36Sopenharmony_ci * the vma lock, because it is needed to guard the 616662306a36Sopenharmony_ci * huge_pte_lockptr() later in 616762306a36Sopenharmony_ci * migration_entry_wait_huge(). The vma lock will 616862306a36Sopenharmony_ci * be released there. 
616962306a36Sopenharmony_ci */ 617062306a36Sopenharmony_ci mutex_unlock(&hugetlb_fault_mutex_table[hash]); 617162306a36Sopenharmony_ci migration_entry_wait_huge(vma, ptep); 617262306a36Sopenharmony_ci return 0; 617362306a36Sopenharmony_ci } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 617462306a36Sopenharmony_ci ret = VM_FAULT_HWPOISON_LARGE | 617562306a36Sopenharmony_ci VM_FAULT_SET_HINDEX(hstate_index(h)); 617662306a36Sopenharmony_ci goto out_mutex; 617762306a36Sopenharmony_ci } 617862306a36Sopenharmony_ci 617962306a36Sopenharmony_ci /* 618062306a36Sopenharmony_ci * If we are going to COW/unshare the mapping later, we examine the 618162306a36Sopenharmony_ci * pending reservations for this page now. This will ensure that any 618262306a36Sopenharmony_ci * allocations necessary to record that reservation occur outside the 618362306a36Sopenharmony_ci * spinlock. Also lookup the pagecache page now as it is used to 618462306a36Sopenharmony_ci * determine if a reservation has been consumed. 618562306a36Sopenharmony_ci */ 618662306a36Sopenharmony_ci if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && 618762306a36Sopenharmony_ci !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) { 618862306a36Sopenharmony_ci if (vma_needs_reservation(h, vma, haddr) < 0) { 618962306a36Sopenharmony_ci ret = VM_FAULT_OOM; 619062306a36Sopenharmony_ci goto out_mutex; 619162306a36Sopenharmony_ci } 619262306a36Sopenharmony_ci /* Just decrements count, does not deallocate */ 619362306a36Sopenharmony_ci vma_end_reservation(h, vma, haddr); 619462306a36Sopenharmony_ci 619562306a36Sopenharmony_ci pagecache_folio = filemap_lock_folio(mapping, idx); 619662306a36Sopenharmony_ci if (IS_ERR(pagecache_folio)) 619762306a36Sopenharmony_ci pagecache_folio = NULL; 619862306a36Sopenharmony_ci } 619962306a36Sopenharmony_ci 620062306a36Sopenharmony_ci ptl = huge_pte_lock(h, mm, ptep); 620162306a36Sopenharmony_ci 620262306a36Sopenharmony_ci /* Check for a racing update before calling hugetlb_wp() */ 620362306a36Sopenharmony_ci if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 620462306a36Sopenharmony_ci goto out_ptl; 620562306a36Sopenharmony_ci 620662306a36Sopenharmony_ci /* Handle userfault-wp first, before trying to lock more pages */ 620762306a36Sopenharmony_ci if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && 620862306a36Sopenharmony_ci (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { 620962306a36Sopenharmony_ci struct vm_fault vmf = { 621062306a36Sopenharmony_ci .vma = vma, 621162306a36Sopenharmony_ci .address = haddr, 621262306a36Sopenharmony_ci .real_address = address, 621362306a36Sopenharmony_ci .flags = flags, 621462306a36Sopenharmony_ci }; 621562306a36Sopenharmony_ci 621662306a36Sopenharmony_ci spin_unlock(ptl); 621762306a36Sopenharmony_ci if (pagecache_folio) { 621862306a36Sopenharmony_ci folio_unlock(pagecache_folio); 621962306a36Sopenharmony_ci folio_put(pagecache_folio); 622062306a36Sopenharmony_ci } 622162306a36Sopenharmony_ci hugetlb_vma_unlock_read(vma); 622262306a36Sopenharmony_ci mutex_unlock(&hugetlb_fault_mutex_table[hash]); 622362306a36Sopenharmony_ci return handle_userfault(&vmf, VM_UFFD_WP); 622462306a36Sopenharmony_ci } 622562306a36Sopenharmony_ci 622662306a36Sopenharmony_ci /* 622762306a36Sopenharmony_ci * hugetlb_wp() requires page locks of pte_page(entry) and 622862306a36Sopenharmony_ci * pagecache_folio, so here we need to take the former one 622962306a36Sopenharmony_ci * when folio != pagecache_folio or !pagecache_folio.
623062306a36Sopenharmony_ci */ 623162306a36Sopenharmony_ci folio = page_folio(pte_page(entry)); 623262306a36Sopenharmony_ci if (folio != pagecache_folio) 623362306a36Sopenharmony_ci if (!folio_trylock(folio)) { 623462306a36Sopenharmony_ci need_wait_lock = 1; 623562306a36Sopenharmony_ci goto out_ptl; 623662306a36Sopenharmony_ci } 623762306a36Sopenharmony_ci 623862306a36Sopenharmony_ci folio_get(folio); 623962306a36Sopenharmony_ci 624062306a36Sopenharmony_ci if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { 624162306a36Sopenharmony_ci if (!huge_pte_write(entry)) { 624262306a36Sopenharmony_ci ret = hugetlb_wp(mm, vma, address, ptep, flags, 624362306a36Sopenharmony_ci pagecache_folio, ptl); 624462306a36Sopenharmony_ci goto out_put_page; 624562306a36Sopenharmony_ci } else if (likely(flags & FAULT_FLAG_WRITE)) { 624662306a36Sopenharmony_ci entry = huge_pte_mkdirty(entry); 624762306a36Sopenharmony_ci } 624862306a36Sopenharmony_ci } 624962306a36Sopenharmony_ci entry = pte_mkyoung(entry); 625062306a36Sopenharmony_ci if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, 625162306a36Sopenharmony_ci flags & FAULT_FLAG_WRITE)) 625262306a36Sopenharmony_ci update_mmu_cache(vma, haddr, ptep); 625362306a36Sopenharmony_ciout_put_page: 625462306a36Sopenharmony_ci if (folio != pagecache_folio) 625562306a36Sopenharmony_ci folio_unlock(folio); 625662306a36Sopenharmony_ci folio_put(folio); 625762306a36Sopenharmony_ciout_ptl: 625862306a36Sopenharmony_ci spin_unlock(ptl); 625962306a36Sopenharmony_ci 626062306a36Sopenharmony_ci if (pagecache_folio) { 626162306a36Sopenharmony_ci folio_unlock(pagecache_folio); 626262306a36Sopenharmony_ci folio_put(pagecache_folio); 626362306a36Sopenharmony_ci } 626462306a36Sopenharmony_ciout_mutex: 626562306a36Sopenharmony_ci hugetlb_vma_unlock_read(vma); 626662306a36Sopenharmony_ci mutex_unlock(&hugetlb_fault_mutex_table[hash]); 626762306a36Sopenharmony_ci /* 626862306a36Sopenharmony_ci * Generally it's safe to hold refcount during waiting page lock. But 626962306a36Sopenharmony_ci * here we just wait to defer the next page fault to avoid busy loop and 627062306a36Sopenharmony_ci * the page is not used after unlocked before returning from the current 627162306a36Sopenharmony_ci * page fault. So we are safe from accessing freed page, even if we wait 627262306a36Sopenharmony_ci * here without taking refcount. 627362306a36Sopenharmony_ci */ 627462306a36Sopenharmony_ci if (need_wait_lock) 627562306a36Sopenharmony_ci folio_wait_locked(folio); 627662306a36Sopenharmony_ci return ret; 627762306a36Sopenharmony_ci} 627862306a36Sopenharmony_ci 627962306a36Sopenharmony_ci#ifdef CONFIG_USERFAULTFD 628062306a36Sopenharmony_ci/* 628162306a36Sopenharmony_ci * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte 628262306a36Sopenharmony_ci * with modifications for hugetlb pages. 
628362306a36Sopenharmony_ci */ 628462306a36Sopenharmony_ciint hugetlb_mfill_atomic_pte(pte_t *dst_pte, 628562306a36Sopenharmony_ci struct vm_area_struct *dst_vma, 628662306a36Sopenharmony_ci unsigned long dst_addr, 628762306a36Sopenharmony_ci unsigned long src_addr, 628862306a36Sopenharmony_ci uffd_flags_t flags, 628962306a36Sopenharmony_ci struct folio **foliop) 629062306a36Sopenharmony_ci{ 629162306a36Sopenharmony_ci struct mm_struct *dst_mm = dst_vma->vm_mm; 629262306a36Sopenharmony_ci bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE); 629362306a36Sopenharmony_ci bool wp_enabled = (flags & MFILL_ATOMIC_WP); 629462306a36Sopenharmony_ci struct hstate *h = hstate_vma(dst_vma); 629562306a36Sopenharmony_ci struct address_space *mapping = dst_vma->vm_file->f_mapping; 629662306a36Sopenharmony_ci pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); 629762306a36Sopenharmony_ci unsigned long size; 629862306a36Sopenharmony_ci int vm_shared = dst_vma->vm_flags & VM_SHARED; 629962306a36Sopenharmony_ci pte_t _dst_pte; 630062306a36Sopenharmony_ci spinlock_t *ptl; 630162306a36Sopenharmony_ci int ret = -ENOMEM; 630262306a36Sopenharmony_ci struct folio *folio; 630362306a36Sopenharmony_ci int writable; 630462306a36Sopenharmony_ci bool folio_in_pagecache = false; 630562306a36Sopenharmony_ci 630662306a36Sopenharmony_ci if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { 630762306a36Sopenharmony_ci ptl = huge_pte_lock(h, dst_mm, dst_pte); 630862306a36Sopenharmony_ci 630962306a36Sopenharmony_ci /* Don't overwrite any existing PTEs (even markers) */ 631062306a36Sopenharmony_ci if (!huge_pte_none(huge_ptep_get(dst_pte))) { 631162306a36Sopenharmony_ci spin_unlock(ptl); 631262306a36Sopenharmony_ci return -EEXIST; 631362306a36Sopenharmony_ci } 631462306a36Sopenharmony_ci 631562306a36Sopenharmony_ci _dst_pte = make_pte_marker(PTE_MARKER_POISONED); 631662306a36Sopenharmony_ci set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, 631762306a36Sopenharmony_ci huge_page_size(h)); 631862306a36Sopenharmony_ci 631962306a36Sopenharmony_ci /* No need to invalidate - it was non-present before */ 632062306a36Sopenharmony_ci update_mmu_cache(dst_vma, dst_addr, dst_pte); 632162306a36Sopenharmony_ci 632262306a36Sopenharmony_ci spin_unlock(ptl); 632362306a36Sopenharmony_ci return 0; 632462306a36Sopenharmony_ci } 632562306a36Sopenharmony_ci 632662306a36Sopenharmony_ci if (is_continue) { 632762306a36Sopenharmony_ci ret = -EFAULT; 632862306a36Sopenharmony_ci folio = filemap_lock_folio(mapping, idx); 632962306a36Sopenharmony_ci if (IS_ERR(folio)) 633062306a36Sopenharmony_ci goto out; 633162306a36Sopenharmony_ci folio_in_pagecache = true; 633262306a36Sopenharmony_ci } else if (!*foliop) { 633362306a36Sopenharmony_ci /* If a folio already exists, then it's UFFDIO_COPY for 633462306a36Sopenharmony_ci * a non-missing case. Return -EEXIST. 
633562306a36Sopenharmony_ci */ 633662306a36Sopenharmony_ci if (vm_shared && 633762306a36Sopenharmony_ci hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { 633862306a36Sopenharmony_ci ret = -EEXIST; 633962306a36Sopenharmony_ci goto out; 634062306a36Sopenharmony_ci } 634162306a36Sopenharmony_ci 634262306a36Sopenharmony_ci folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); 634362306a36Sopenharmony_ci if (IS_ERR(folio)) { 634462306a36Sopenharmony_ci ret = -ENOMEM; 634562306a36Sopenharmony_ci goto out; 634662306a36Sopenharmony_ci } 634762306a36Sopenharmony_ci 634862306a36Sopenharmony_ci ret = copy_folio_from_user(folio, (const void __user *) src_addr, 634962306a36Sopenharmony_ci false); 635062306a36Sopenharmony_ci 635162306a36Sopenharmony_ci /* fallback to copy_from_user outside mmap_lock */ 635262306a36Sopenharmony_ci if (unlikely(ret)) { 635362306a36Sopenharmony_ci ret = -ENOENT; 635462306a36Sopenharmony_ci /* Free the allocated folio which may have 635562306a36Sopenharmony_ci * consumed a reservation. 635662306a36Sopenharmony_ci */ 635762306a36Sopenharmony_ci restore_reserve_on_error(h, dst_vma, dst_addr, folio); 635862306a36Sopenharmony_ci folio_put(folio); 635962306a36Sopenharmony_ci 636062306a36Sopenharmony_ci /* Allocate a temporary folio to hold the copied 636162306a36Sopenharmony_ci * contents. 636262306a36Sopenharmony_ci */ 636362306a36Sopenharmony_ci folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr); 636462306a36Sopenharmony_ci if (!folio) { 636562306a36Sopenharmony_ci ret = -ENOMEM; 636662306a36Sopenharmony_ci goto out; 636762306a36Sopenharmony_ci } 636862306a36Sopenharmony_ci *foliop = folio; 636962306a36Sopenharmony_ci /* Set the outparam foliop and return to the caller to 637062306a36Sopenharmony_ci * copy the contents outside the lock. Don't free the 637162306a36Sopenharmony_ci * folio. 637262306a36Sopenharmony_ci */ 637362306a36Sopenharmony_ci goto out; 637462306a36Sopenharmony_ci } 637562306a36Sopenharmony_ci } else { 637662306a36Sopenharmony_ci if (vm_shared && 637762306a36Sopenharmony_ci hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { 637862306a36Sopenharmony_ci folio_put(*foliop); 637962306a36Sopenharmony_ci ret = -EEXIST; 638062306a36Sopenharmony_ci *foliop = NULL; 638162306a36Sopenharmony_ci goto out; 638262306a36Sopenharmony_ci } 638362306a36Sopenharmony_ci 638462306a36Sopenharmony_ci folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); 638562306a36Sopenharmony_ci if (IS_ERR(folio)) { 638662306a36Sopenharmony_ci folio_put(*foliop); 638762306a36Sopenharmony_ci ret = -ENOMEM; 638862306a36Sopenharmony_ci *foliop = NULL; 638962306a36Sopenharmony_ci goto out; 639062306a36Sopenharmony_ci } 639162306a36Sopenharmony_ci ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma); 639262306a36Sopenharmony_ci folio_put(*foliop); 639362306a36Sopenharmony_ci *foliop = NULL; 639462306a36Sopenharmony_ci if (ret) { 639562306a36Sopenharmony_ci folio_put(folio); 639662306a36Sopenharmony_ci goto out; 639762306a36Sopenharmony_ci } 639862306a36Sopenharmony_ci } 639962306a36Sopenharmony_ci 640062306a36Sopenharmony_ci /* 640162306a36Sopenharmony_ci * The memory barrier inside __folio_mark_uptodate makes sure that 640262306a36Sopenharmony_ci * preceding stores to the page contents become visible before 640362306a36Sopenharmony_ci * the set_pte_at() write. 640462306a36Sopenharmony_ci */ 640562306a36Sopenharmony_ci __folio_mark_uptodate(folio); 640662306a36Sopenharmony_ci 640762306a36Sopenharmony_ci /* Add shared, newly allocated pages to the page cache. 
*/ 640862306a36Sopenharmony_ci if (vm_shared && !is_continue) { 640962306a36Sopenharmony_ci size = i_size_read(mapping->host) >> huge_page_shift(h); 641062306a36Sopenharmony_ci ret = -EFAULT; 641162306a36Sopenharmony_ci if (idx >= size) 641262306a36Sopenharmony_ci goto out_release_nounlock; 641362306a36Sopenharmony_ci 641462306a36Sopenharmony_ci /* 641562306a36Sopenharmony_ci * Serialization between remove_inode_hugepages() and 641662306a36Sopenharmony_ci * hugetlb_add_to_page_cache() below happens through the 641762306a36Sopenharmony_ci * hugetlb_fault_mutex_table that here must be held by 641862306a36Sopenharmony_ci * the caller. 641962306a36Sopenharmony_ci */ 642062306a36Sopenharmony_ci ret = hugetlb_add_to_page_cache(folio, mapping, idx); 642162306a36Sopenharmony_ci if (ret) 642262306a36Sopenharmony_ci goto out_release_nounlock; 642362306a36Sopenharmony_ci folio_in_pagecache = true; 642462306a36Sopenharmony_ci } 642562306a36Sopenharmony_ci 642662306a36Sopenharmony_ci ptl = huge_pte_lock(h, dst_mm, dst_pte); 642762306a36Sopenharmony_ci 642862306a36Sopenharmony_ci ret = -EIO; 642962306a36Sopenharmony_ci if (folio_test_hwpoison(folio)) 643062306a36Sopenharmony_ci goto out_release_unlock; 643162306a36Sopenharmony_ci 643262306a36Sopenharmony_ci /* 643362306a36Sopenharmony_ci * We allow overwriting a pte marker: consider when both MISSING|WP are 643462306a36Sopenharmony_ci * registered, we first wr-protect a none pte which has no page cache 643562306a36Sopenharmony_ci * page backing it, then access the page. 643662306a36Sopenharmony_ci */ 643762306a36Sopenharmony_ci ret = -EEXIST; 643862306a36Sopenharmony_ci if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) 643962306a36Sopenharmony_ci goto out_release_unlock; 644062306a36Sopenharmony_ci 644162306a36Sopenharmony_ci if (folio_in_pagecache) 644262306a36Sopenharmony_ci page_dup_file_rmap(&folio->page, true); 644362306a36Sopenharmony_ci else 644462306a36Sopenharmony_ci hugepage_add_new_anon_rmap(folio, dst_vma, dst_addr); 644562306a36Sopenharmony_ci 644662306a36Sopenharmony_ci /* 644762306a36Sopenharmony_ci * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY 644862306a36Sopenharmony_ci * with wp flag set, don't set pte write bit. 644962306a36Sopenharmony_ci */ 645062306a36Sopenharmony_ci if (wp_enabled || (is_continue && !vm_shared)) 645162306a36Sopenharmony_ci writable = 0; 645262306a36Sopenharmony_ci else 645362306a36Sopenharmony_ci writable = dst_vma->vm_flags & VM_WRITE; 645462306a36Sopenharmony_ci 645562306a36Sopenharmony_ci _dst_pte = make_huge_pte(dst_vma, &folio->page, writable); 645662306a36Sopenharmony_ci /* 645762306a36Sopenharmony_ci * Always mark UFFDIO_COPY page dirty; note that this may not be 645862306a36Sopenharmony_ci * extremely important for hugetlbfs for now since swapping is not 645962306a36Sopenharmony_ci * supported, but we should still be clear in that this page cannot be 646062306a36Sopenharmony_ci * thrown away at will, even if write bit not set.
	 */
	_dst_pte = huge_pte_mkdirty(_dst_pte);
	_dst_pte = pte_mkyoung(_dst_pte);

	if (wp_enabled)
		_dst_pte = huge_pte_mkuffd_wp(_dst_pte);

	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, huge_page_size(h));

	hugetlb_count_add(pages_per_huge_page(h), dst_mm);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);

	spin_unlock(ptl);
	if (!is_continue)
		folio_set_hugetlb_migratable(folio);
	if (vm_shared || is_continue)
		folio_unlock(folio);
	ret = 0;
out:
	return ret;
out_release_unlock:
	spin_unlock(ptl);
	if (vm_shared || is_continue)
		folio_unlock(folio);
out_release_nounlock:
	if (!folio_in_pagecache)
		restore_reserve_on_error(h, dst_vma, dst_addr, folio);
	folio_put(folio);
	goto out;
}
#endif /* CONFIG_USERFAULTFD */

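/*
 * Illustrative sketch (not part of this file): how a caller is expected to
 * use the -ENOENT and *foliop out-parameter protocol of the function above.
 * The exact caller lives in mm/userfaultfd.c and also handles retries,
 * alignment and pte revalidation; the snippet below is an assumption kept
 * deliberately loose, not the real call sequence.
 *
 *	folio = NULL;
 *	err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
 *				       src_addr, flags, &folio);
 *	if (err == -ENOENT) {
 *		// A temporary folio was handed back via *foliop. Copy the
 *		// user data into it outside mmap_lock (page faults allowed),
 *		// then retake the lock and retry so the pte can be installed
 *		// under the fault mutex and page table lock.
 *		mmap_read_unlock(dst_mm);
 *		err = copy_folio_from_user(folio,
 *					   (const void __user *)src_addr, true);
 *		mmap_read_lock(dst_mm);
 *		// retry the call with *foliop now holding the copied contents
 *	}
 */
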
struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
				      unsigned long address, unsigned int flags,
				      unsigned int *page_mask)
{
	struct hstate *h = hstate_vma(vma);
	struct mm_struct *mm = vma->vm_mm;
	unsigned long haddr = address & huge_page_mask(h);
	struct page *page = NULL;
	spinlock_t *ptl;
	pte_t *pte, entry;
	int ret;

	hugetlb_vma_lock_read(vma);
	pte = hugetlb_walk(vma, haddr, huge_page_size(h));
	if (!pte)
		goto out_unlock;

	ptl = huge_pte_lock(h, mm, pte);
	entry = huge_ptep_get(pte);
	if (pte_present(entry)) {
		page = pte_page(entry);

		if (!huge_pte_write(entry)) {
			if (flags & FOLL_WRITE) {
				page = NULL;
				goto out;
			}

			if (gup_must_unshare(vma, flags, page)) {
				/* Tell the caller to do unsharing */
				page = ERR_PTR(-EMLINK);
				goto out;
			}
		}

		page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT));

		/*
		 * Note that page may be a sub-page, and with vmemmap
		 * optimizations the page struct may be read only.
		 * try_grab_page() will increase the ref count on the
		 * head page, so this will be OK.
		 *
		 * try_grab_page() should always be able to get the page here,
		 * because we hold the ptl lock and have verified pte_present().
		 */
		ret = try_grab_page(page, flags);

		if (WARN_ON_ONCE(ret)) {
			page = ERR_PTR(ret);
			goto out;
		}

		*page_mask = (1U << huge_page_order(h)) - 1;
	}
out:
	spin_unlock(ptl);
out_unlock:
	hugetlb_vma_unlock_read(vma);

	/*
	 * Fixup retval for dump requests: if pagecache doesn't exist,
	 * don't try to allocate a new page but just skip it.
	 */
	if (!page && (flags & FOLL_DUMP) &&
	    !hugetlbfs_pagecache_present(h, vma, address))
		page = ERR_PTR(-EFAULT);

	return page;
}

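/*
 * Illustrative sketch (assumption, not part of this file): how a GUP-style
 * caller can use the *page_mask out-parameter filled in above to account for
 * all base pages covered by one huge page mapping in a single step. The
 * variable names are hypothetical; see the follow_page_mask() users in
 * mm/gup.c for the real logic.
 *
 *	page = hugetlb_follow_page_mask(vma, addr, flags, &page_mask);
 *	if (!IS_ERR_OR_NULL(page)) {
 *		// base pages left from addr to the end of this huge page
 *		unsigned long nr = page_mask + 1 -
 *				   ((addr >> PAGE_SHIFT) & page_mask);
 *
 *		addr += nr << PAGE_SHIFT;	// skip the rest of the huge page
 *	}
 */
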
long hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end,
		pgprot_t newprot, unsigned long cp_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;
	struct hstate *h = hstate_vma(vma);
	long pages = 0, psize = huge_page_size(h);
	bool shared_pmd = false;
	struct mmu_notifier_range range;
	unsigned long last_addr_mask;
	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;

	/*
	 * In the case of shared PMDs, the area to flush could be beyond
	 * start/end. Set range.start/range.end to cover the maximum possible
	 * range if PMD sharing is possible.
	 */
	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
				0, mm, start, end);
	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);

	BUG_ON(address >= end);
	flush_cache_range(vma, range.start, range.end);

	mmu_notifier_invalidate_range_start(&range);
	hugetlb_vma_lock_write(vma);
	i_mmap_lock_write(vma->vm_file->f_mapping);
	last_addr_mask = hugetlb_mask_last_page(h);
	for (; address < end; address += psize) {
		spinlock_t *ptl;
		ptep = hugetlb_walk(vma, address, psize);
		if (!ptep) {
			if (!uffd_wp) {
				address |= last_addr_mask;
				continue;
			}
			/*
			 * Userfaultfd wr-protect requires pgtable
			 * pre-allocations to install pte markers.
			 */
			ptep = huge_pte_alloc(mm, vma, address, psize);
			if (!ptep) {
				pages = -ENOMEM;
				break;
			}
		}
		ptl = huge_pte_lock(h, mm, ptep);
		if (huge_pmd_unshare(mm, vma, address, ptep)) {
			/*
			 * When uffd-wp is enabled on the vma, unshare
			 * shouldn't happen at all. Warn if it happens
			 * for any reason.
			 */
			WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
			pages++;
			spin_unlock(ptl);
			shared_pmd = true;
			address |= last_addr_mask;
			continue;
		}
		pte = huge_ptep_get(ptep);
		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
			/* Nothing to do. */
		} else if (unlikely(is_hugetlb_entry_migration(pte))) {
			swp_entry_t entry = pte_to_swp_entry(pte);
			struct page *page = pfn_swap_entry_to_page(entry);
			pte_t newpte = pte;

			if (is_writable_migration_entry(entry)) {
				if (PageAnon(page))
					entry = make_readable_exclusive_migration_entry(
								swp_offset(entry));
				else
					entry = make_readable_migration_entry(
								swp_offset(entry));
				newpte = swp_entry_to_pte(entry);
				pages++;
			}

			if (uffd_wp)
				newpte = pte_swp_mkuffd_wp(newpte);
			else if (uffd_wp_resolve)
				newpte = pte_swp_clear_uffd_wp(newpte);
			if (!pte_same(pte, newpte))
				set_huge_pte_at(mm, address, ptep, newpte, psize);
		} else if (unlikely(is_pte_marker(pte))) {
			/* No other markers apply for now. */
			WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
			if (uffd_wp_resolve)
				/* Safe to modify directly (non-present->none). */
				huge_pte_clear(mm, address, ptep, psize);
		} else if (!huge_pte_none(pte)) {
			pte_t old_pte;
			unsigned int shift = huge_page_shift(hstate_vma(vma));

			old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
			pte = huge_pte_modify(old_pte, newprot);
			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
			if (uffd_wp)
				pte = huge_pte_mkuffd_wp(pte);
			else if (uffd_wp_resolve)
				pte = huge_pte_clear_uffd_wp(pte);
			huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
			pages++;
		} else {
			/* None pte */
			if (unlikely(uffd_wp))
				/* Safe to modify directly (none->non-present). */
				set_huge_pte_at(mm, address, ptep,
						make_pte_marker(PTE_MARKER_UFFD_WP),
						psize);
		}
		spin_unlock(ptl);
	}
	/*
	 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
	 * may have cleared our pud entry and done put_page on the page table:
	 * once we release i_mmap_rwsem, another task can do the final put_page
	 * and that page table can then be reused and filled with junk. If we
	 * actually did unshare a page of pmds, flush the range corresponding
	 * to the pud.
	 */
	if (shared_pmd)
		flush_hugetlb_tlb_range(vma, range.start, range.end);
	else
		flush_hugetlb_tlb_range(vma, start, end);
	/*
	 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() because
	 * we are downgrading page table protection, not changing it to point
	 * to a new page.
	 *
	 * See Documentation/mm/mmu_notifier.rst
	 */
	i_mmap_unlock_write(vma->vm_file->f_mapping);
	hugetlb_vma_unlock_write(vma);
	mmu_notifier_invalidate_range_end(&range);

	return pages > 0 ? (pages << h->order) : pages;
}

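/*
 * Illustrative note (assumption, not part of this file): the return value
 * above is expressed in base pages rather than huge pages so that generic
 * protection-change callers can account it uniformly. For example, with a
 * 2 MiB hstate (h->order == 9 on x86-64), write-protecting 3 huge PTEs
 * returns 3 << 9 == 1536 base pages, while a negative value (e.g. -ENOMEM
 * from the pte-marker pre-allocation above) is propagated as an error.
 */
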
/* Return true if reservation was successful, false otherwise.  */
bool hugetlb_reserve_pages(struct inode *inode,
					long from, long to,
					struct vm_area_struct *vma,
					vm_flags_t vm_flags)
{
	long chg = -1, add = -1;
	struct hstate *h = hstate_inode(inode);
	struct hugepage_subpool *spool = subpool_inode(inode);
	struct resv_map *resv_map;
	struct hugetlb_cgroup *h_cg = NULL;
	long gbl_reserve, regions_needed = 0;

	/* This should never happen */
	if (from > to) {
		VM_WARN(1, "%s called with a negative range\n", __func__);
		return false;
	}

	/*
	 * vma specific semaphore used for pmd sharing and fault/truncation
	 * synchronization
	 */
	hugetlb_vma_lock_alloc(vma);

	/*
	 * Only apply hugepage reservation if asked. At fault time, an
	 * attempt will be made for VM_NORESERVE to allocate a page
	 * without using reserves
	 */
	if (vm_flags & VM_NORESERVE)
		return true;

	/*
	 * Shared mappings base their reservation on the number of pages that
	 * are already allocated on behalf of the file. Private mappings need
	 * to reserve the full area even if read-only as mprotect() may be
	 * called to make the mapping read-write. Assume !vma is a shm mapping
	 */
	if (!vma || vma->vm_flags & VM_MAYSHARE) {
		/*
		 * resv_map can not be NULL as hugetlb_reserve_pages is only
		 * called for inodes for which resv_maps were created (see
		 * hugetlbfs_get_inode).
		 */
		resv_map = inode_resv_map(inode);

		chg = region_chg(resv_map, from, to, &regions_needed);
	} else {
		/* Private mapping. */
		resv_map = resv_map_alloc();
		if (!resv_map)
			goto out_err;

		chg = to - from;

		set_vma_resv_map(vma, resv_map);
		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
	}

	if (chg < 0)
		goto out_err;

	if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
				chg * pages_per_huge_page(h), &h_cg) < 0)
		goto out_err;

	if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
		/* For private mappings, the hugetlb_cgroup uncharge info hangs
		 * off the resv_map.
		 */
		resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
	}

	/*
	 * There must be enough pages in the subpool for the mapping. If
	 * the subpool has a minimum size, there may be some global
	 * reservations already in place (gbl_reserve).
	 */
	gbl_reserve = hugepage_subpool_get_pages(spool, chg);
	if (gbl_reserve < 0)
		goto out_uncharge_cgroup;

	/*
	 * Check enough hugepages are available for the reservation.
	 * Hand the pages back to the subpool if there are not enough.
	 */
	if (hugetlb_acct_memory(h, gbl_reserve) < 0)
		goto out_put_pages;

	/*
	 * Account for the reservations made. Shared mappings record regions
	 * that have reservations as they are shared by multiple VMAs.
	 * When the last VMA disappears, the region map says how much
	 * the reservation was and the page cache tells how much of
	 * the reservation was consumed. Private mappings are per-VMA and
	 * only the consumed reservations are tracked. When the VMA
	 * disappears, the original reservation is the VMA size and the
	 * consumed reservations are stored in the map. Hence, nothing
	 * else has to be done for private mappings here
	 */
	if (!vma || vma->vm_flags & VM_MAYSHARE) {
		add = region_add(resv_map, from, to, regions_needed, h, h_cg);

		if (unlikely(add < 0)) {
			hugetlb_acct_memory(h, -gbl_reserve);
			goto out_put_pages;
		} else if (unlikely(chg > add)) {
			/*
			 * pages in this range were added to the reserve
			 * map between region_chg and region_add.  This
			 * indicates a race with alloc_hugetlb_folio.  Adjust
			 * the subpool and reserve counts modified above
			 * based on the difference.
			 */
			long rsv_adjust;

			/*
			 * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
			 * reference to h_cg->css. See comment below for detail.
			 */
			hugetlb_cgroup_uncharge_cgroup_rsvd(
				hstate_index(h),
				(chg - add) * pages_per_huge_page(h), h_cg);

			rsv_adjust = hugepage_subpool_put_pages(spool,
								chg - add);
			hugetlb_acct_memory(h, -rsv_adjust);
		} else if (h_cg) {
			/*
			 * The file_regions will hold their own reference to
			 * h_cg->css. So we should release the reference held
			 * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
			 * done.
			 */
			hugetlb_cgroup_put_rsvd_cgroup(h_cg);
		}
	}
	return true;

out_put_pages:
	/* put back original number of pages, chg */
	(void)hugepage_subpool_put_pages(spool, chg);
out_uncharge_cgroup:
	hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
					    chg * pages_per_huge_page(h), h_cg);
out_err:
	hugetlb_vma_lock_free(vma);
	if (!vma || vma->vm_flags & VM_MAYSHARE)
		/* Only call region_abort if the region_chg succeeded but the
		 * region_add failed or didn't run.
		 */
		if (chg >= 0 && add < 0)
			region_abort(resv_map, from, to, regions_needed);
	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		kref_put(&resv_map->refs, resv_map_release);
		set_vma_resv_map(vma, NULL);
	}
	return false;
}

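/*
 * Worked example (illustrative, hypothetical numbers) for the chg > add race
 * handled in hugetlb_reserve_pages() above: suppose region_chg() reported
 * chg = 4 huge pages to reserve, but by the time region_add() ran a racing
 * alloc_hugetlb_folio() had already added one page of this range to the
 * reserve map, so add = 3. The extra page was charged to the reservation
 * cgroup, taken from the subpool and accounted in the hstate, so the code
 * gives exactly chg - add = 1 page back on each of those counters: one page
 * of cgroup rsvd uncharge, one page returned to the subpool, and the
 * resulting global adjustment passed to hugetlb_acct_memory().
 */
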
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
								long freed)
{
	struct hstate *h = hstate_inode(inode);
	struct resv_map *resv_map = inode_resv_map(inode);
	long chg = 0;
	struct hugepage_subpool *spool = subpool_inode(inode);
	long gbl_reserve;

	/*
	 * Since this routine can be called in the evict inode path for all
	 * hugetlbfs inodes, resv_map could be NULL.
	 */
	if (resv_map) {
		chg = region_del(resv_map, start, end);
		/*
		 * region_del() can fail in the rare case where a region
		 * must be split and another region descriptor can not be
		 * allocated. If end == LONG_MAX, it will not fail.
		 */
		if (chg < 0)
			return chg;
	}

	spin_lock(&inode->i_lock);
	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
	spin_unlock(&inode->i_lock);

	/*
	 * If the subpool has a minimum size, the number of global
	 * reservations to be released may be adjusted.
	 *
	 * Note that !resv_map implies freed == 0. So (chg - freed)
	 * won't go negative.
	 */
	gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
	hugetlb_acct_memory(h, -gbl_reserve);

	return 0;
}

#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
static unsigned long page_table_shareable(struct vm_area_struct *svma,
				struct vm_area_struct *vma,
				unsigned long addr, pgoff_t idx)
{
	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
				svma->vm_start;
	unsigned long sbase = saddr & PUD_MASK;
	unsigned long s_end = sbase + PUD_SIZE;

	/* Allow segments to share if only one is marked locked */
	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;

	/*
	 * match the virtual addresses, permission and the alignment of the
	 * page table page.
	 *
	 * Also, vma_lock (vm_private_data) is required for sharing.
	 */
	if (pmd_index(addr) != pmd_index(saddr) ||
	    vm_flags != svm_flags ||
	    !range_in_vma(svma, sbase, s_end) ||
	    !svma->vm_private_data)
		return 0;

	return saddr;
}

bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
	unsigned long start = addr & PUD_MASK;
	unsigned long end = start + PUD_SIZE;

#ifdef CONFIG_USERFAULTFD
	if (uffd_disable_huge_pmd_share(vma))
		return false;
#endif
	/*
	 * check on proper vm_flags and page table alignment
	 */
	if (!(vma->vm_flags & VM_MAYSHARE))
		return false;
	if (!vma->vm_private_data)	/* vma lock required for sharing */
		return false;
	if (!range_in_vma(vma, start, end))
		return false;
	return true;
}

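/*
 * Worked example (illustrative, hypothetical numbers): two mappings of the
 * same hugetlbfs file can share a PMD page only if they map the same file
 * range at the same PUD-aligned position. For a faulting address addr in
 * vma, idx is the file page index; page_table_shareable() translates that
 * index back into svma's address space:
 *
 *	saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + svma->vm_start;
 *
 * e.g. with idx = 0x40200 (file offset 0x40200000), svma->vm_pgoff = 0 and
 * svma->vm_start = 0x80000000, saddr = 0xC0200000. Sharing is then allowed
 * only if addr and saddr fall in the same PMD slot of their PUD-sized
 * blocks, the (non-mlock) vm_flags match, the whole PUD-aligned block lies
 * inside svma, and svma has a hugetlb vma_lock.
 */
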
/*
 * Determine if start,end range within vma could be mapped by shared pmd.
 * If yes, adjust start and end to cover range associated with possible
 * shared pmd mappings.
 */
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
				unsigned long *start, unsigned long *end)
{
	unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
		v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);

	/*
	 * vma needs to span at least one aligned PUD size, and the range
	 * must be at least partially within it.
	 */
	if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
		(*end <= v_start) || (*start >= v_end))
		return;

	/* Extend the range to be PUD aligned for a worst case scenario */
	if (*start > v_start)
		*start = ALIGN_DOWN(*start, PUD_SIZE);

	if (*end < v_end)
		*end = ALIGN(*end, PUD_SIZE);
}

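/*
 * Worked example (illustrative, x86-64 where PUD_SIZE is 1 GiB): for a
 * MAP_SHARED hugetlb vma covering [1 GiB, 4 GiB) and a request covering
 * [2.5 GiB, 2.75 GiB), the helper above widens the range to [2 GiB, 3 GiB),
 * i.e. the whole PUD-sized block, because a shared PMD page covering that
 * block may have to be unshared and flushed as one unit. A vma that never
 * spans a full, aligned PUD block is left untouched.
 */
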
/*
 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
 * and returns the corresponding pte. While this is not necessary for the
 * !shared pmd case because we can allocate the pmd later as well, it makes the
 * code much cleaner. pmd allocation is essential for the shared case because
 * pud has to be populated inside the same i_mmap_rwsem section - otherwise
 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
 * bad pmd for sharing.
 */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
		      unsigned long addr, pud_t *pud)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
			vma->vm_pgoff;
	struct vm_area_struct *svma;
	unsigned long saddr;
	pte_t *spte = NULL;
	pte_t *pte;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
		if (svma == vma)
			continue;

		saddr = page_table_shareable(svma, vma, addr, idx);
		if (saddr) {
			spte = hugetlb_walk(svma, saddr,
					    vma_mmu_pagesize(svma));
			if (spte) {
				get_page(virt_to_page(spte));
				break;
			}
		}
	}

	if (!spte)
		goto out;

	spin_lock(&mm->page_table_lock);
	if (pud_none(*pud)) {
		pud_populate(mm, pud,
				(pmd_t *)((unsigned long)spte & PAGE_MASK));
		mm_inc_nr_pmds(mm);
	} else {
		put_page(virt_to_page(spte));
	}
	spin_unlock(&mm->page_table_lock);
out:
	pte = (pte_t *)pmd_alloc(mm, pud, addr);
	i_mmap_unlock_read(mapping);
	return pte;
}

/*
 * unmap huge page backed by shared pte.
 *
 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
 * indicated by page_count > 1, unmap is achieved by clearing pud and
 * decrementing the ref count. If count == 1, the pte page is not shared.
 *
 * Called with page table lock held.
 *
 * returns: 1 successfully unmapped a shared pte page
 *	    0 the underlying pte page is not shared, or it is the last user
 */
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
					unsigned long addr, pte_t *ptep)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	p4d_t *p4d = p4d_offset(pgd, addr);
	pud_t *pud = pud_offset(p4d, addr);

	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
	hugetlb_vma_assert_locked(vma);
	BUG_ON(page_count(virt_to_page(ptep)) == 0);
	if (page_count(virt_to_page(ptep)) == 1)
		return 0;

	pud_clear(pud);
	put_page(virt_to_page(ptep));
	mm_dec_nr_pmds(mm);
	return 1;
}

#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */

pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
		      unsigned long addr, pud_t *pud)
{
	return NULL;
}

int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
					unsigned long addr, pte_t *ptep)
{
	return 0;
}

void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
				unsigned long *start, unsigned long *end)
{
}

bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
	return false;
}
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */

#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long addr, unsigned long sz)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pte_t *pte = NULL;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, addr);
	if (pud) {
		if (sz == PUD_SIZE) {
			pte = (pte_t *)pud;
		} else {
			BUG_ON(sz != PMD_SIZE);
			if (want_pmd_share(vma, addr) && pud_none(*pud))
				pte = huge_pmd_share(mm, vma, addr, pud);
			else
				pte = (pte_t *)pmd_alloc(mm, pud, addr);
		}
	}

	if (pte) {
		pte_t pteval = ptep_get_lockless(pte);

		BUG_ON(pte_present(pteval) && !pte_huge(pteval));
	}

	return pte;
}

/*
 * huge_pte_offset() - Walk the page table to resolve the hugepage
 * entry at address @addr
 *
 * Return: Pointer to page table entry (PUD or PMD) for
 * address @addr, or NULL if a !p*d_present() entry is encountered and the
 * size @sz doesn't match the hugepage size at this level of the page
 * table.
 */
pte_t *huge_pte_offset(struct mm_struct *mm,
		       unsigned long addr, unsigned long sz)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return NULL;

	pud = pud_offset(p4d, addr);
	if (sz == PUD_SIZE)
		/* must be pud huge, non-present or none */
		return (pte_t *)pud;
	if (!pud_present(*pud))
		return NULL;
	/* must have a valid entry and size to go further */

	pmd = pmd_offset(pud, addr);
	/* must be pmd huge, non-present or none */
	return (pte_t *)pmd;
}

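/*
 * Illustrative sketch (assumption, not part of this file): the typical
 * pattern for linearly scanning a hugetlb range with huge_pte_offset() and
 * hugetlb_mask_last_page() (defined just below), mirroring the loop in
 * hugetlb_change_protection() above. When no page table page is present,
 * the mask lets the loop jump to the last huge page covered by that missing
 * page table page instead of probing every entry:
 *
 *	last_addr_mask = hugetlb_mask_last_page(h);
 *	for (; addr < end; addr += huge_page_size(h)) {
 *		pte_t *ptep = huge_pte_offset(mm, addr, huge_page_size(h));
 *
 *		if (!ptep) {
 *			addr |= last_addr_mask;	// skip rest of this pgtable page
 *			continue;
 *		}
 *		// ... inspect or modify the entry under huge_pte_lock() ...
 *	}
 */
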
/*
 * Return a mask that can be used to update an address to the last huge
 * page in a page table page mapping size.  Used to skip non-present
 * page table entries when linearly scanning address ranges.  Architectures
 * with unique huge page to page table relationships can define their own
 * version of this routine.
 */
unsigned long hugetlb_mask_last_page(struct hstate *h)
{
	unsigned long hp_size = huge_page_size(h);

	if (hp_size == PUD_SIZE)
		return P4D_SIZE - PUD_SIZE;
	else if (hp_size == PMD_SIZE)
		return PUD_SIZE - PMD_SIZE;
	else
		return 0UL;
}

#else

/* See description above.  Architectures can provide their own version. */
__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
{
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
	if (huge_page_size(h) == PMD_SIZE)
		return PUD_SIZE - PMD_SIZE;
#endif
	return 0UL;
}

#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */

/*
 * These functions are overridable if your architecture needs its own
 * behavior.
 */
bool isolate_hugetlb(struct folio *folio, struct list_head *list)
{
	bool ret = true;

	spin_lock_irq(&hugetlb_lock);
	if (!folio_test_hugetlb(folio) ||
	    !folio_test_hugetlb_migratable(folio) ||
	    !folio_try_get(folio)) {
		ret = false;
		goto unlock;
	}
	folio_clear_hugetlb_migratable(folio);
	list_move_tail(&folio->lru, list);
unlock:
	spin_unlock_irq(&hugetlb_lock);
	return ret;
}

int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison)
{
	int ret = 0;

	*hugetlb = false;
	spin_lock_irq(&hugetlb_lock);
	if (folio_test_hugetlb(folio)) {
		*hugetlb = true;
		if (folio_test_hugetlb_freed(folio))
			ret = 0;
		else if (folio_test_hugetlb_migratable(folio) || unpoison)
			ret = folio_try_get(folio);
		else
			ret = -EBUSY;
	}
	spin_unlock_irq(&hugetlb_lock);
	return ret;
}

int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
				bool *migratable_cleared)
{
	int ret;

	spin_lock_irq(&hugetlb_lock);
	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
	spin_unlock_irq(&hugetlb_lock);
	return ret;
}

void folio_putback_active_hugetlb(struct folio *folio)
{
	spin_lock_irq(&hugetlb_lock);
	folio_set_hugetlb_migratable(folio);
	list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist);
	spin_unlock_irq(&hugetlb_lock);
	folio_put(folio);
}

void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason)
{
	struct hstate *h = folio_hstate(old_folio);

	hugetlb_cgroup_migrate(old_folio, new_folio);
	set_page_owner_migrate_reason(&new_folio->page, reason);

	/*
	 * transfer temporary state of the new hugetlb folio. This is
	 * the reverse of other transitions because the new folio is going
	 * to be final while the old one will be freed, so the old folio
	 * takes over the temporary status.
	 *
	 * Also note that we have to transfer the per-node surplus state
	 * here as well, otherwise the global surplus count will not match
	 * the per-node counts.
	 */
	if (folio_test_hugetlb_temporary(new_folio)) {
		int old_nid = folio_nid(old_folio);
		int new_nid = folio_nid(new_folio);

		folio_set_hugetlb_temporary(old_folio);
		folio_clear_hugetlb_temporary(new_folio);

		/*
		 * There is no need to transfer the per-node surplus state
		 * when we do not cross the node.
		 */
		if (new_nid == old_nid)
			return;
		spin_lock_irq(&hugetlb_lock);
		if (h->surplus_huge_pages_node[old_nid]) {
			h->surplus_huge_pages_node[old_nid]--;
			h->surplus_huge_pages_node[new_nid]++;
		}
		spin_unlock_irq(&hugetlb_lock);
	}
}

static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
				 unsigned long start,
				 unsigned long end)
{
	struct hstate *h = hstate_vma(vma);
	unsigned long sz = huge_page_size(h);
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	unsigned long address;
	spinlock_t *ptl;
	pte_t *ptep;

	if (!(vma->vm_flags & VM_MAYSHARE))
		return;

	if (start >= end)
		return;

	flush_cache_range(vma, start, end);
	/*
	 * No need to call adjust_range_if_pmd_sharing_possible(), because
	 * we have already done the PUD_SIZE alignment.
	 */
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				start, end);
	mmu_notifier_invalidate_range_start(&range);
	hugetlb_vma_lock_write(vma);
	i_mmap_lock_write(vma->vm_file->f_mapping);
	for (address = start; address < end; address += PUD_SIZE) {
		ptep = hugetlb_walk(vma, address, sz);
		if (!ptep)
			continue;
		ptl = huge_pte_lock(h, mm, ptep);
		huge_pmd_unshare(mm, vma, address, ptep);
		spin_unlock(ptl);
	}
	flush_hugetlb_tlb_range(vma, start, end);
	i_mmap_unlock_write(vma->vm_file->f_mapping);
	hugetlb_vma_unlock_write(vma);
	/*
	 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
	 * Documentation/mm/mmu_notifier.rst.
	 */
	mmu_notifier_invalidate_range_end(&range);
}

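/*
 * Worked example (illustrative, x86-64 where PUD_SIZE is 1 GiB) for the
 * PUD-aligned range passed in by hugetlb_unshare_all_pmds() below: for a
 * shared hugetlb vma covering [1 GiB, 3.5 GiB), vm_start aligns up to 1 GiB
 * and vm_end aligns down to 3 GiB, so only the fully covered blocks at
 * 1 GiB and 2 GiB are walked and unshared; a vma that does not span a whole
 * aligned PUD block yields an empty range and nothing is done. Callers use
 * this, for instance, when per-process page table state (such as userfaultfd
 * wr-protect markers) must no longer be shared; the exact call sites are
 * outside this file and are mentioned here only as an assumption.
 */
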
/*
 * This function will unconditionally remove all the shared pmd pgtable entries
 * within the specific vma for a hugetlbfs memory range.
 */
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
			ALIGN_DOWN(vma->vm_end, PUD_SIZE));
}

#ifdef CONFIG_CMA
static bool cma_reserve_called __initdata;

static int __init cmdline_parse_hugetlb_cma(char *p)
{
	int nid, count = 0;
	unsigned long tmp;
	char *s = p;

	while (*s) {
		if (sscanf(s, "%lu%n", &tmp, &count) != 1)
			break;

		if (s[count] == ':') {
			if (tmp >= MAX_NUMNODES)
				break;
			nid = array_index_nospec(tmp, MAX_NUMNODES);

			s += count + 1;
			tmp = memparse(s, &s);
			hugetlb_cma_size_in_node[nid] = tmp;
			hugetlb_cma_size += tmp;

			/*
			 * Skip the separator if there is one, otherwise
			 * stop parsing.
			 */
			if (*s == ',')
				s++;
			else
				break;
		} else {
			hugetlb_cma_size = memparse(p, &p);
			break;
		}
	}

	return 0;
}

early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);

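/*
 * Illustrative usage of the parser above (the values are hypothetical): the
 * "hugetlb_cma=" early parameter accepts either a single total size or a
 * comma-separated list of <node>:<size> pairs on the kernel command line:
 *
 *	hugetlb_cma=4G			// one 4 GiB total, spread over online nodes
 *	hugetlb_cma=0:2G,1:2G		// 2 GiB reserved on node 0 and on node 1
 *
 * Sizes go through memparse(), so the usual K/M/G suffixes apply. The
 * reservation itself happens later in hugetlb_cma_reserve() below.
 */
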
void __init hugetlb_cma_reserve(int order)
{
	unsigned long size, reserved, per_node;
	bool node_specific_cma_alloc = false;
	int nid;

	cma_reserve_called = true;

	if (!hugetlb_cma_size)
		return;

	for (nid = 0; nid < MAX_NUMNODES; nid++) {
		if (hugetlb_cma_size_in_node[nid] == 0)
			continue;

		if (!node_online(nid)) {
			pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
			hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
			hugetlb_cma_size_in_node[nid] = 0;
			continue;
		}

		if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
			pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
				nid, (PAGE_SIZE << order) / SZ_1M);
			hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
			hugetlb_cma_size_in_node[nid] = 0;
		} else {
			node_specific_cma_alloc = true;
		}
	}

	/* Validate the CMA size again in case some invalid nodes were specified. */
	if (!hugetlb_cma_size)
		return;

	if (hugetlb_cma_size < (PAGE_SIZE << order)) {
		pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
			(PAGE_SIZE << order) / SZ_1M);
		hugetlb_cma_size = 0;
		return;
	}

	if (!node_specific_cma_alloc) {
		/*
		 * If a 3 GB area is requested on a machine with 4 NUMA nodes,
		 * let's allocate 1 GB on the first three nodes and ignore the
		 * last one.
		 */
		per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
		pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
			hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
	}

	reserved = 0;
	for_each_online_node(nid) {
		int res;
		char name[CMA_MAX_NAME];

		if (node_specific_cma_alloc) {
			if (hugetlb_cma_size_in_node[nid] == 0)
				continue;

			size = hugetlb_cma_size_in_node[nid];
		} else {
			size = min(per_node, hugetlb_cma_size - reserved);
		}

		size = round_up(size, PAGE_SIZE << order);

		snprintf(name, sizeof(name), "hugetlb%d", nid);
		/*
		 * Note that 'order per bit' is based on smallest size that
		 * may be returned to CMA allocator in the case of
		 * huge page demotion.
		 */
		res = cma_declare_contiguous_nid(0, size, 0,
					PAGE_SIZE << HUGETLB_PAGE_ORDER,
					0, false, name,
					&hugetlb_cma[nid], nid);
		if (res) {
			pr_warn("hugetlb_cma: reservation failed: err %d, node %d\n",
				res, nid);
			continue;
		}

		reserved += size;
		pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
			size / SZ_1M, nid);

		if (reserved >= hugetlb_cma_size)
			break;
	}

	if (!reserved)
		/*
		 * hugetlb_cma_size is used to determine if allocations from
		 * cma are possible.  Set to zero if no cma regions are set up.
		 */
		hugetlb_cma_size = 0;
}

static void __init hugetlb_cma_check(void)
{
	if (!hugetlb_cma_size || cma_reserve_called)
		return;

	pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
}

#endif /* CONFIG_CMA */