18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * Generic hugetlb support. 48c2ecf20Sopenharmony_ci * (C) Nadia Yvette Chambers, April 2004 58c2ecf20Sopenharmony_ci */ 68c2ecf20Sopenharmony_ci#include <linux/list.h> 78c2ecf20Sopenharmony_ci#include <linux/init.h> 88c2ecf20Sopenharmony_ci#include <linux/mm.h> 98c2ecf20Sopenharmony_ci#include <linux/seq_file.h> 108c2ecf20Sopenharmony_ci#include <linux/sysctl.h> 118c2ecf20Sopenharmony_ci#include <linux/highmem.h> 128c2ecf20Sopenharmony_ci#include <linux/mmu_notifier.h> 138c2ecf20Sopenharmony_ci#include <linux/nodemask.h> 148c2ecf20Sopenharmony_ci#include <linux/pagemap.h> 158c2ecf20Sopenharmony_ci#include <linux/mempolicy.h> 168c2ecf20Sopenharmony_ci#include <linux/compiler.h> 178c2ecf20Sopenharmony_ci#include <linux/cpuset.h> 188c2ecf20Sopenharmony_ci#include <linux/mutex.h> 198c2ecf20Sopenharmony_ci#include <linux/memblock.h> 208c2ecf20Sopenharmony_ci#include <linux/sysfs.h> 218c2ecf20Sopenharmony_ci#include <linux/slab.h> 228c2ecf20Sopenharmony_ci#include <linux/sched/mm.h> 238c2ecf20Sopenharmony_ci#include <linux/mmdebug.h> 248c2ecf20Sopenharmony_ci#include <linux/sched/signal.h> 258c2ecf20Sopenharmony_ci#include <linux/rmap.h> 268c2ecf20Sopenharmony_ci#include <linux/string_helpers.h> 278c2ecf20Sopenharmony_ci#include <linux/swap.h> 288c2ecf20Sopenharmony_ci#include <linux/swapops.h> 298c2ecf20Sopenharmony_ci#include <linux/jhash.h> 308c2ecf20Sopenharmony_ci#include <linux/numa.h> 318c2ecf20Sopenharmony_ci#include <linux/llist.h> 328c2ecf20Sopenharmony_ci#include <linux/cma.h> 338c2ecf20Sopenharmony_ci 348c2ecf20Sopenharmony_ci#include <asm/page.h> 358c2ecf20Sopenharmony_ci#include <asm/pgalloc.h> 368c2ecf20Sopenharmony_ci#include <asm/tlb.h> 378c2ecf20Sopenharmony_ci 388c2ecf20Sopenharmony_ci#include <linux/io.h> 398c2ecf20Sopenharmony_ci#include <linux/hugetlb.h> 408c2ecf20Sopenharmony_ci#include <linux/hugetlb_cgroup.h> 418c2ecf20Sopenharmony_ci#include <linux/node.h> 428c2ecf20Sopenharmony_ci#include <linux/userfaultfd_k.h> 438c2ecf20Sopenharmony_ci#include <linux/page_owner.h> 448c2ecf20Sopenharmony_ci#include "internal.h" 458c2ecf20Sopenharmony_ci 468c2ecf20Sopenharmony_ciint hugetlb_max_hstate __read_mostly; 478c2ecf20Sopenharmony_ciunsigned int default_hstate_idx; 488c2ecf20Sopenharmony_cistruct hstate hstates[HUGE_MAX_HSTATE]; 498c2ecf20Sopenharmony_ci 508c2ecf20Sopenharmony_ci#ifdef CONFIG_CMA 518c2ecf20Sopenharmony_cistatic struct cma *hugetlb_cma[MAX_NUMNODES]; 528c2ecf20Sopenharmony_ci#endif 538c2ecf20Sopenharmony_cistatic unsigned long hugetlb_cma_size __initdata; 548c2ecf20Sopenharmony_ci 558c2ecf20Sopenharmony_ci/* 568c2ecf20Sopenharmony_ci * Minimum page order among possible hugepage sizes, set to a proper value 578c2ecf20Sopenharmony_ci * at boot time. 588c2ecf20Sopenharmony_ci */ 598c2ecf20Sopenharmony_cistatic unsigned int minimum_order __read_mostly = UINT_MAX; 608c2ecf20Sopenharmony_ci 618c2ecf20Sopenharmony_ci__initdata LIST_HEAD(huge_boot_pages); 628c2ecf20Sopenharmony_ci 638c2ecf20Sopenharmony_ci/* for command line parsing */ 648c2ecf20Sopenharmony_cistatic struct hstate * __initdata parsed_hstate; 658c2ecf20Sopenharmony_cistatic unsigned long __initdata default_hstate_max_huge_pages; 668c2ecf20Sopenharmony_cistatic bool __initdata parsed_valid_hugepagesz = true; 678c2ecf20Sopenharmony_cistatic bool __initdata parsed_default_hugepagesz; 688c2ecf20Sopenharmony_ci 698c2ecf20Sopenharmony_ci/* 708c2ecf20Sopenharmony_ci * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, 718c2ecf20Sopenharmony_ci * free_huge_pages, and surplus_huge_pages. 728c2ecf20Sopenharmony_ci */ 738c2ecf20Sopenharmony_ciDEFINE_SPINLOCK(hugetlb_lock); 748c2ecf20Sopenharmony_ci 758c2ecf20Sopenharmony_ci/* 768c2ecf20Sopenharmony_ci * Serializes faults on the same logical page. This is used to 778c2ecf20Sopenharmony_ci * prevent spurious OOMs when the hugepage pool is fully utilized. 788c2ecf20Sopenharmony_ci */ 798c2ecf20Sopenharmony_cistatic int num_fault_mutexes; 808c2ecf20Sopenharmony_cistruct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; 818c2ecf20Sopenharmony_ci 828c2ecf20Sopenharmony_cistatic inline bool PageHugeFreed(struct page *head) 838c2ecf20Sopenharmony_ci{ 848c2ecf20Sopenharmony_ci return page_private(head + 4) == -1UL; 858c2ecf20Sopenharmony_ci} 868c2ecf20Sopenharmony_ci 878c2ecf20Sopenharmony_cistatic inline void SetPageHugeFreed(struct page *head) 888c2ecf20Sopenharmony_ci{ 898c2ecf20Sopenharmony_ci set_page_private(head + 4, -1UL); 908c2ecf20Sopenharmony_ci} 918c2ecf20Sopenharmony_ci 928c2ecf20Sopenharmony_cistatic inline void ClearPageHugeFreed(struct page *head) 938c2ecf20Sopenharmony_ci{ 948c2ecf20Sopenharmony_ci set_page_private(head + 4, 0); 958c2ecf20Sopenharmony_ci} 968c2ecf20Sopenharmony_ci 978c2ecf20Sopenharmony_ci/* Forward declaration */ 988c2ecf20Sopenharmony_cistatic int hugetlb_acct_memory(struct hstate *h, long delta); 998c2ecf20Sopenharmony_ci 1008c2ecf20Sopenharmony_cistatic inline void unlock_or_release_subpool(struct hugepage_subpool *spool) 1018c2ecf20Sopenharmony_ci{ 1028c2ecf20Sopenharmony_ci bool free = (spool->count == 0) && (spool->used_hpages == 0); 1038c2ecf20Sopenharmony_ci 1048c2ecf20Sopenharmony_ci spin_unlock(&spool->lock); 1058c2ecf20Sopenharmony_ci 1068c2ecf20Sopenharmony_ci /* If no pages are used, and no other handles to the subpool 1078c2ecf20Sopenharmony_ci * remain, give up any reservations based on minimum size and 1088c2ecf20Sopenharmony_ci * free the subpool */ 1098c2ecf20Sopenharmony_ci if (free) { 1108c2ecf20Sopenharmony_ci if (spool->min_hpages != -1) 1118c2ecf20Sopenharmony_ci hugetlb_acct_memory(spool->hstate, 1128c2ecf20Sopenharmony_ci -spool->min_hpages); 1138c2ecf20Sopenharmony_ci kfree(spool); 1148c2ecf20Sopenharmony_ci } 1158c2ecf20Sopenharmony_ci} 1168c2ecf20Sopenharmony_ci 1178c2ecf20Sopenharmony_cistruct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, 1188c2ecf20Sopenharmony_ci long min_hpages) 1198c2ecf20Sopenharmony_ci{ 1208c2ecf20Sopenharmony_ci struct hugepage_subpool *spool; 1218c2ecf20Sopenharmony_ci 1228c2ecf20Sopenharmony_ci spool = kzalloc(sizeof(*spool), GFP_KERNEL); 1238c2ecf20Sopenharmony_ci if (!spool) 1248c2ecf20Sopenharmony_ci return NULL; 1258c2ecf20Sopenharmony_ci 1268c2ecf20Sopenharmony_ci spin_lock_init(&spool->lock); 1278c2ecf20Sopenharmony_ci spool->count = 1; 1288c2ecf20Sopenharmony_ci spool->max_hpages = max_hpages; 1298c2ecf20Sopenharmony_ci spool->hstate = h; 1308c2ecf20Sopenharmony_ci spool->min_hpages = min_hpages; 1318c2ecf20Sopenharmony_ci 1328c2ecf20Sopenharmony_ci if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { 1338c2ecf20Sopenharmony_ci kfree(spool); 1348c2ecf20Sopenharmony_ci return NULL; 1358c2ecf20Sopenharmony_ci } 1368c2ecf20Sopenharmony_ci spool->rsv_hpages = min_hpages; 1378c2ecf20Sopenharmony_ci 1388c2ecf20Sopenharmony_ci return spool; 1398c2ecf20Sopenharmony_ci} 1408c2ecf20Sopenharmony_ci 1418c2ecf20Sopenharmony_civoid hugepage_put_subpool(struct hugepage_subpool *spool) 1428c2ecf20Sopenharmony_ci{ 1438c2ecf20Sopenharmony_ci spin_lock(&spool->lock); 1448c2ecf20Sopenharmony_ci BUG_ON(!spool->count); 1458c2ecf20Sopenharmony_ci spool->count--; 1468c2ecf20Sopenharmony_ci unlock_or_release_subpool(spool); 1478c2ecf20Sopenharmony_ci} 1488c2ecf20Sopenharmony_ci 1498c2ecf20Sopenharmony_ci/* 1508c2ecf20Sopenharmony_ci * Subpool accounting for allocating and reserving pages. 1518c2ecf20Sopenharmony_ci * Return -ENOMEM if there are not enough resources to satisfy the 1528c2ecf20Sopenharmony_ci * request. Otherwise, return the number of pages by which the 1538c2ecf20Sopenharmony_ci * global pools must be adjusted (upward). The returned value may 1548c2ecf20Sopenharmony_ci * only be different than the passed value (delta) in the case where 1558c2ecf20Sopenharmony_ci * a subpool minimum size must be maintained. 1568c2ecf20Sopenharmony_ci */ 1578c2ecf20Sopenharmony_cistatic long hugepage_subpool_get_pages(struct hugepage_subpool *spool, 1588c2ecf20Sopenharmony_ci long delta) 1598c2ecf20Sopenharmony_ci{ 1608c2ecf20Sopenharmony_ci long ret = delta; 1618c2ecf20Sopenharmony_ci 1628c2ecf20Sopenharmony_ci if (!spool) 1638c2ecf20Sopenharmony_ci return ret; 1648c2ecf20Sopenharmony_ci 1658c2ecf20Sopenharmony_ci spin_lock(&spool->lock); 1668c2ecf20Sopenharmony_ci 1678c2ecf20Sopenharmony_ci if (spool->max_hpages != -1) { /* maximum size accounting */ 1688c2ecf20Sopenharmony_ci if ((spool->used_hpages + delta) <= spool->max_hpages) 1698c2ecf20Sopenharmony_ci spool->used_hpages += delta; 1708c2ecf20Sopenharmony_ci else { 1718c2ecf20Sopenharmony_ci ret = -ENOMEM; 1728c2ecf20Sopenharmony_ci goto unlock_ret; 1738c2ecf20Sopenharmony_ci } 1748c2ecf20Sopenharmony_ci } 1758c2ecf20Sopenharmony_ci 1768c2ecf20Sopenharmony_ci /* minimum size accounting */ 1778c2ecf20Sopenharmony_ci if (spool->min_hpages != -1 && spool->rsv_hpages) { 1788c2ecf20Sopenharmony_ci if (delta > spool->rsv_hpages) { 1798c2ecf20Sopenharmony_ci /* 1808c2ecf20Sopenharmony_ci * Asking for more reserves than those already taken on 1818c2ecf20Sopenharmony_ci * behalf of subpool. Return difference. 1828c2ecf20Sopenharmony_ci */ 1838c2ecf20Sopenharmony_ci ret = delta - spool->rsv_hpages; 1848c2ecf20Sopenharmony_ci spool->rsv_hpages = 0; 1858c2ecf20Sopenharmony_ci } else { 1868c2ecf20Sopenharmony_ci ret = 0; /* reserves already accounted for */ 1878c2ecf20Sopenharmony_ci spool->rsv_hpages -= delta; 1888c2ecf20Sopenharmony_ci } 1898c2ecf20Sopenharmony_ci } 1908c2ecf20Sopenharmony_ci 1918c2ecf20Sopenharmony_ciunlock_ret: 1928c2ecf20Sopenharmony_ci spin_unlock(&spool->lock); 1938c2ecf20Sopenharmony_ci return ret; 1948c2ecf20Sopenharmony_ci} 1958c2ecf20Sopenharmony_ci 1968c2ecf20Sopenharmony_ci/* 1978c2ecf20Sopenharmony_ci * Subpool accounting for freeing and unreserving pages. 1988c2ecf20Sopenharmony_ci * Return the number of global page reservations that must be dropped. 1998c2ecf20Sopenharmony_ci * The return value may only be different than the passed value (delta) 2008c2ecf20Sopenharmony_ci * in the case where a subpool minimum size must be maintained. 2018c2ecf20Sopenharmony_ci */ 2028c2ecf20Sopenharmony_cistatic long hugepage_subpool_put_pages(struct hugepage_subpool *spool, 2038c2ecf20Sopenharmony_ci long delta) 2048c2ecf20Sopenharmony_ci{ 2058c2ecf20Sopenharmony_ci long ret = delta; 2068c2ecf20Sopenharmony_ci 2078c2ecf20Sopenharmony_ci if (!spool) 2088c2ecf20Sopenharmony_ci return delta; 2098c2ecf20Sopenharmony_ci 2108c2ecf20Sopenharmony_ci spin_lock(&spool->lock); 2118c2ecf20Sopenharmony_ci 2128c2ecf20Sopenharmony_ci if (spool->max_hpages != -1) /* maximum size accounting */ 2138c2ecf20Sopenharmony_ci spool->used_hpages -= delta; 2148c2ecf20Sopenharmony_ci 2158c2ecf20Sopenharmony_ci /* minimum size accounting */ 2168c2ecf20Sopenharmony_ci if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { 2178c2ecf20Sopenharmony_ci if (spool->rsv_hpages + delta <= spool->min_hpages) 2188c2ecf20Sopenharmony_ci ret = 0; 2198c2ecf20Sopenharmony_ci else 2208c2ecf20Sopenharmony_ci ret = spool->rsv_hpages + delta - spool->min_hpages; 2218c2ecf20Sopenharmony_ci 2228c2ecf20Sopenharmony_ci spool->rsv_hpages += delta; 2238c2ecf20Sopenharmony_ci if (spool->rsv_hpages > spool->min_hpages) 2248c2ecf20Sopenharmony_ci spool->rsv_hpages = spool->min_hpages; 2258c2ecf20Sopenharmony_ci } 2268c2ecf20Sopenharmony_ci 2278c2ecf20Sopenharmony_ci /* 2288c2ecf20Sopenharmony_ci * If hugetlbfs_put_super couldn't free spool due to an outstanding 2298c2ecf20Sopenharmony_ci * quota reference, free it now. 2308c2ecf20Sopenharmony_ci */ 2318c2ecf20Sopenharmony_ci unlock_or_release_subpool(spool); 2328c2ecf20Sopenharmony_ci 2338c2ecf20Sopenharmony_ci return ret; 2348c2ecf20Sopenharmony_ci} 2358c2ecf20Sopenharmony_ci 2368c2ecf20Sopenharmony_cistatic inline struct hugepage_subpool *subpool_inode(struct inode *inode) 2378c2ecf20Sopenharmony_ci{ 2388c2ecf20Sopenharmony_ci return HUGETLBFS_SB(inode->i_sb)->spool; 2398c2ecf20Sopenharmony_ci} 2408c2ecf20Sopenharmony_ci 2418c2ecf20Sopenharmony_cistatic inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) 2428c2ecf20Sopenharmony_ci{ 2438c2ecf20Sopenharmony_ci return subpool_inode(file_inode(vma->vm_file)); 2448c2ecf20Sopenharmony_ci} 2458c2ecf20Sopenharmony_ci 2468c2ecf20Sopenharmony_ci/* Helper that removes a struct file_region from the resv_map cache and returns 2478c2ecf20Sopenharmony_ci * it for use. 2488c2ecf20Sopenharmony_ci */ 2498c2ecf20Sopenharmony_cistatic struct file_region * 2508c2ecf20Sopenharmony_ciget_file_region_entry_from_cache(struct resv_map *resv, long from, long to) 2518c2ecf20Sopenharmony_ci{ 2528c2ecf20Sopenharmony_ci struct file_region *nrg = NULL; 2538c2ecf20Sopenharmony_ci 2548c2ecf20Sopenharmony_ci VM_BUG_ON(resv->region_cache_count <= 0); 2558c2ecf20Sopenharmony_ci 2568c2ecf20Sopenharmony_ci resv->region_cache_count--; 2578c2ecf20Sopenharmony_ci nrg = list_first_entry(&resv->region_cache, struct file_region, link); 2588c2ecf20Sopenharmony_ci list_del(&nrg->link); 2598c2ecf20Sopenharmony_ci 2608c2ecf20Sopenharmony_ci nrg->from = from; 2618c2ecf20Sopenharmony_ci nrg->to = to; 2628c2ecf20Sopenharmony_ci 2638c2ecf20Sopenharmony_ci return nrg; 2648c2ecf20Sopenharmony_ci} 2658c2ecf20Sopenharmony_ci 2668c2ecf20Sopenharmony_cistatic void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg, 2678c2ecf20Sopenharmony_ci struct file_region *rg) 2688c2ecf20Sopenharmony_ci{ 2698c2ecf20Sopenharmony_ci#ifdef CONFIG_CGROUP_HUGETLB 2708c2ecf20Sopenharmony_ci nrg->reservation_counter = rg->reservation_counter; 2718c2ecf20Sopenharmony_ci nrg->css = rg->css; 2728c2ecf20Sopenharmony_ci if (rg->css) 2738c2ecf20Sopenharmony_ci css_get(rg->css); 2748c2ecf20Sopenharmony_ci#endif 2758c2ecf20Sopenharmony_ci} 2768c2ecf20Sopenharmony_ci 2778c2ecf20Sopenharmony_ci/* Helper that records hugetlb_cgroup uncharge info. */ 2788c2ecf20Sopenharmony_cistatic void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, 2798c2ecf20Sopenharmony_ci struct hstate *h, 2808c2ecf20Sopenharmony_ci struct resv_map *resv, 2818c2ecf20Sopenharmony_ci struct file_region *nrg) 2828c2ecf20Sopenharmony_ci{ 2838c2ecf20Sopenharmony_ci#ifdef CONFIG_CGROUP_HUGETLB 2848c2ecf20Sopenharmony_ci if (h_cg) { 2858c2ecf20Sopenharmony_ci nrg->reservation_counter = 2868c2ecf20Sopenharmony_ci &h_cg->rsvd_hugepage[hstate_index(h)]; 2878c2ecf20Sopenharmony_ci nrg->css = &h_cg->css; 2888c2ecf20Sopenharmony_ci /* 2898c2ecf20Sopenharmony_ci * The caller will hold exactly one h_cg->css reference for the 2908c2ecf20Sopenharmony_ci * whole contiguous reservation region. But this area might be 2918c2ecf20Sopenharmony_ci * scattered when there are already some file_regions reside in 2928c2ecf20Sopenharmony_ci * it. As a result, many file_regions may share only one css 2938c2ecf20Sopenharmony_ci * reference. In order to ensure that one file_region must hold 2948c2ecf20Sopenharmony_ci * exactly one h_cg->css reference, we should do css_get for 2958c2ecf20Sopenharmony_ci * each file_region and leave the reference held by caller 2968c2ecf20Sopenharmony_ci * untouched. 2978c2ecf20Sopenharmony_ci */ 2988c2ecf20Sopenharmony_ci css_get(&h_cg->css); 2998c2ecf20Sopenharmony_ci if (!resv->pages_per_hpage) 3008c2ecf20Sopenharmony_ci resv->pages_per_hpage = pages_per_huge_page(h); 3018c2ecf20Sopenharmony_ci /* pages_per_hpage should be the same for all entries in 3028c2ecf20Sopenharmony_ci * a resv_map. 3038c2ecf20Sopenharmony_ci */ 3048c2ecf20Sopenharmony_ci VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h)); 3058c2ecf20Sopenharmony_ci } else { 3068c2ecf20Sopenharmony_ci nrg->reservation_counter = NULL; 3078c2ecf20Sopenharmony_ci nrg->css = NULL; 3088c2ecf20Sopenharmony_ci } 3098c2ecf20Sopenharmony_ci#endif 3108c2ecf20Sopenharmony_ci} 3118c2ecf20Sopenharmony_ci 3128c2ecf20Sopenharmony_cistatic void put_uncharge_info(struct file_region *rg) 3138c2ecf20Sopenharmony_ci{ 3148c2ecf20Sopenharmony_ci#ifdef CONFIG_CGROUP_HUGETLB 3158c2ecf20Sopenharmony_ci if (rg->css) 3168c2ecf20Sopenharmony_ci css_put(rg->css); 3178c2ecf20Sopenharmony_ci#endif 3188c2ecf20Sopenharmony_ci} 3198c2ecf20Sopenharmony_ci 3208c2ecf20Sopenharmony_cistatic bool has_same_uncharge_info(struct file_region *rg, 3218c2ecf20Sopenharmony_ci struct file_region *org) 3228c2ecf20Sopenharmony_ci{ 3238c2ecf20Sopenharmony_ci#ifdef CONFIG_CGROUP_HUGETLB 3248c2ecf20Sopenharmony_ci return rg && org && 3258c2ecf20Sopenharmony_ci rg->reservation_counter == org->reservation_counter && 3268c2ecf20Sopenharmony_ci rg->css == org->css; 3278c2ecf20Sopenharmony_ci 3288c2ecf20Sopenharmony_ci#else 3298c2ecf20Sopenharmony_ci return true; 3308c2ecf20Sopenharmony_ci#endif 3318c2ecf20Sopenharmony_ci} 3328c2ecf20Sopenharmony_ci 3338c2ecf20Sopenharmony_cistatic void coalesce_file_region(struct resv_map *resv, struct file_region *rg) 3348c2ecf20Sopenharmony_ci{ 3358c2ecf20Sopenharmony_ci struct file_region *nrg = NULL, *prg = NULL; 3368c2ecf20Sopenharmony_ci 3378c2ecf20Sopenharmony_ci prg = list_prev_entry(rg, link); 3388c2ecf20Sopenharmony_ci if (&prg->link != &resv->regions && prg->to == rg->from && 3398c2ecf20Sopenharmony_ci has_same_uncharge_info(prg, rg)) { 3408c2ecf20Sopenharmony_ci prg->to = rg->to; 3418c2ecf20Sopenharmony_ci 3428c2ecf20Sopenharmony_ci list_del(&rg->link); 3438c2ecf20Sopenharmony_ci put_uncharge_info(rg); 3448c2ecf20Sopenharmony_ci kfree(rg); 3458c2ecf20Sopenharmony_ci 3468c2ecf20Sopenharmony_ci rg = prg; 3478c2ecf20Sopenharmony_ci } 3488c2ecf20Sopenharmony_ci 3498c2ecf20Sopenharmony_ci nrg = list_next_entry(rg, link); 3508c2ecf20Sopenharmony_ci if (&nrg->link != &resv->regions && nrg->from == rg->to && 3518c2ecf20Sopenharmony_ci has_same_uncharge_info(nrg, rg)) { 3528c2ecf20Sopenharmony_ci nrg->from = rg->from; 3538c2ecf20Sopenharmony_ci 3548c2ecf20Sopenharmony_ci list_del(&rg->link); 3558c2ecf20Sopenharmony_ci put_uncharge_info(rg); 3568c2ecf20Sopenharmony_ci kfree(rg); 3578c2ecf20Sopenharmony_ci } 3588c2ecf20Sopenharmony_ci} 3598c2ecf20Sopenharmony_ci 3608c2ecf20Sopenharmony_ci/* 3618c2ecf20Sopenharmony_ci * Must be called with resv->lock held. 3628c2ecf20Sopenharmony_ci * 3638c2ecf20Sopenharmony_ci * Calling this with regions_needed != NULL will count the number of pages 3648c2ecf20Sopenharmony_ci * to be added but will not modify the linked list. And regions_needed will 3658c2ecf20Sopenharmony_ci * indicate the number of file_regions needed in the cache to carry out to add 3668c2ecf20Sopenharmony_ci * the regions for this range. 3678c2ecf20Sopenharmony_ci */ 3688c2ecf20Sopenharmony_cistatic long add_reservation_in_range(struct resv_map *resv, long f, long t, 3698c2ecf20Sopenharmony_ci struct hugetlb_cgroup *h_cg, 3708c2ecf20Sopenharmony_ci struct hstate *h, long *regions_needed) 3718c2ecf20Sopenharmony_ci{ 3728c2ecf20Sopenharmony_ci long add = 0; 3738c2ecf20Sopenharmony_ci struct list_head *head = &resv->regions; 3748c2ecf20Sopenharmony_ci long last_accounted_offset = f; 3758c2ecf20Sopenharmony_ci struct file_region *rg = NULL, *trg = NULL, *nrg = NULL; 3768c2ecf20Sopenharmony_ci 3778c2ecf20Sopenharmony_ci if (regions_needed) 3788c2ecf20Sopenharmony_ci *regions_needed = 0; 3798c2ecf20Sopenharmony_ci 3808c2ecf20Sopenharmony_ci /* In this loop, we essentially handle an entry for the range 3818c2ecf20Sopenharmony_ci * [last_accounted_offset, rg->from), at every iteration, with some 3828c2ecf20Sopenharmony_ci * bounds checking. 3838c2ecf20Sopenharmony_ci */ 3848c2ecf20Sopenharmony_ci list_for_each_entry_safe(rg, trg, head, link) { 3858c2ecf20Sopenharmony_ci /* Skip irrelevant regions that start before our range. */ 3868c2ecf20Sopenharmony_ci if (rg->from < f) { 3878c2ecf20Sopenharmony_ci /* If this region ends after the last accounted offset, 3888c2ecf20Sopenharmony_ci * then we need to update last_accounted_offset. 3898c2ecf20Sopenharmony_ci */ 3908c2ecf20Sopenharmony_ci if (rg->to > last_accounted_offset) 3918c2ecf20Sopenharmony_ci last_accounted_offset = rg->to; 3928c2ecf20Sopenharmony_ci continue; 3938c2ecf20Sopenharmony_ci } 3948c2ecf20Sopenharmony_ci 3958c2ecf20Sopenharmony_ci /* When we find a region that starts beyond our range, we've 3968c2ecf20Sopenharmony_ci * finished. 3978c2ecf20Sopenharmony_ci */ 3988c2ecf20Sopenharmony_ci if (rg->from > t) 3998c2ecf20Sopenharmony_ci break; 4008c2ecf20Sopenharmony_ci 4018c2ecf20Sopenharmony_ci /* Add an entry for last_accounted_offset -> rg->from, and 4028c2ecf20Sopenharmony_ci * update last_accounted_offset. 4038c2ecf20Sopenharmony_ci */ 4048c2ecf20Sopenharmony_ci if (rg->from > last_accounted_offset) { 4058c2ecf20Sopenharmony_ci add += rg->from - last_accounted_offset; 4068c2ecf20Sopenharmony_ci if (!regions_needed) { 4078c2ecf20Sopenharmony_ci nrg = get_file_region_entry_from_cache( 4088c2ecf20Sopenharmony_ci resv, last_accounted_offset, rg->from); 4098c2ecf20Sopenharmony_ci record_hugetlb_cgroup_uncharge_info(h_cg, h, 4108c2ecf20Sopenharmony_ci resv, nrg); 4118c2ecf20Sopenharmony_ci list_add(&nrg->link, rg->link.prev); 4128c2ecf20Sopenharmony_ci coalesce_file_region(resv, nrg); 4138c2ecf20Sopenharmony_ci } else 4148c2ecf20Sopenharmony_ci *regions_needed += 1; 4158c2ecf20Sopenharmony_ci } 4168c2ecf20Sopenharmony_ci 4178c2ecf20Sopenharmony_ci last_accounted_offset = rg->to; 4188c2ecf20Sopenharmony_ci } 4198c2ecf20Sopenharmony_ci 4208c2ecf20Sopenharmony_ci /* Handle the case where our range extends beyond 4218c2ecf20Sopenharmony_ci * last_accounted_offset. 4228c2ecf20Sopenharmony_ci */ 4238c2ecf20Sopenharmony_ci if (last_accounted_offset < t) { 4248c2ecf20Sopenharmony_ci add += t - last_accounted_offset; 4258c2ecf20Sopenharmony_ci if (!regions_needed) { 4268c2ecf20Sopenharmony_ci nrg = get_file_region_entry_from_cache( 4278c2ecf20Sopenharmony_ci resv, last_accounted_offset, t); 4288c2ecf20Sopenharmony_ci record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg); 4298c2ecf20Sopenharmony_ci list_add(&nrg->link, rg->link.prev); 4308c2ecf20Sopenharmony_ci coalesce_file_region(resv, nrg); 4318c2ecf20Sopenharmony_ci } else 4328c2ecf20Sopenharmony_ci *regions_needed += 1; 4338c2ecf20Sopenharmony_ci } 4348c2ecf20Sopenharmony_ci 4358c2ecf20Sopenharmony_ci VM_BUG_ON(add < 0); 4368c2ecf20Sopenharmony_ci return add; 4378c2ecf20Sopenharmony_ci} 4388c2ecf20Sopenharmony_ci 4398c2ecf20Sopenharmony_ci/* Must be called with resv->lock acquired. Will drop lock to allocate entries. 4408c2ecf20Sopenharmony_ci */ 4418c2ecf20Sopenharmony_cistatic int allocate_file_region_entries(struct resv_map *resv, 4428c2ecf20Sopenharmony_ci int regions_needed) 4438c2ecf20Sopenharmony_ci __must_hold(&resv->lock) 4448c2ecf20Sopenharmony_ci{ 4458c2ecf20Sopenharmony_ci struct list_head allocated_regions; 4468c2ecf20Sopenharmony_ci int to_allocate = 0, i = 0; 4478c2ecf20Sopenharmony_ci struct file_region *trg = NULL, *rg = NULL; 4488c2ecf20Sopenharmony_ci 4498c2ecf20Sopenharmony_ci VM_BUG_ON(regions_needed < 0); 4508c2ecf20Sopenharmony_ci 4518c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&allocated_regions); 4528c2ecf20Sopenharmony_ci 4538c2ecf20Sopenharmony_ci /* 4548c2ecf20Sopenharmony_ci * Check for sufficient descriptors in the cache to accommodate 4558c2ecf20Sopenharmony_ci * the number of in progress add operations plus regions_needed. 4568c2ecf20Sopenharmony_ci * 4578c2ecf20Sopenharmony_ci * This is a while loop because when we drop the lock, some other call 4588c2ecf20Sopenharmony_ci * to region_add or region_del may have consumed some region_entries, 4598c2ecf20Sopenharmony_ci * so we keep looping here until we finally have enough entries for 4608c2ecf20Sopenharmony_ci * (adds_in_progress + regions_needed). 4618c2ecf20Sopenharmony_ci */ 4628c2ecf20Sopenharmony_ci while (resv->region_cache_count < 4638c2ecf20Sopenharmony_ci (resv->adds_in_progress + regions_needed)) { 4648c2ecf20Sopenharmony_ci to_allocate = resv->adds_in_progress + regions_needed - 4658c2ecf20Sopenharmony_ci resv->region_cache_count; 4668c2ecf20Sopenharmony_ci 4678c2ecf20Sopenharmony_ci /* At this point, we should have enough entries in the cache 4688c2ecf20Sopenharmony_ci * for all the existings adds_in_progress. We should only be 4698c2ecf20Sopenharmony_ci * needing to allocate for regions_needed. 4708c2ecf20Sopenharmony_ci */ 4718c2ecf20Sopenharmony_ci VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); 4728c2ecf20Sopenharmony_ci 4738c2ecf20Sopenharmony_ci spin_unlock(&resv->lock); 4748c2ecf20Sopenharmony_ci for (i = 0; i < to_allocate; i++) { 4758c2ecf20Sopenharmony_ci trg = kmalloc(sizeof(*trg), GFP_KERNEL); 4768c2ecf20Sopenharmony_ci if (!trg) 4778c2ecf20Sopenharmony_ci goto out_of_memory; 4788c2ecf20Sopenharmony_ci list_add(&trg->link, &allocated_regions); 4798c2ecf20Sopenharmony_ci } 4808c2ecf20Sopenharmony_ci 4818c2ecf20Sopenharmony_ci spin_lock(&resv->lock); 4828c2ecf20Sopenharmony_ci 4838c2ecf20Sopenharmony_ci list_splice(&allocated_regions, &resv->region_cache); 4848c2ecf20Sopenharmony_ci resv->region_cache_count += to_allocate; 4858c2ecf20Sopenharmony_ci } 4868c2ecf20Sopenharmony_ci 4878c2ecf20Sopenharmony_ci return 0; 4888c2ecf20Sopenharmony_ci 4898c2ecf20Sopenharmony_ciout_of_memory: 4908c2ecf20Sopenharmony_ci list_for_each_entry_safe(rg, trg, &allocated_regions, link) { 4918c2ecf20Sopenharmony_ci list_del(&rg->link); 4928c2ecf20Sopenharmony_ci kfree(rg); 4938c2ecf20Sopenharmony_ci } 4948c2ecf20Sopenharmony_ci return -ENOMEM; 4958c2ecf20Sopenharmony_ci} 4968c2ecf20Sopenharmony_ci 4978c2ecf20Sopenharmony_ci/* 4988c2ecf20Sopenharmony_ci * Add the huge page range represented by [f, t) to the reserve 4998c2ecf20Sopenharmony_ci * map. Regions will be taken from the cache to fill in this range. 5008c2ecf20Sopenharmony_ci * Sufficient regions should exist in the cache due to the previous 5018c2ecf20Sopenharmony_ci * call to region_chg with the same range, but in some cases the cache will not 5028c2ecf20Sopenharmony_ci * have sufficient entries due to races with other code doing region_add or 5038c2ecf20Sopenharmony_ci * region_del. The extra needed entries will be allocated. 5048c2ecf20Sopenharmony_ci * 5058c2ecf20Sopenharmony_ci * regions_needed is the out value provided by a previous call to region_chg. 5068c2ecf20Sopenharmony_ci * 5078c2ecf20Sopenharmony_ci * Return the number of new huge pages added to the map. This number is greater 5088c2ecf20Sopenharmony_ci * than or equal to zero. If file_region entries needed to be allocated for 5098c2ecf20Sopenharmony_ci * this operation and we were not able to allocate, it returns -ENOMEM. 5108c2ecf20Sopenharmony_ci * region_add of regions of length 1 never allocate file_regions and cannot 5118c2ecf20Sopenharmony_ci * fail; region_chg will always allocate at least 1 entry and a region_add for 5128c2ecf20Sopenharmony_ci * 1 page will only require at most 1 entry. 5138c2ecf20Sopenharmony_ci */ 5148c2ecf20Sopenharmony_cistatic long region_add(struct resv_map *resv, long f, long t, 5158c2ecf20Sopenharmony_ci long in_regions_needed, struct hstate *h, 5168c2ecf20Sopenharmony_ci struct hugetlb_cgroup *h_cg) 5178c2ecf20Sopenharmony_ci{ 5188c2ecf20Sopenharmony_ci long add = 0, actual_regions_needed = 0; 5198c2ecf20Sopenharmony_ci 5208c2ecf20Sopenharmony_ci spin_lock(&resv->lock); 5218c2ecf20Sopenharmony_ciretry: 5228c2ecf20Sopenharmony_ci 5238c2ecf20Sopenharmony_ci /* Count how many regions are actually needed to execute this add. */ 5248c2ecf20Sopenharmony_ci add_reservation_in_range(resv, f, t, NULL, NULL, 5258c2ecf20Sopenharmony_ci &actual_regions_needed); 5268c2ecf20Sopenharmony_ci 5278c2ecf20Sopenharmony_ci /* 5288c2ecf20Sopenharmony_ci * Check for sufficient descriptors in the cache to accommodate 5298c2ecf20Sopenharmony_ci * this add operation. Note that actual_regions_needed may be greater 5308c2ecf20Sopenharmony_ci * than in_regions_needed, as the resv_map may have been modified since 5318c2ecf20Sopenharmony_ci * the region_chg call. In this case, we need to make sure that we 5328c2ecf20Sopenharmony_ci * allocate extra entries, such that we have enough for all the 5338c2ecf20Sopenharmony_ci * existing adds_in_progress, plus the excess needed for this 5348c2ecf20Sopenharmony_ci * operation. 5358c2ecf20Sopenharmony_ci */ 5368c2ecf20Sopenharmony_ci if (actual_regions_needed > in_regions_needed && 5378c2ecf20Sopenharmony_ci resv->region_cache_count < 5388c2ecf20Sopenharmony_ci resv->adds_in_progress + 5398c2ecf20Sopenharmony_ci (actual_regions_needed - in_regions_needed)) { 5408c2ecf20Sopenharmony_ci /* region_add operation of range 1 should never need to 5418c2ecf20Sopenharmony_ci * allocate file_region entries. 5428c2ecf20Sopenharmony_ci */ 5438c2ecf20Sopenharmony_ci VM_BUG_ON(t - f <= 1); 5448c2ecf20Sopenharmony_ci 5458c2ecf20Sopenharmony_ci if (allocate_file_region_entries( 5468c2ecf20Sopenharmony_ci resv, actual_regions_needed - in_regions_needed)) { 5478c2ecf20Sopenharmony_ci return -ENOMEM; 5488c2ecf20Sopenharmony_ci } 5498c2ecf20Sopenharmony_ci 5508c2ecf20Sopenharmony_ci goto retry; 5518c2ecf20Sopenharmony_ci } 5528c2ecf20Sopenharmony_ci 5538c2ecf20Sopenharmony_ci add = add_reservation_in_range(resv, f, t, h_cg, h, NULL); 5548c2ecf20Sopenharmony_ci 5558c2ecf20Sopenharmony_ci resv->adds_in_progress -= in_regions_needed; 5568c2ecf20Sopenharmony_ci 5578c2ecf20Sopenharmony_ci spin_unlock(&resv->lock); 5588c2ecf20Sopenharmony_ci VM_BUG_ON(add < 0); 5598c2ecf20Sopenharmony_ci return add; 5608c2ecf20Sopenharmony_ci} 5618c2ecf20Sopenharmony_ci 5628c2ecf20Sopenharmony_ci/* 5638c2ecf20Sopenharmony_ci * Examine the existing reserve map and determine how many 5648c2ecf20Sopenharmony_ci * huge pages in the specified range [f, t) are NOT currently 5658c2ecf20Sopenharmony_ci * represented. This routine is called before a subsequent 5668c2ecf20Sopenharmony_ci * call to region_add that will actually modify the reserve 5678c2ecf20Sopenharmony_ci * map to add the specified range [f, t). region_chg does 5688c2ecf20Sopenharmony_ci * not change the number of huge pages represented by the 5698c2ecf20Sopenharmony_ci * map. A number of new file_region structures is added to the cache as a 5708c2ecf20Sopenharmony_ci * placeholder, for the subsequent region_add call to use. At least 1 5718c2ecf20Sopenharmony_ci * file_region structure is added. 5728c2ecf20Sopenharmony_ci * 5738c2ecf20Sopenharmony_ci * out_regions_needed is the number of regions added to the 5748c2ecf20Sopenharmony_ci * resv->adds_in_progress. This value needs to be provided to a follow up call 5758c2ecf20Sopenharmony_ci * to region_add or region_abort for proper accounting. 5768c2ecf20Sopenharmony_ci * 5778c2ecf20Sopenharmony_ci * Returns the number of huge pages that need to be added to the existing 5788c2ecf20Sopenharmony_ci * reservation map for the range [f, t). This number is greater or equal to 5798c2ecf20Sopenharmony_ci * zero. -ENOMEM is returned if a new file_region structure or cache entry 5808c2ecf20Sopenharmony_ci * is needed and can not be allocated. 5818c2ecf20Sopenharmony_ci */ 5828c2ecf20Sopenharmony_cistatic long region_chg(struct resv_map *resv, long f, long t, 5838c2ecf20Sopenharmony_ci long *out_regions_needed) 5848c2ecf20Sopenharmony_ci{ 5858c2ecf20Sopenharmony_ci long chg = 0; 5868c2ecf20Sopenharmony_ci 5878c2ecf20Sopenharmony_ci spin_lock(&resv->lock); 5888c2ecf20Sopenharmony_ci 5898c2ecf20Sopenharmony_ci /* Count how many hugepages in this range are NOT represented. */ 5908c2ecf20Sopenharmony_ci chg = add_reservation_in_range(resv, f, t, NULL, NULL, 5918c2ecf20Sopenharmony_ci out_regions_needed); 5928c2ecf20Sopenharmony_ci 5938c2ecf20Sopenharmony_ci if (*out_regions_needed == 0) 5948c2ecf20Sopenharmony_ci *out_regions_needed = 1; 5958c2ecf20Sopenharmony_ci 5968c2ecf20Sopenharmony_ci if (allocate_file_region_entries(resv, *out_regions_needed)) 5978c2ecf20Sopenharmony_ci return -ENOMEM; 5988c2ecf20Sopenharmony_ci 5998c2ecf20Sopenharmony_ci resv->adds_in_progress += *out_regions_needed; 6008c2ecf20Sopenharmony_ci 6018c2ecf20Sopenharmony_ci spin_unlock(&resv->lock); 6028c2ecf20Sopenharmony_ci return chg; 6038c2ecf20Sopenharmony_ci} 6048c2ecf20Sopenharmony_ci 6058c2ecf20Sopenharmony_ci/* 6068c2ecf20Sopenharmony_ci * Abort the in progress add operation. The adds_in_progress field 6078c2ecf20Sopenharmony_ci * of the resv_map keeps track of the operations in progress between 6088c2ecf20Sopenharmony_ci * calls to region_chg and region_add. Operations are sometimes 6098c2ecf20Sopenharmony_ci * aborted after the call to region_chg. In such cases, region_abort 6108c2ecf20Sopenharmony_ci * is called to decrement the adds_in_progress counter. regions_needed 6118c2ecf20Sopenharmony_ci * is the value returned by the region_chg call, it is used to decrement 6128c2ecf20Sopenharmony_ci * the adds_in_progress counter. 6138c2ecf20Sopenharmony_ci * 6148c2ecf20Sopenharmony_ci * NOTE: The range arguments [f, t) are not needed or used in this 6158c2ecf20Sopenharmony_ci * routine. They are kept to make reading the calling code easier as 6168c2ecf20Sopenharmony_ci * arguments will match the associated region_chg call. 6178c2ecf20Sopenharmony_ci */ 6188c2ecf20Sopenharmony_cistatic void region_abort(struct resv_map *resv, long f, long t, 6198c2ecf20Sopenharmony_ci long regions_needed) 6208c2ecf20Sopenharmony_ci{ 6218c2ecf20Sopenharmony_ci spin_lock(&resv->lock); 6228c2ecf20Sopenharmony_ci VM_BUG_ON(!resv->region_cache_count); 6238c2ecf20Sopenharmony_ci resv->adds_in_progress -= regions_needed; 6248c2ecf20Sopenharmony_ci spin_unlock(&resv->lock); 6258c2ecf20Sopenharmony_ci} 6268c2ecf20Sopenharmony_ci 6278c2ecf20Sopenharmony_ci/* 6288c2ecf20Sopenharmony_ci * Delete the specified range [f, t) from the reserve map. If the 6298c2ecf20Sopenharmony_ci * t parameter is LONG_MAX, this indicates that ALL regions after f 6308c2ecf20Sopenharmony_ci * should be deleted. Locate the regions which intersect [f, t) 6318c2ecf20Sopenharmony_ci * and either trim, delete or split the existing regions. 6328c2ecf20Sopenharmony_ci * 6338c2ecf20Sopenharmony_ci * Returns the number of huge pages deleted from the reserve map. 6348c2ecf20Sopenharmony_ci * In the normal case, the return value is zero or more. In the 6358c2ecf20Sopenharmony_ci * case where a region must be split, a new region descriptor must 6368c2ecf20Sopenharmony_ci * be allocated. If the allocation fails, -ENOMEM will be returned. 6378c2ecf20Sopenharmony_ci * NOTE: If the parameter t == LONG_MAX, then we will never split 6388c2ecf20Sopenharmony_ci * a region and possibly return -ENOMEM. Callers specifying 6398c2ecf20Sopenharmony_ci * t == LONG_MAX do not need to check for -ENOMEM error. 6408c2ecf20Sopenharmony_ci */ 6418c2ecf20Sopenharmony_cistatic long region_del(struct resv_map *resv, long f, long t) 6428c2ecf20Sopenharmony_ci{ 6438c2ecf20Sopenharmony_ci struct list_head *head = &resv->regions; 6448c2ecf20Sopenharmony_ci struct file_region *rg, *trg; 6458c2ecf20Sopenharmony_ci struct file_region *nrg = NULL; 6468c2ecf20Sopenharmony_ci long del = 0; 6478c2ecf20Sopenharmony_ci 6488c2ecf20Sopenharmony_ciretry: 6498c2ecf20Sopenharmony_ci spin_lock(&resv->lock); 6508c2ecf20Sopenharmony_ci list_for_each_entry_safe(rg, trg, head, link) { 6518c2ecf20Sopenharmony_ci /* 6528c2ecf20Sopenharmony_ci * Skip regions before the range to be deleted. file_region 6538c2ecf20Sopenharmony_ci * ranges are normally of the form [from, to). However, there 6548c2ecf20Sopenharmony_ci * may be a "placeholder" entry in the map which is of the form 6558c2ecf20Sopenharmony_ci * (from, to) with from == to. Check for placeholder entries 6568c2ecf20Sopenharmony_ci * at the beginning of the range to be deleted. 6578c2ecf20Sopenharmony_ci */ 6588c2ecf20Sopenharmony_ci if (rg->to <= f && (rg->to != rg->from || rg->to != f)) 6598c2ecf20Sopenharmony_ci continue; 6608c2ecf20Sopenharmony_ci 6618c2ecf20Sopenharmony_ci if (rg->from >= t) 6628c2ecf20Sopenharmony_ci break; 6638c2ecf20Sopenharmony_ci 6648c2ecf20Sopenharmony_ci if (f > rg->from && t < rg->to) { /* Must split region */ 6658c2ecf20Sopenharmony_ci /* 6668c2ecf20Sopenharmony_ci * Check for an entry in the cache before dropping 6678c2ecf20Sopenharmony_ci * lock and attempting allocation. 6688c2ecf20Sopenharmony_ci */ 6698c2ecf20Sopenharmony_ci if (!nrg && 6708c2ecf20Sopenharmony_ci resv->region_cache_count > resv->adds_in_progress) { 6718c2ecf20Sopenharmony_ci nrg = list_first_entry(&resv->region_cache, 6728c2ecf20Sopenharmony_ci struct file_region, 6738c2ecf20Sopenharmony_ci link); 6748c2ecf20Sopenharmony_ci list_del(&nrg->link); 6758c2ecf20Sopenharmony_ci resv->region_cache_count--; 6768c2ecf20Sopenharmony_ci } 6778c2ecf20Sopenharmony_ci 6788c2ecf20Sopenharmony_ci if (!nrg) { 6798c2ecf20Sopenharmony_ci spin_unlock(&resv->lock); 6808c2ecf20Sopenharmony_ci nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); 6818c2ecf20Sopenharmony_ci if (!nrg) 6828c2ecf20Sopenharmony_ci return -ENOMEM; 6838c2ecf20Sopenharmony_ci goto retry; 6848c2ecf20Sopenharmony_ci } 6858c2ecf20Sopenharmony_ci 6868c2ecf20Sopenharmony_ci del += t - f; 6878c2ecf20Sopenharmony_ci hugetlb_cgroup_uncharge_file_region( 6888c2ecf20Sopenharmony_ci resv, rg, t - f, false); 6898c2ecf20Sopenharmony_ci 6908c2ecf20Sopenharmony_ci /* New entry for end of split region */ 6918c2ecf20Sopenharmony_ci nrg->from = t; 6928c2ecf20Sopenharmony_ci nrg->to = rg->to; 6938c2ecf20Sopenharmony_ci 6948c2ecf20Sopenharmony_ci copy_hugetlb_cgroup_uncharge_info(nrg, rg); 6958c2ecf20Sopenharmony_ci 6968c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&nrg->link); 6978c2ecf20Sopenharmony_ci 6988c2ecf20Sopenharmony_ci /* Original entry is trimmed */ 6998c2ecf20Sopenharmony_ci rg->to = f; 7008c2ecf20Sopenharmony_ci 7018c2ecf20Sopenharmony_ci list_add(&nrg->link, &rg->link); 7028c2ecf20Sopenharmony_ci nrg = NULL; 7038c2ecf20Sopenharmony_ci break; 7048c2ecf20Sopenharmony_ci } 7058c2ecf20Sopenharmony_ci 7068c2ecf20Sopenharmony_ci if (f <= rg->from && t >= rg->to) { /* Remove entire region */ 7078c2ecf20Sopenharmony_ci del += rg->to - rg->from; 7088c2ecf20Sopenharmony_ci hugetlb_cgroup_uncharge_file_region(resv, rg, 7098c2ecf20Sopenharmony_ci rg->to - rg->from, true); 7108c2ecf20Sopenharmony_ci list_del(&rg->link); 7118c2ecf20Sopenharmony_ci kfree(rg); 7128c2ecf20Sopenharmony_ci continue; 7138c2ecf20Sopenharmony_ci } 7148c2ecf20Sopenharmony_ci 7158c2ecf20Sopenharmony_ci if (f <= rg->from) { /* Trim beginning of region */ 7168c2ecf20Sopenharmony_ci hugetlb_cgroup_uncharge_file_region(resv, rg, 7178c2ecf20Sopenharmony_ci t - rg->from, false); 7188c2ecf20Sopenharmony_ci 7198c2ecf20Sopenharmony_ci del += t - rg->from; 7208c2ecf20Sopenharmony_ci rg->from = t; 7218c2ecf20Sopenharmony_ci } else { /* Trim end of region */ 7228c2ecf20Sopenharmony_ci hugetlb_cgroup_uncharge_file_region(resv, rg, 7238c2ecf20Sopenharmony_ci rg->to - f, false); 7248c2ecf20Sopenharmony_ci 7258c2ecf20Sopenharmony_ci del += rg->to - f; 7268c2ecf20Sopenharmony_ci rg->to = f; 7278c2ecf20Sopenharmony_ci } 7288c2ecf20Sopenharmony_ci } 7298c2ecf20Sopenharmony_ci 7308c2ecf20Sopenharmony_ci spin_unlock(&resv->lock); 7318c2ecf20Sopenharmony_ci kfree(nrg); 7328c2ecf20Sopenharmony_ci return del; 7338c2ecf20Sopenharmony_ci} 7348c2ecf20Sopenharmony_ci 7358c2ecf20Sopenharmony_ci/* 7368c2ecf20Sopenharmony_ci * A rare out of memory error was encountered which prevented removal of 7378c2ecf20Sopenharmony_ci * the reserve map region for a page. The huge page itself was free'ed 7388c2ecf20Sopenharmony_ci * and removed from the page cache. This routine will adjust the subpool 7398c2ecf20Sopenharmony_ci * usage count, and the global reserve count if needed. By incrementing 7408c2ecf20Sopenharmony_ci * these counts, the reserve map entry which could not be deleted will 7418c2ecf20Sopenharmony_ci * appear as a "reserved" entry instead of simply dangling with incorrect 7428c2ecf20Sopenharmony_ci * counts. 7438c2ecf20Sopenharmony_ci */ 7448c2ecf20Sopenharmony_civoid hugetlb_fix_reserve_counts(struct inode *inode) 7458c2ecf20Sopenharmony_ci{ 7468c2ecf20Sopenharmony_ci struct hugepage_subpool *spool = subpool_inode(inode); 7478c2ecf20Sopenharmony_ci long rsv_adjust; 7488c2ecf20Sopenharmony_ci bool reserved = false; 7498c2ecf20Sopenharmony_ci 7508c2ecf20Sopenharmony_ci rsv_adjust = hugepage_subpool_get_pages(spool, 1); 7518c2ecf20Sopenharmony_ci if (rsv_adjust > 0) { 7528c2ecf20Sopenharmony_ci struct hstate *h = hstate_inode(inode); 7538c2ecf20Sopenharmony_ci 7548c2ecf20Sopenharmony_ci if (!hugetlb_acct_memory(h, 1)) 7558c2ecf20Sopenharmony_ci reserved = true; 7568c2ecf20Sopenharmony_ci } else if (!rsv_adjust) { 7578c2ecf20Sopenharmony_ci reserved = true; 7588c2ecf20Sopenharmony_ci } 7598c2ecf20Sopenharmony_ci 7608c2ecf20Sopenharmony_ci if (!reserved) 7618c2ecf20Sopenharmony_ci pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); 7628c2ecf20Sopenharmony_ci} 7638c2ecf20Sopenharmony_ci 7648c2ecf20Sopenharmony_ci/* 7658c2ecf20Sopenharmony_ci * Count and return the number of huge pages in the reserve map 7668c2ecf20Sopenharmony_ci * that intersect with the range [f, t). 7678c2ecf20Sopenharmony_ci */ 7688c2ecf20Sopenharmony_cistatic long region_count(struct resv_map *resv, long f, long t) 7698c2ecf20Sopenharmony_ci{ 7708c2ecf20Sopenharmony_ci struct list_head *head = &resv->regions; 7718c2ecf20Sopenharmony_ci struct file_region *rg; 7728c2ecf20Sopenharmony_ci long chg = 0; 7738c2ecf20Sopenharmony_ci 7748c2ecf20Sopenharmony_ci spin_lock(&resv->lock); 7758c2ecf20Sopenharmony_ci /* Locate each segment we overlap with, and count that overlap. */ 7768c2ecf20Sopenharmony_ci list_for_each_entry(rg, head, link) { 7778c2ecf20Sopenharmony_ci long seg_from; 7788c2ecf20Sopenharmony_ci long seg_to; 7798c2ecf20Sopenharmony_ci 7808c2ecf20Sopenharmony_ci if (rg->to <= f) 7818c2ecf20Sopenharmony_ci continue; 7828c2ecf20Sopenharmony_ci if (rg->from >= t) 7838c2ecf20Sopenharmony_ci break; 7848c2ecf20Sopenharmony_ci 7858c2ecf20Sopenharmony_ci seg_from = max(rg->from, f); 7868c2ecf20Sopenharmony_ci seg_to = min(rg->to, t); 7878c2ecf20Sopenharmony_ci 7888c2ecf20Sopenharmony_ci chg += seg_to - seg_from; 7898c2ecf20Sopenharmony_ci } 7908c2ecf20Sopenharmony_ci spin_unlock(&resv->lock); 7918c2ecf20Sopenharmony_ci 7928c2ecf20Sopenharmony_ci return chg; 7938c2ecf20Sopenharmony_ci} 7948c2ecf20Sopenharmony_ci 7958c2ecf20Sopenharmony_ci/* 7968c2ecf20Sopenharmony_ci * Convert the address within this vma to the page offset within 7978c2ecf20Sopenharmony_ci * the mapping, in pagecache page units; huge pages here. 7988c2ecf20Sopenharmony_ci */ 7998c2ecf20Sopenharmony_cistatic pgoff_t vma_hugecache_offset(struct hstate *h, 8008c2ecf20Sopenharmony_ci struct vm_area_struct *vma, unsigned long address) 8018c2ecf20Sopenharmony_ci{ 8028c2ecf20Sopenharmony_ci return ((address - vma->vm_start) >> huge_page_shift(h)) + 8038c2ecf20Sopenharmony_ci (vma->vm_pgoff >> huge_page_order(h)); 8048c2ecf20Sopenharmony_ci} 8058c2ecf20Sopenharmony_ci 8068c2ecf20Sopenharmony_cipgoff_t linear_hugepage_index(struct vm_area_struct *vma, 8078c2ecf20Sopenharmony_ci unsigned long address) 8088c2ecf20Sopenharmony_ci{ 8098c2ecf20Sopenharmony_ci return vma_hugecache_offset(hstate_vma(vma), vma, address); 8108c2ecf20Sopenharmony_ci} 8118c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(linear_hugepage_index); 8128c2ecf20Sopenharmony_ci 8138c2ecf20Sopenharmony_ci/* 8148c2ecf20Sopenharmony_ci * Return the size of the pages allocated when backing a VMA. In the majority 8158c2ecf20Sopenharmony_ci * cases this will be same size as used by the page table entries. 8168c2ecf20Sopenharmony_ci */ 8178c2ecf20Sopenharmony_ciunsigned long vma_kernel_pagesize(struct vm_area_struct *vma) 8188c2ecf20Sopenharmony_ci{ 8198c2ecf20Sopenharmony_ci if (vma->vm_ops && vma->vm_ops->pagesize) 8208c2ecf20Sopenharmony_ci return vma->vm_ops->pagesize(vma); 8218c2ecf20Sopenharmony_ci return PAGE_SIZE; 8228c2ecf20Sopenharmony_ci} 8238c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(vma_kernel_pagesize); 8248c2ecf20Sopenharmony_ci 8258c2ecf20Sopenharmony_ci/* 8268c2ecf20Sopenharmony_ci * Return the page size being used by the MMU to back a VMA. In the majority 8278c2ecf20Sopenharmony_ci * of cases, the page size used by the kernel matches the MMU size. On 8288c2ecf20Sopenharmony_ci * architectures where it differs, an architecture-specific 'strong' 8298c2ecf20Sopenharmony_ci * version of this symbol is required. 8308c2ecf20Sopenharmony_ci */ 8318c2ecf20Sopenharmony_ci__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) 8328c2ecf20Sopenharmony_ci{ 8338c2ecf20Sopenharmony_ci return vma_kernel_pagesize(vma); 8348c2ecf20Sopenharmony_ci} 8358c2ecf20Sopenharmony_ci 8368c2ecf20Sopenharmony_ci/* 8378c2ecf20Sopenharmony_ci * Flags for MAP_PRIVATE reservations. These are stored in the bottom 8388c2ecf20Sopenharmony_ci * bits of the reservation map pointer, which are always clear due to 8398c2ecf20Sopenharmony_ci * alignment. 8408c2ecf20Sopenharmony_ci */ 8418c2ecf20Sopenharmony_ci#define HPAGE_RESV_OWNER (1UL << 0) 8428c2ecf20Sopenharmony_ci#define HPAGE_RESV_UNMAPPED (1UL << 1) 8438c2ecf20Sopenharmony_ci#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) 8448c2ecf20Sopenharmony_ci 8458c2ecf20Sopenharmony_ci/* 8468c2ecf20Sopenharmony_ci * These helpers are used to track how many pages are reserved for 8478c2ecf20Sopenharmony_ci * faults in a MAP_PRIVATE mapping. Only the process that called mmap() 8488c2ecf20Sopenharmony_ci * is guaranteed to have their future faults succeed. 8498c2ecf20Sopenharmony_ci * 8508c2ecf20Sopenharmony_ci * With the exception of reset_vma_resv_huge_pages() which is called at fork(), 8518c2ecf20Sopenharmony_ci * the reserve counters are updated with the hugetlb_lock held. It is safe 8528c2ecf20Sopenharmony_ci * to reset the VMA at fork() time as it is not in use yet and there is no 8538c2ecf20Sopenharmony_ci * chance of the global counters getting corrupted as a result of the values. 8548c2ecf20Sopenharmony_ci * 8558c2ecf20Sopenharmony_ci * The private mapping reservation is represented in a subtly different 8568c2ecf20Sopenharmony_ci * manner to a shared mapping. A shared mapping has a region map associated 8578c2ecf20Sopenharmony_ci * with the underlying file, this region map represents the backing file 8588c2ecf20Sopenharmony_ci * pages which have ever had a reservation assigned which this persists even 8598c2ecf20Sopenharmony_ci * after the page is instantiated. A private mapping has a region map 8608c2ecf20Sopenharmony_ci * associated with the original mmap which is attached to all VMAs which 8618c2ecf20Sopenharmony_ci * reference it, this region map represents those offsets which have consumed 8628c2ecf20Sopenharmony_ci * reservation ie. where pages have been instantiated. 8638c2ecf20Sopenharmony_ci */ 8648c2ecf20Sopenharmony_cistatic unsigned long get_vma_private_data(struct vm_area_struct *vma) 8658c2ecf20Sopenharmony_ci{ 8668c2ecf20Sopenharmony_ci return (unsigned long)vma->vm_private_data; 8678c2ecf20Sopenharmony_ci} 8688c2ecf20Sopenharmony_ci 8698c2ecf20Sopenharmony_cistatic void set_vma_private_data(struct vm_area_struct *vma, 8708c2ecf20Sopenharmony_ci unsigned long value) 8718c2ecf20Sopenharmony_ci{ 8728c2ecf20Sopenharmony_ci vma->vm_private_data = (void *)value; 8738c2ecf20Sopenharmony_ci} 8748c2ecf20Sopenharmony_ci 8758c2ecf20Sopenharmony_cistatic void 8768c2ecf20Sopenharmony_ciresv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map, 8778c2ecf20Sopenharmony_ci struct hugetlb_cgroup *h_cg, 8788c2ecf20Sopenharmony_ci struct hstate *h) 8798c2ecf20Sopenharmony_ci{ 8808c2ecf20Sopenharmony_ci#ifdef CONFIG_CGROUP_HUGETLB 8818c2ecf20Sopenharmony_ci if (!h_cg || !h) { 8828c2ecf20Sopenharmony_ci resv_map->reservation_counter = NULL; 8838c2ecf20Sopenharmony_ci resv_map->pages_per_hpage = 0; 8848c2ecf20Sopenharmony_ci resv_map->css = NULL; 8858c2ecf20Sopenharmony_ci } else { 8868c2ecf20Sopenharmony_ci resv_map->reservation_counter = 8878c2ecf20Sopenharmony_ci &h_cg->rsvd_hugepage[hstate_index(h)]; 8888c2ecf20Sopenharmony_ci resv_map->pages_per_hpage = pages_per_huge_page(h); 8898c2ecf20Sopenharmony_ci resv_map->css = &h_cg->css; 8908c2ecf20Sopenharmony_ci } 8918c2ecf20Sopenharmony_ci#endif 8928c2ecf20Sopenharmony_ci} 8938c2ecf20Sopenharmony_ci 8948c2ecf20Sopenharmony_cistruct resv_map *resv_map_alloc(void) 8958c2ecf20Sopenharmony_ci{ 8968c2ecf20Sopenharmony_ci struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); 8978c2ecf20Sopenharmony_ci struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); 8988c2ecf20Sopenharmony_ci 8998c2ecf20Sopenharmony_ci if (!resv_map || !rg) { 9008c2ecf20Sopenharmony_ci kfree(resv_map); 9018c2ecf20Sopenharmony_ci kfree(rg); 9028c2ecf20Sopenharmony_ci return NULL; 9038c2ecf20Sopenharmony_ci } 9048c2ecf20Sopenharmony_ci 9058c2ecf20Sopenharmony_ci kref_init(&resv_map->refs); 9068c2ecf20Sopenharmony_ci spin_lock_init(&resv_map->lock); 9078c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&resv_map->regions); 9088c2ecf20Sopenharmony_ci 9098c2ecf20Sopenharmony_ci resv_map->adds_in_progress = 0; 9108c2ecf20Sopenharmony_ci /* 9118c2ecf20Sopenharmony_ci * Initialize these to 0. On shared mappings, 0's here indicate these 9128c2ecf20Sopenharmony_ci * fields don't do cgroup accounting. On private mappings, these will be 9138c2ecf20Sopenharmony_ci * re-initialized to the proper values, to indicate that hugetlb cgroup 9148c2ecf20Sopenharmony_ci * reservations are to be un-charged from here. 9158c2ecf20Sopenharmony_ci */ 9168c2ecf20Sopenharmony_ci resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL); 9178c2ecf20Sopenharmony_ci 9188c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&resv_map->region_cache); 9198c2ecf20Sopenharmony_ci list_add(&rg->link, &resv_map->region_cache); 9208c2ecf20Sopenharmony_ci resv_map->region_cache_count = 1; 9218c2ecf20Sopenharmony_ci 9228c2ecf20Sopenharmony_ci return resv_map; 9238c2ecf20Sopenharmony_ci} 9248c2ecf20Sopenharmony_ci 9258c2ecf20Sopenharmony_civoid resv_map_release(struct kref *ref) 9268c2ecf20Sopenharmony_ci{ 9278c2ecf20Sopenharmony_ci struct resv_map *resv_map = container_of(ref, struct resv_map, refs); 9288c2ecf20Sopenharmony_ci struct list_head *head = &resv_map->region_cache; 9298c2ecf20Sopenharmony_ci struct file_region *rg, *trg; 9308c2ecf20Sopenharmony_ci 9318c2ecf20Sopenharmony_ci /* Clear out any active regions before we release the map. */ 9328c2ecf20Sopenharmony_ci region_del(resv_map, 0, LONG_MAX); 9338c2ecf20Sopenharmony_ci 9348c2ecf20Sopenharmony_ci /* ... and any entries left in the cache */ 9358c2ecf20Sopenharmony_ci list_for_each_entry_safe(rg, trg, head, link) { 9368c2ecf20Sopenharmony_ci list_del(&rg->link); 9378c2ecf20Sopenharmony_ci kfree(rg); 9388c2ecf20Sopenharmony_ci } 9398c2ecf20Sopenharmony_ci 9408c2ecf20Sopenharmony_ci VM_BUG_ON(resv_map->adds_in_progress); 9418c2ecf20Sopenharmony_ci 9428c2ecf20Sopenharmony_ci kfree(resv_map); 9438c2ecf20Sopenharmony_ci} 9448c2ecf20Sopenharmony_ci 9458c2ecf20Sopenharmony_cistatic inline struct resv_map *inode_resv_map(struct inode *inode) 9468c2ecf20Sopenharmony_ci{ 9478c2ecf20Sopenharmony_ci /* 9488c2ecf20Sopenharmony_ci * At inode evict time, i_mapping may not point to the original 9498c2ecf20Sopenharmony_ci * address space within the inode. This original address space 9508c2ecf20Sopenharmony_ci * contains the pointer to the resv_map. So, always use the 9518c2ecf20Sopenharmony_ci * address space embedded within the inode. 9528c2ecf20Sopenharmony_ci * The VERY common case is inode->mapping == &inode->i_data but, 9538c2ecf20Sopenharmony_ci * this may not be true for device special inodes. 9548c2ecf20Sopenharmony_ci */ 9558c2ecf20Sopenharmony_ci return (struct resv_map *)(&inode->i_data)->private_data; 9568c2ecf20Sopenharmony_ci} 9578c2ecf20Sopenharmony_ci 9588c2ecf20Sopenharmony_cistatic struct resv_map *vma_resv_map(struct vm_area_struct *vma) 9598c2ecf20Sopenharmony_ci{ 9608c2ecf20Sopenharmony_ci VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 9618c2ecf20Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE) { 9628c2ecf20Sopenharmony_ci struct address_space *mapping = vma->vm_file->f_mapping; 9638c2ecf20Sopenharmony_ci struct inode *inode = mapping->host; 9648c2ecf20Sopenharmony_ci 9658c2ecf20Sopenharmony_ci return inode_resv_map(inode); 9668c2ecf20Sopenharmony_ci 9678c2ecf20Sopenharmony_ci } else { 9688c2ecf20Sopenharmony_ci return (struct resv_map *)(get_vma_private_data(vma) & 9698c2ecf20Sopenharmony_ci ~HPAGE_RESV_MASK); 9708c2ecf20Sopenharmony_ci } 9718c2ecf20Sopenharmony_ci} 9728c2ecf20Sopenharmony_ci 9738c2ecf20Sopenharmony_cistatic void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 9748c2ecf20Sopenharmony_ci{ 9758c2ecf20Sopenharmony_ci VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 9768c2ecf20Sopenharmony_ci VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 9778c2ecf20Sopenharmony_ci 9788c2ecf20Sopenharmony_ci set_vma_private_data(vma, (get_vma_private_data(vma) & 9798c2ecf20Sopenharmony_ci HPAGE_RESV_MASK) | (unsigned long)map); 9808c2ecf20Sopenharmony_ci} 9818c2ecf20Sopenharmony_ci 9828c2ecf20Sopenharmony_cistatic void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 9838c2ecf20Sopenharmony_ci{ 9848c2ecf20Sopenharmony_ci VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 9858c2ecf20Sopenharmony_ci VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 9868c2ecf20Sopenharmony_ci 9878c2ecf20Sopenharmony_ci set_vma_private_data(vma, get_vma_private_data(vma) | flags); 9888c2ecf20Sopenharmony_ci} 9898c2ecf20Sopenharmony_ci 9908c2ecf20Sopenharmony_cistatic int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) 9918c2ecf20Sopenharmony_ci{ 9928c2ecf20Sopenharmony_ci VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 9938c2ecf20Sopenharmony_ci 9948c2ecf20Sopenharmony_ci return (get_vma_private_data(vma) & flag) != 0; 9958c2ecf20Sopenharmony_ci} 9968c2ecf20Sopenharmony_ci 9978c2ecf20Sopenharmony_ci/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ 9988c2ecf20Sopenharmony_civoid reset_vma_resv_huge_pages(struct vm_area_struct *vma) 9998c2ecf20Sopenharmony_ci{ 10008c2ecf20Sopenharmony_ci VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 10018c2ecf20Sopenharmony_ci if (!(vma->vm_flags & VM_MAYSHARE)) 10028c2ecf20Sopenharmony_ci vma->vm_private_data = (void *)0; 10038c2ecf20Sopenharmony_ci} 10048c2ecf20Sopenharmony_ci 10058c2ecf20Sopenharmony_ci/* Returns true if the VMA has associated reserve pages */ 10068c2ecf20Sopenharmony_cistatic bool vma_has_reserves(struct vm_area_struct *vma, long chg) 10078c2ecf20Sopenharmony_ci{ 10088c2ecf20Sopenharmony_ci if (vma->vm_flags & VM_NORESERVE) { 10098c2ecf20Sopenharmony_ci /* 10108c2ecf20Sopenharmony_ci * This address is already reserved by other process(chg == 0), 10118c2ecf20Sopenharmony_ci * so, we should decrement reserved count. Without decrementing, 10128c2ecf20Sopenharmony_ci * reserve count remains after releasing inode, because this 10138c2ecf20Sopenharmony_ci * allocated page will go into page cache and is regarded as 10148c2ecf20Sopenharmony_ci * coming from reserved pool in releasing step. Currently, we 10158c2ecf20Sopenharmony_ci * don't have any other solution to deal with this situation 10168c2ecf20Sopenharmony_ci * properly, so add work-around here. 10178c2ecf20Sopenharmony_ci */ 10188c2ecf20Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE && chg == 0) 10198c2ecf20Sopenharmony_ci return true; 10208c2ecf20Sopenharmony_ci else 10218c2ecf20Sopenharmony_ci return false; 10228c2ecf20Sopenharmony_ci } 10238c2ecf20Sopenharmony_ci 10248c2ecf20Sopenharmony_ci /* Shared mappings always use reserves */ 10258c2ecf20Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE) { 10268c2ecf20Sopenharmony_ci /* 10278c2ecf20Sopenharmony_ci * We know VM_NORESERVE is not set. Therefore, there SHOULD 10288c2ecf20Sopenharmony_ci * be a region map for all pages. The only situation where 10298c2ecf20Sopenharmony_ci * there is no region map is if a hole was punched via 10308c2ecf20Sopenharmony_ci * fallocate. In this case, there really are no reserves to 10318c2ecf20Sopenharmony_ci * use. This situation is indicated if chg != 0. 10328c2ecf20Sopenharmony_ci */ 10338c2ecf20Sopenharmony_ci if (chg) 10348c2ecf20Sopenharmony_ci return false; 10358c2ecf20Sopenharmony_ci else 10368c2ecf20Sopenharmony_ci return true; 10378c2ecf20Sopenharmony_ci } 10388c2ecf20Sopenharmony_ci 10398c2ecf20Sopenharmony_ci /* 10408c2ecf20Sopenharmony_ci * Only the process that called mmap() has reserves for 10418c2ecf20Sopenharmony_ci * private mappings. 10428c2ecf20Sopenharmony_ci */ 10438c2ecf20Sopenharmony_ci if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 10448c2ecf20Sopenharmony_ci /* 10458c2ecf20Sopenharmony_ci * Like the shared case above, a hole punch or truncate 10468c2ecf20Sopenharmony_ci * could have been performed on the private mapping. 10478c2ecf20Sopenharmony_ci * Examine the value of chg to determine if reserves 10488c2ecf20Sopenharmony_ci * actually exist or were previously consumed. 10498c2ecf20Sopenharmony_ci * Very Subtle - The value of chg comes from a previous 10508c2ecf20Sopenharmony_ci * call to vma_needs_reserves(). The reserve map for 10518c2ecf20Sopenharmony_ci * private mappings has different (opposite) semantics 10528c2ecf20Sopenharmony_ci * than that of shared mappings. vma_needs_reserves() 10538c2ecf20Sopenharmony_ci * has already taken this difference in semantics into 10548c2ecf20Sopenharmony_ci * account. Therefore, the meaning of chg is the same 10558c2ecf20Sopenharmony_ci * as in the shared case above. Code could easily be 10568c2ecf20Sopenharmony_ci * combined, but keeping it separate draws attention to 10578c2ecf20Sopenharmony_ci * subtle differences. 10588c2ecf20Sopenharmony_ci */ 10598c2ecf20Sopenharmony_ci if (chg) 10608c2ecf20Sopenharmony_ci return false; 10618c2ecf20Sopenharmony_ci else 10628c2ecf20Sopenharmony_ci return true; 10638c2ecf20Sopenharmony_ci } 10648c2ecf20Sopenharmony_ci 10658c2ecf20Sopenharmony_ci return false; 10668c2ecf20Sopenharmony_ci} 10678c2ecf20Sopenharmony_ci 10688c2ecf20Sopenharmony_cistatic void enqueue_huge_page(struct hstate *h, struct page *page) 10698c2ecf20Sopenharmony_ci{ 10708c2ecf20Sopenharmony_ci int nid = page_to_nid(page); 10718c2ecf20Sopenharmony_ci list_move(&page->lru, &h->hugepage_freelists[nid]); 10728c2ecf20Sopenharmony_ci h->free_huge_pages++; 10738c2ecf20Sopenharmony_ci h->free_huge_pages_node[nid]++; 10748c2ecf20Sopenharmony_ci SetPageHugeFreed(page); 10758c2ecf20Sopenharmony_ci} 10768c2ecf20Sopenharmony_ci 10778c2ecf20Sopenharmony_cistatic struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) 10788c2ecf20Sopenharmony_ci{ 10798c2ecf20Sopenharmony_ci struct page *page; 10808c2ecf20Sopenharmony_ci bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA); 10818c2ecf20Sopenharmony_ci 10828c2ecf20Sopenharmony_ci list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { 10838c2ecf20Sopenharmony_ci if (nocma && is_migrate_cma_page(page)) 10848c2ecf20Sopenharmony_ci continue; 10858c2ecf20Sopenharmony_ci 10868c2ecf20Sopenharmony_ci if (PageHWPoison(page)) 10878c2ecf20Sopenharmony_ci continue; 10888c2ecf20Sopenharmony_ci 10898c2ecf20Sopenharmony_ci list_move(&page->lru, &h->hugepage_activelist); 10908c2ecf20Sopenharmony_ci set_page_refcounted(page); 10918c2ecf20Sopenharmony_ci ClearPageHugeFreed(page); 10928c2ecf20Sopenharmony_ci h->free_huge_pages--; 10938c2ecf20Sopenharmony_ci h->free_huge_pages_node[nid]--; 10948c2ecf20Sopenharmony_ci return page; 10958c2ecf20Sopenharmony_ci } 10968c2ecf20Sopenharmony_ci 10978c2ecf20Sopenharmony_ci return NULL; 10988c2ecf20Sopenharmony_ci} 10998c2ecf20Sopenharmony_ci 11008c2ecf20Sopenharmony_cistatic struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, 11018c2ecf20Sopenharmony_ci nodemask_t *nmask) 11028c2ecf20Sopenharmony_ci{ 11038c2ecf20Sopenharmony_ci unsigned int cpuset_mems_cookie; 11048c2ecf20Sopenharmony_ci struct zonelist *zonelist; 11058c2ecf20Sopenharmony_ci struct zone *zone; 11068c2ecf20Sopenharmony_ci struct zoneref *z; 11078c2ecf20Sopenharmony_ci int node = NUMA_NO_NODE; 11088c2ecf20Sopenharmony_ci 11098c2ecf20Sopenharmony_ci zonelist = node_zonelist(nid, gfp_mask); 11108c2ecf20Sopenharmony_ci 11118c2ecf20Sopenharmony_ciretry_cpuset: 11128c2ecf20Sopenharmony_ci cpuset_mems_cookie = read_mems_allowed_begin(); 11138c2ecf20Sopenharmony_ci for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { 11148c2ecf20Sopenharmony_ci struct page *page; 11158c2ecf20Sopenharmony_ci 11168c2ecf20Sopenharmony_ci if (!cpuset_zone_allowed(zone, gfp_mask)) 11178c2ecf20Sopenharmony_ci continue; 11188c2ecf20Sopenharmony_ci /* 11198c2ecf20Sopenharmony_ci * no need to ask again on the same node. Pool is node rather than 11208c2ecf20Sopenharmony_ci * zone aware 11218c2ecf20Sopenharmony_ci */ 11228c2ecf20Sopenharmony_ci if (zone_to_nid(zone) == node) 11238c2ecf20Sopenharmony_ci continue; 11248c2ecf20Sopenharmony_ci node = zone_to_nid(zone); 11258c2ecf20Sopenharmony_ci 11268c2ecf20Sopenharmony_ci page = dequeue_huge_page_node_exact(h, node); 11278c2ecf20Sopenharmony_ci if (page) 11288c2ecf20Sopenharmony_ci return page; 11298c2ecf20Sopenharmony_ci } 11308c2ecf20Sopenharmony_ci if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) 11318c2ecf20Sopenharmony_ci goto retry_cpuset; 11328c2ecf20Sopenharmony_ci 11338c2ecf20Sopenharmony_ci return NULL; 11348c2ecf20Sopenharmony_ci} 11358c2ecf20Sopenharmony_ci 11368c2ecf20Sopenharmony_cistatic struct page *dequeue_huge_page_vma(struct hstate *h, 11378c2ecf20Sopenharmony_ci struct vm_area_struct *vma, 11388c2ecf20Sopenharmony_ci unsigned long address, int avoid_reserve, 11398c2ecf20Sopenharmony_ci long chg) 11408c2ecf20Sopenharmony_ci{ 11418c2ecf20Sopenharmony_ci struct page *page; 11428c2ecf20Sopenharmony_ci struct mempolicy *mpol; 11438c2ecf20Sopenharmony_ci gfp_t gfp_mask; 11448c2ecf20Sopenharmony_ci nodemask_t *nodemask; 11458c2ecf20Sopenharmony_ci int nid; 11468c2ecf20Sopenharmony_ci 11478c2ecf20Sopenharmony_ci /* 11488c2ecf20Sopenharmony_ci * A child process with MAP_PRIVATE mappings created by their parent 11498c2ecf20Sopenharmony_ci * have no page reserves. This check ensures that reservations are 11508c2ecf20Sopenharmony_ci * not "stolen". The child may still get SIGKILLed 11518c2ecf20Sopenharmony_ci */ 11528c2ecf20Sopenharmony_ci if (!vma_has_reserves(vma, chg) && 11538c2ecf20Sopenharmony_ci h->free_huge_pages - h->resv_huge_pages == 0) 11548c2ecf20Sopenharmony_ci goto err; 11558c2ecf20Sopenharmony_ci 11568c2ecf20Sopenharmony_ci /* If reserves cannot be used, ensure enough pages are in the pool */ 11578c2ecf20Sopenharmony_ci if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 11588c2ecf20Sopenharmony_ci goto err; 11598c2ecf20Sopenharmony_ci 11608c2ecf20Sopenharmony_ci gfp_mask = htlb_alloc_mask(h); 11618c2ecf20Sopenharmony_ci nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); 11628c2ecf20Sopenharmony_ci page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); 11638c2ecf20Sopenharmony_ci if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { 11648c2ecf20Sopenharmony_ci SetPagePrivate(page); 11658c2ecf20Sopenharmony_ci h->resv_huge_pages--; 11668c2ecf20Sopenharmony_ci } 11678c2ecf20Sopenharmony_ci 11688c2ecf20Sopenharmony_ci mpol_cond_put(mpol); 11698c2ecf20Sopenharmony_ci return page; 11708c2ecf20Sopenharmony_ci 11718c2ecf20Sopenharmony_cierr: 11728c2ecf20Sopenharmony_ci return NULL; 11738c2ecf20Sopenharmony_ci} 11748c2ecf20Sopenharmony_ci 11758c2ecf20Sopenharmony_ci/* 11768c2ecf20Sopenharmony_ci * common helper functions for hstate_next_node_to_{alloc|free}. 11778c2ecf20Sopenharmony_ci * We may have allocated or freed a huge page based on a different 11788c2ecf20Sopenharmony_ci * nodes_allowed previously, so h->next_node_to_{alloc|free} might 11798c2ecf20Sopenharmony_ci * be outside of *nodes_allowed. Ensure that we use an allowed 11808c2ecf20Sopenharmony_ci * node for alloc or free. 11818c2ecf20Sopenharmony_ci */ 11828c2ecf20Sopenharmony_cistatic int next_node_allowed(int nid, nodemask_t *nodes_allowed) 11838c2ecf20Sopenharmony_ci{ 11848c2ecf20Sopenharmony_ci nid = next_node_in(nid, *nodes_allowed); 11858c2ecf20Sopenharmony_ci VM_BUG_ON(nid >= MAX_NUMNODES); 11868c2ecf20Sopenharmony_ci 11878c2ecf20Sopenharmony_ci return nid; 11888c2ecf20Sopenharmony_ci} 11898c2ecf20Sopenharmony_ci 11908c2ecf20Sopenharmony_cistatic int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) 11918c2ecf20Sopenharmony_ci{ 11928c2ecf20Sopenharmony_ci if (!node_isset(nid, *nodes_allowed)) 11938c2ecf20Sopenharmony_ci nid = next_node_allowed(nid, nodes_allowed); 11948c2ecf20Sopenharmony_ci return nid; 11958c2ecf20Sopenharmony_ci} 11968c2ecf20Sopenharmony_ci 11978c2ecf20Sopenharmony_ci/* 11988c2ecf20Sopenharmony_ci * returns the previously saved node ["this node"] from which to 11998c2ecf20Sopenharmony_ci * allocate a persistent huge page for the pool and advance the 12008c2ecf20Sopenharmony_ci * next node from which to allocate, handling wrap at end of node 12018c2ecf20Sopenharmony_ci * mask. 12028c2ecf20Sopenharmony_ci */ 12038c2ecf20Sopenharmony_cistatic int hstate_next_node_to_alloc(struct hstate *h, 12048c2ecf20Sopenharmony_ci nodemask_t *nodes_allowed) 12058c2ecf20Sopenharmony_ci{ 12068c2ecf20Sopenharmony_ci int nid; 12078c2ecf20Sopenharmony_ci 12088c2ecf20Sopenharmony_ci VM_BUG_ON(!nodes_allowed); 12098c2ecf20Sopenharmony_ci 12108c2ecf20Sopenharmony_ci nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); 12118c2ecf20Sopenharmony_ci h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); 12128c2ecf20Sopenharmony_ci 12138c2ecf20Sopenharmony_ci return nid; 12148c2ecf20Sopenharmony_ci} 12158c2ecf20Sopenharmony_ci 12168c2ecf20Sopenharmony_ci/* 12178c2ecf20Sopenharmony_ci * helper for free_pool_huge_page() - return the previously saved 12188c2ecf20Sopenharmony_ci * node ["this node"] from which to free a huge page. Advance the 12198c2ecf20Sopenharmony_ci * next node id whether or not we find a free huge page to free so 12208c2ecf20Sopenharmony_ci * that the next attempt to free addresses the next node. 12218c2ecf20Sopenharmony_ci */ 12228c2ecf20Sopenharmony_cistatic int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) 12238c2ecf20Sopenharmony_ci{ 12248c2ecf20Sopenharmony_ci int nid; 12258c2ecf20Sopenharmony_ci 12268c2ecf20Sopenharmony_ci VM_BUG_ON(!nodes_allowed); 12278c2ecf20Sopenharmony_ci 12288c2ecf20Sopenharmony_ci nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); 12298c2ecf20Sopenharmony_ci h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); 12308c2ecf20Sopenharmony_ci 12318c2ecf20Sopenharmony_ci return nid; 12328c2ecf20Sopenharmony_ci} 12338c2ecf20Sopenharmony_ci 12348c2ecf20Sopenharmony_ci#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ 12358c2ecf20Sopenharmony_ci for (nr_nodes = nodes_weight(*mask); \ 12368c2ecf20Sopenharmony_ci nr_nodes > 0 && \ 12378c2ecf20Sopenharmony_ci ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ 12388c2ecf20Sopenharmony_ci nr_nodes--) 12398c2ecf20Sopenharmony_ci 12408c2ecf20Sopenharmony_ci#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ 12418c2ecf20Sopenharmony_ci for (nr_nodes = nodes_weight(*mask); \ 12428c2ecf20Sopenharmony_ci nr_nodes > 0 && \ 12438c2ecf20Sopenharmony_ci ((node = hstate_next_node_to_free(hs, mask)) || 1); \ 12448c2ecf20Sopenharmony_ci nr_nodes--) 12458c2ecf20Sopenharmony_ci 12468c2ecf20Sopenharmony_ci#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE 12478c2ecf20Sopenharmony_cistatic void destroy_compound_gigantic_page(struct page *page, 12488c2ecf20Sopenharmony_ci unsigned int order) 12498c2ecf20Sopenharmony_ci{ 12508c2ecf20Sopenharmony_ci int i; 12518c2ecf20Sopenharmony_ci int nr_pages = 1 << order; 12528c2ecf20Sopenharmony_ci struct page *p = page + 1; 12538c2ecf20Sopenharmony_ci 12548c2ecf20Sopenharmony_ci atomic_set(compound_mapcount_ptr(page), 0); 12558c2ecf20Sopenharmony_ci atomic_set(compound_pincount_ptr(page), 0); 12568c2ecf20Sopenharmony_ci 12578c2ecf20Sopenharmony_ci for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 12588c2ecf20Sopenharmony_ci clear_compound_head(p); 12598c2ecf20Sopenharmony_ci set_page_refcounted(p); 12608c2ecf20Sopenharmony_ci } 12618c2ecf20Sopenharmony_ci 12628c2ecf20Sopenharmony_ci set_compound_order(page, 0); 12638c2ecf20Sopenharmony_ci page[1].compound_nr = 0; 12648c2ecf20Sopenharmony_ci __ClearPageHead(page); 12658c2ecf20Sopenharmony_ci} 12668c2ecf20Sopenharmony_ci 12678c2ecf20Sopenharmony_cistatic void free_gigantic_page(struct page *page, unsigned int order) 12688c2ecf20Sopenharmony_ci{ 12698c2ecf20Sopenharmony_ci /* 12708c2ecf20Sopenharmony_ci * If the page isn't allocated using the cma allocator, 12718c2ecf20Sopenharmony_ci * cma_release() returns false. 12728c2ecf20Sopenharmony_ci */ 12738c2ecf20Sopenharmony_ci#ifdef CONFIG_CMA 12748c2ecf20Sopenharmony_ci if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order)) 12758c2ecf20Sopenharmony_ci return; 12768c2ecf20Sopenharmony_ci#endif 12778c2ecf20Sopenharmony_ci 12788c2ecf20Sopenharmony_ci free_contig_range(page_to_pfn(page), 1 << order); 12798c2ecf20Sopenharmony_ci} 12808c2ecf20Sopenharmony_ci 12818c2ecf20Sopenharmony_ci#ifdef CONFIG_CONTIG_ALLOC 12828c2ecf20Sopenharmony_cistatic struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, 12838c2ecf20Sopenharmony_ci int nid, nodemask_t *nodemask) 12848c2ecf20Sopenharmony_ci{ 12858c2ecf20Sopenharmony_ci unsigned long nr_pages = 1UL << huge_page_order(h); 12868c2ecf20Sopenharmony_ci if (nid == NUMA_NO_NODE) 12878c2ecf20Sopenharmony_ci nid = numa_mem_id(); 12888c2ecf20Sopenharmony_ci 12898c2ecf20Sopenharmony_ci#ifdef CONFIG_CMA 12908c2ecf20Sopenharmony_ci { 12918c2ecf20Sopenharmony_ci struct page *page; 12928c2ecf20Sopenharmony_ci int node; 12938c2ecf20Sopenharmony_ci 12948c2ecf20Sopenharmony_ci if (hugetlb_cma[nid]) { 12958c2ecf20Sopenharmony_ci page = cma_alloc(hugetlb_cma[nid], nr_pages, 12968c2ecf20Sopenharmony_ci huge_page_order(h), true); 12978c2ecf20Sopenharmony_ci if (page) 12988c2ecf20Sopenharmony_ci return page; 12998c2ecf20Sopenharmony_ci } 13008c2ecf20Sopenharmony_ci 13018c2ecf20Sopenharmony_ci if (!(gfp_mask & __GFP_THISNODE)) { 13028c2ecf20Sopenharmony_ci for_each_node_mask(node, *nodemask) { 13038c2ecf20Sopenharmony_ci if (node == nid || !hugetlb_cma[node]) 13048c2ecf20Sopenharmony_ci continue; 13058c2ecf20Sopenharmony_ci 13068c2ecf20Sopenharmony_ci page = cma_alloc(hugetlb_cma[node], nr_pages, 13078c2ecf20Sopenharmony_ci huge_page_order(h), true); 13088c2ecf20Sopenharmony_ci if (page) 13098c2ecf20Sopenharmony_ci return page; 13108c2ecf20Sopenharmony_ci } 13118c2ecf20Sopenharmony_ci } 13128c2ecf20Sopenharmony_ci } 13138c2ecf20Sopenharmony_ci#endif 13148c2ecf20Sopenharmony_ci 13158c2ecf20Sopenharmony_ci return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); 13168c2ecf20Sopenharmony_ci} 13178c2ecf20Sopenharmony_ci 13188c2ecf20Sopenharmony_ci#else /* !CONFIG_CONTIG_ALLOC */ 13198c2ecf20Sopenharmony_cistatic struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, 13208c2ecf20Sopenharmony_ci int nid, nodemask_t *nodemask) 13218c2ecf20Sopenharmony_ci{ 13228c2ecf20Sopenharmony_ci return NULL; 13238c2ecf20Sopenharmony_ci} 13248c2ecf20Sopenharmony_ci#endif /* CONFIG_CONTIG_ALLOC */ 13258c2ecf20Sopenharmony_ci 13268c2ecf20Sopenharmony_ci#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ 13278c2ecf20Sopenharmony_cistatic struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, 13288c2ecf20Sopenharmony_ci int nid, nodemask_t *nodemask) 13298c2ecf20Sopenharmony_ci{ 13308c2ecf20Sopenharmony_ci return NULL; 13318c2ecf20Sopenharmony_ci} 13328c2ecf20Sopenharmony_cistatic inline void free_gigantic_page(struct page *page, unsigned int order) { } 13338c2ecf20Sopenharmony_cistatic inline void destroy_compound_gigantic_page(struct page *page, 13348c2ecf20Sopenharmony_ci unsigned int order) { } 13358c2ecf20Sopenharmony_ci#endif 13368c2ecf20Sopenharmony_ci 13378c2ecf20Sopenharmony_cistatic void update_and_free_page(struct hstate *h, struct page *page) 13388c2ecf20Sopenharmony_ci{ 13398c2ecf20Sopenharmony_ci int i; 13408c2ecf20Sopenharmony_ci struct page *subpage = page; 13418c2ecf20Sopenharmony_ci 13428c2ecf20Sopenharmony_ci if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 13438c2ecf20Sopenharmony_ci return; 13448c2ecf20Sopenharmony_ci 13458c2ecf20Sopenharmony_ci h->nr_huge_pages--; 13468c2ecf20Sopenharmony_ci h->nr_huge_pages_node[page_to_nid(page)]--; 13478c2ecf20Sopenharmony_ci for (i = 0; i < pages_per_huge_page(h); 13488c2ecf20Sopenharmony_ci i++, subpage = mem_map_next(subpage, page, i)) { 13498c2ecf20Sopenharmony_ci subpage->flags &= ~(1 << PG_locked | 1 << PG_error | 13508c2ecf20Sopenharmony_ci 1 << PG_referenced | 1 << PG_dirty | 13518c2ecf20Sopenharmony_ci 1 << PG_active | 1 << PG_private | 13528c2ecf20Sopenharmony_ci 1 << PG_writeback); 13538c2ecf20Sopenharmony_ci } 13548c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); 13558c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); 13568c2ecf20Sopenharmony_ci /* 13578c2ecf20Sopenharmony_ci * Very subtle 13588c2ecf20Sopenharmony_ci * 13598c2ecf20Sopenharmony_ci * For non-gigantic pages set the destructor to the normal compound 13608c2ecf20Sopenharmony_ci * page dtor. This is needed in case someone takes an additional 13618c2ecf20Sopenharmony_ci * temporary ref to the page, and freeing is delayed until they drop 13628c2ecf20Sopenharmony_ci * their reference. 13638c2ecf20Sopenharmony_ci * 13648c2ecf20Sopenharmony_ci * For gigantic pages set the destructor to the null dtor. This 13658c2ecf20Sopenharmony_ci * destructor will never be called. Before freeing the gigantic 13668c2ecf20Sopenharmony_ci * page destroy_compound_gigantic_page will turn the compound page 13678c2ecf20Sopenharmony_ci * into a simple group of pages. After this the destructor does not 13688c2ecf20Sopenharmony_ci * apply. 13698c2ecf20Sopenharmony_ci * 13708c2ecf20Sopenharmony_ci * This handles the case where more than one ref is held when and 13718c2ecf20Sopenharmony_ci * after update_and_free_page is called. 13728c2ecf20Sopenharmony_ci */ 13738c2ecf20Sopenharmony_ci set_page_refcounted(page); 13748c2ecf20Sopenharmony_ci if (hstate_is_gigantic(h)) { 13758c2ecf20Sopenharmony_ci set_compound_page_dtor(page, NULL_COMPOUND_DTOR); 13768c2ecf20Sopenharmony_ci /* 13778c2ecf20Sopenharmony_ci * Temporarily drop the hugetlb_lock, because 13788c2ecf20Sopenharmony_ci * we might block in free_gigantic_page(). 13798c2ecf20Sopenharmony_ci */ 13808c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 13818c2ecf20Sopenharmony_ci destroy_compound_gigantic_page(page, huge_page_order(h)); 13828c2ecf20Sopenharmony_ci free_gigantic_page(page, huge_page_order(h)); 13838c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 13848c2ecf20Sopenharmony_ci } else { 13858c2ecf20Sopenharmony_ci set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); 13868c2ecf20Sopenharmony_ci __free_pages(page, huge_page_order(h)); 13878c2ecf20Sopenharmony_ci } 13888c2ecf20Sopenharmony_ci} 13898c2ecf20Sopenharmony_ci 13908c2ecf20Sopenharmony_cistruct hstate *size_to_hstate(unsigned long size) 13918c2ecf20Sopenharmony_ci{ 13928c2ecf20Sopenharmony_ci struct hstate *h; 13938c2ecf20Sopenharmony_ci 13948c2ecf20Sopenharmony_ci for_each_hstate(h) { 13958c2ecf20Sopenharmony_ci if (huge_page_size(h) == size) 13968c2ecf20Sopenharmony_ci return h; 13978c2ecf20Sopenharmony_ci } 13988c2ecf20Sopenharmony_ci return NULL; 13998c2ecf20Sopenharmony_ci} 14008c2ecf20Sopenharmony_ci 14018c2ecf20Sopenharmony_ci/* 14028c2ecf20Sopenharmony_ci * Test to determine whether the hugepage is "active/in-use" (i.e. being linked 14038c2ecf20Sopenharmony_ci * to hstate->hugepage_activelist.) 14048c2ecf20Sopenharmony_ci * 14058c2ecf20Sopenharmony_ci * This function can be called for tail pages, but never returns true for them. 14068c2ecf20Sopenharmony_ci */ 14078c2ecf20Sopenharmony_cibool page_huge_active(struct page *page) 14088c2ecf20Sopenharmony_ci{ 14098c2ecf20Sopenharmony_ci return PageHeadHuge(page) && PagePrivate(&page[1]); 14108c2ecf20Sopenharmony_ci} 14118c2ecf20Sopenharmony_ci 14128c2ecf20Sopenharmony_ci/* never called for tail page */ 14138c2ecf20Sopenharmony_civoid set_page_huge_active(struct page *page) 14148c2ecf20Sopenharmony_ci{ 14158c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(!PageHeadHuge(page), page); 14168c2ecf20Sopenharmony_ci SetPagePrivate(&page[1]); 14178c2ecf20Sopenharmony_ci} 14188c2ecf20Sopenharmony_ci 14198c2ecf20Sopenharmony_cistatic void clear_page_huge_active(struct page *page) 14208c2ecf20Sopenharmony_ci{ 14218c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(!PageHeadHuge(page), page); 14228c2ecf20Sopenharmony_ci ClearPagePrivate(&page[1]); 14238c2ecf20Sopenharmony_ci} 14248c2ecf20Sopenharmony_ci 14258c2ecf20Sopenharmony_ci/* 14268c2ecf20Sopenharmony_ci * Internal hugetlb specific page flag. Do not use outside of the hugetlb 14278c2ecf20Sopenharmony_ci * code 14288c2ecf20Sopenharmony_ci */ 14298c2ecf20Sopenharmony_cistatic inline bool PageHugeTemporary(struct page *page) 14308c2ecf20Sopenharmony_ci{ 14318c2ecf20Sopenharmony_ci if (!PageHuge(page)) 14328c2ecf20Sopenharmony_ci return false; 14338c2ecf20Sopenharmony_ci 14348c2ecf20Sopenharmony_ci return (unsigned long)page[2].mapping == -1U; 14358c2ecf20Sopenharmony_ci} 14368c2ecf20Sopenharmony_ci 14378c2ecf20Sopenharmony_cistatic inline void SetPageHugeTemporary(struct page *page) 14388c2ecf20Sopenharmony_ci{ 14398c2ecf20Sopenharmony_ci page[2].mapping = (void *)-1U; 14408c2ecf20Sopenharmony_ci} 14418c2ecf20Sopenharmony_ci 14428c2ecf20Sopenharmony_cistatic inline void ClearPageHugeTemporary(struct page *page) 14438c2ecf20Sopenharmony_ci{ 14448c2ecf20Sopenharmony_ci page[2].mapping = NULL; 14458c2ecf20Sopenharmony_ci} 14468c2ecf20Sopenharmony_ci 14478c2ecf20Sopenharmony_cistatic void __free_huge_page(struct page *page) 14488c2ecf20Sopenharmony_ci{ 14498c2ecf20Sopenharmony_ci /* 14508c2ecf20Sopenharmony_ci * Can't pass hstate in here because it is called from the 14518c2ecf20Sopenharmony_ci * compound page destructor. 14528c2ecf20Sopenharmony_ci */ 14538c2ecf20Sopenharmony_ci struct hstate *h = page_hstate(page); 14548c2ecf20Sopenharmony_ci int nid = page_to_nid(page); 14558c2ecf20Sopenharmony_ci struct hugepage_subpool *spool = 14568c2ecf20Sopenharmony_ci (struct hugepage_subpool *)page_private(page); 14578c2ecf20Sopenharmony_ci bool restore_reserve; 14588c2ecf20Sopenharmony_ci 14598c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(page_count(page), page); 14608c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(page_mapcount(page), page); 14618c2ecf20Sopenharmony_ci 14628c2ecf20Sopenharmony_ci set_page_private(page, 0); 14638c2ecf20Sopenharmony_ci page->mapping = NULL; 14648c2ecf20Sopenharmony_ci restore_reserve = PagePrivate(page); 14658c2ecf20Sopenharmony_ci ClearPagePrivate(page); 14668c2ecf20Sopenharmony_ci 14678c2ecf20Sopenharmony_ci /* 14688c2ecf20Sopenharmony_ci * If PagePrivate() was set on page, page allocation consumed a 14698c2ecf20Sopenharmony_ci * reservation. If the page was associated with a subpool, there 14708c2ecf20Sopenharmony_ci * would have been a page reserved in the subpool before allocation 14718c2ecf20Sopenharmony_ci * via hugepage_subpool_get_pages(). Since we are 'restoring' the 14728c2ecf20Sopenharmony_ci * reservtion, do not call hugepage_subpool_put_pages() as this will 14738c2ecf20Sopenharmony_ci * remove the reserved page from the subpool. 14748c2ecf20Sopenharmony_ci */ 14758c2ecf20Sopenharmony_ci if (!restore_reserve) { 14768c2ecf20Sopenharmony_ci /* 14778c2ecf20Sopenharmony_ci * A return code of zero implies that the subpool will be 14788c2ecf20Sopenharmony_ci * under its minimum size if the reservation is not restored 14798c2ecf20Sopenharmony_ci * after page is free. Therefore, force restore_reserve 14808c2ecf20Sopenharmony_ci * operation. 14818c2ecf20Sopenharmony_ci */ 14828c2ecf20Sopenharmony_ci if (hugepage_subpool_put_pages(spool, 1) == 0) 14838c2ecf20Sopenharmony_ci restore_reserve = true; 14848c2ecf20Sopenharmony_ci } 14858c2ecf20Sopenharmony_ci 14868c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 14878c2ecf20Sopenharmony_ci clear_page_huge_active(page); 14888c2ecf20Sopenharmony_ci hugetlb_cgroup_uncharge_page(hstate_index(h), 14898c2ecf20Sopenharmony_ci pages_per_huge_page(h), page); 14908c2ecf20Sopenharmony_ci hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), 14918c2ecf20Sopenharmony_ci pages_per_huge_page(h), page); 14928c2ecf20Sopenharmony_ci if (restore_reserve) 14938c2ecf20Sopenharmony_ci h->resv_huge_pages++; 14948c2ecf20Sopenharmony_ci 14958c2ecf20Sopenharmony_ci if (PageHugeTemporary(page)) { 14968c2ecf20Sopenharmony_ci list_del(&page->lru); 14978c2ecf20Sopenharmony_ci ClearPageHugeTemporary(page); 14988c2ecf20Sopenharmony_ci update_and_free_page(h, page); 14998c2ecf20Sopenharmony_ci } else if (h->surplus_huge_pages_node[nid]) { 15008c2ecf20Sopenharmony_ci /* remove the page from active list */ 15018c2ecf20Sopenharmony_ci list_del(&page->lru); 15028c2ecf20Sopenharmony_ci update_and_free_page(h, page); 15038c2ecf20Sopenharmony_ci h->surplus_huge_pages--; 15048c2ecf20Sopenharmony_ci h->surplus_huge_pages_node[nid]--; 15058c2ecf20Sopenharmony_ci } else { 15068c2ecf20Sopenharmony_ci arch_clear_hugepage_flags(page); 15078c2ecf20Sopenharmony_ci enqueue_huge_page(h, page); 15088c2ecf20Sopenharmony_ci } 15098c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 15108c2ecf20Sopenharmony_ci} 15118c2ecf20Sopenharmony_ci 15128c2ecf20Sopenharmony_ci/* 15138c2ecf20Sopenharmony_ci * As free_huge_page() can be called from a non-task context, we have 15148c2ecf20Sopenharmony_ci * to defer the actual freeing in a workqueue to prevent potential 15158c2ecf20Sopenharmony_ci * hugetlb_lock deadlock. 15168c2ecf20Sopenharmony_ci * 15178c2ecf20Sopenharmony_ci * free_hpage_workfn() locklessly retrieves the linked list of pages to 15188c2ecf20Sopenharmony_ci * be freed and frees them one-by-one. As the page->mapping pointer is 15198c2ecf20Sopenharmony_ci * going to be cleared in __free_huge_page() anyway, it is reused as the 15208c2ecf20Sopenharmony_ci * llist_node structure of a lockless linked list of huge pages to be freed. 15218c2ecf20Sopenharmony_ci */ 15228c2ecf20Sopenharmony_cistatic LLIST_HEAD(hpage_freelist); 15238c2ecf20Sopenharmony_ci 15248c2ecf20Sopenharmony_cistatic void free_hpage_workfn(struct work_struct *work) 15258c2ecf20Sopenharmony_ci{ 15268c2ecf20Sopenharmony_ci struct llist_node *node; 15278c2ecf20Sopenharmony_ci struct page *page; 15288c2ecf20Sopenharmony_ci 15298c2ecf20Sopenharmony_ci node = llist_del_all(&hpage_freelist); 15308c2ecf20Sopenharmony_ci 15318c2ecf20Sopenharmony_ci while (node) { 15328c2ecf20Sopenharmony_ci page = container_of((struct address_space **)node, 15338c2ecf20Sopenharmony_ci struct page, mapping); 15348c2ecf20Sopenharmony_ci node = node->next; 15358c2ecf20Sopenharmony_ci __free_huge_page(page); 15368c2ecf20Sopenharmony_ci } 15378c2ecf20Sopenharmony_ci} 15388c2ecf20Sopenharmony_cistatic DECLARE_WORK(free_hpage_work, free_hpage_workfn); 15398c2ecf20Sopenharmony_ci 15408c2ecf20Sopenharmony_civoid free_huge_page(struct page *page) 15418c2ecf20Sopenharmony_ci{ 15428c2ecf20Sopenharmony_ci /* 15438c2ecf20Sopenharmony_ci * Defer freeing if in non-task context to avoid hugetlb_lock deadlock. 15448c2ecf20Sopenharmony_ci */ 15458c2ecf20Sopenharmony_ci if (!in_task()) { 15468c2ecf20Sopenharmony_ci /* 15478c2ecf20Sopenharmony_ci * Only call schedule_work() if hpage_freelist is previously 15488c2ecf20Sopenharmony_ci * empty. Otherwise, schedule_work() had been called but the 15498c2ecf20Sopenharmony_ci * workfn hasn't retrieved the list yet. 15508c2ecf20Sopenharmony_ci */ 15518c2ecf20Sopenharmony_ci if (llist_add((struct llist_node *)&page->mapping, 15528c2ecf20Sopenharmony_ci &hpage_freelist)) 15538c2ecf20Sopenharmony_ci schedule_work(&free_hpage_work); 15548c2ecf20Sopenharmony_ci return; 15558c2ecf20Sopenharmony_ci } 15568c2ecf20Sopenharmony_ci 15578c2ecf20Sopenharmony_ci __free_huge_page(page); 15588c2ecf20Sopenharmony_ci} 15598c2ecf20Sopenharmony_ci 15608c2ecf20Sopenharmony_cistatic void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 15618c2ecf20Sopenharmony_ci{ 15628c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&page->lru); 15638c2ecf20Sopenharmony_ci set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); 15648c2ecf20Sopenharmony_ci set_hugetlb_cgroup(page, NULL); 15658c2ecf20Sopenharmony_ci set_hugetlb_cgroup_rsvd(page, NULL); 15668c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 15678c2ecf20Sopenharmony_ci h->nr_huge_pages++; 15688c2ecf20Sopenharmony_ci h->nr_huge_pages_node[nid]++; 15698c2ecf20Sopenharmony_ci ClearPageHugeFreed(page); 15708c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 15718c2ecf20Sopenharmony_ci} 15728c2ecf20Sopenharmony_ci 15738c2ecf20Sopenharmony_cistatic void prep_compound_gigantic_page(struct page *page, unsigned int order) 15748c2ecf20Sopenharmony_ci{ 15758c2ecf20Sopenharmony_ci int i; 15768c2ecf20Sopenharmony_ci int nr_pages = 1 << order; 15778c2ecf20Sopenharmony_ci struct page *p = page + 1; 15788c2ecf20Sopenharmony_ci 15798c2ecf20Sopenharmony_ci /* we rely on prep_new_huge_page to set the destructor */ 15808c2ecf20Sopenharmony_ci set_compound_order(page, order); 15818c2ecf20Sopenharmony_ci __ClearPageReserved(page); 15828c2ecf20Sopenharmony_ci __SetPageHead(page); 15838c2ecf20Sopenharmony_ci for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 15848c2ecf20Sopenharmony_ci /* 15858c2ecf20Sopenharmony_ci * For gigantic hugepages allocated through bootmem at 15868c2ecf20Sopenharmony_ci * boot, it's safer to be consistent with the not-gigantic 15878c2ecf20Sopenharmony_ci * hugepages and clear the PG_reserved bit from all tail pages 15888c2ecf20Sopenharmony_ci * too. Otherwise drivers using get_user_pages() to access tail 15898c2ecf20Sopenharmony_ci * pages may get the reference counting wrong if they see 15908c2ecf20Sopenharmony_ci * PG_reserved set on a tail page (despite the head page not 15918c2ecf20Sopenharmony_ci * having PG_reserved set). Enforcing this consistency between 15928c2ecf20Sopenharmony_ci * head and tail pages allows drivers to optimize away a check 15938c2ecf20Sopenharmony_ci * on the head page when they need know if put_page() is needed 15948c2ecf20Sopenharmony_ci * after get_user_pages(). 15958c2ecf20Sopenharmony_ci */ 15968c2ecf20Sopenharmony_ci __ClearPageReserved(p); 15978c2ecf20Sopenharmony_ci set_page_count(p, 0); 15988c2ecf20Sopenharmony_ci set_compound_head(p, page); 15998c2ecf20Sopenharmony_ci } 16008c2ecf20Sopenharmony_ci atomic_set(compound_mapcount_ptr(page), -1); 16018c2ecf20Sopenharmony_ci atomic_set(compound_pincount_ptr(page), 0); 16028c2ecf20Sopenharmony_ci} 16038c2ecf20Sopenharmony_ci 16048c2ecf20Sopenharmony_ci/* 16058c2ecf20Sopenharmony_ci * PageHuge() only returns true for hugetlbfs pages, but not for normal or 16068c2ecf20Sopenharmony_ci * transparent huge pages. See the PageTransHuge() documentation for more 16078c2ecf20Sopenharmony_ci * details. 16088c2ecf20Sopenharmony_ci */ 16098c2ecf20Sopenharmony_ciint PageHuge(struct page *page) 16108c2ecf20Sopenharmony_ci{ 16118c2ecf20Sopenharmony_ci if (!PageCompound(page)) 16128c2ecf20Sopenharmony_ci return 0; 16138c2ecf20Sopenharmony_ci 16148c2ecf20Sopenharmony_ci page = compound_head(page); 16158c2ecf20Sopenharmony_ci return page[1].compound_dtor == HUGETLB_PAGE_DTOR; 16168c2ecf20Sopenharmony_ci} 16178c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(PageHuge); 16188c2ecf20Sopenharmony_ci 16198c2ecf20Sopenharmony_ci/* 16208c2ecf20Sopenharmony_ci * PageHeadHuge() only returns true for hugetlbfs head page, but not for 16218c2ecf20Sopenharmony_ci * normal or transparent huge pages. 16228c2ecf20Sopenharmony_ci */ 16238c2ecf20Sopenharmony_ciint PageHeadHuge(struct page *page_head) 16248c2ecf20Sopenharmony_ci{ 16258c2ecf20Sopenharmony_ci if (!PageHead(page_head)) 16268c2ecf20Sopenharmony_ci return 0; 16278c2ecf20Sopenharmony_ci 16288c2ecf20Sopenharmony_ci return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR; 16298c2ecf20Sopenharmony_ci} 16308c2ecf20Sopenharmony_ci 16318c2ecf20Sopenharmony_ci/* 16328c2ecf20Sopenharmony_ci * Find and lock address space (mapping) in write mode. 16338c2ecf20Sopenharmony_ci * 16348c2ecf20Sopenharmony_ci * Upon entry, the page is locked which means that page_mapping() is 16358c2ecf20Sopenharmony_ci * stable. Due to locking order, we can only trylock_write. If we can 16368c2ecf20Sopenharmony_ci * not get the lock, simply return NULL to caller. 16378c2ecf20Sopenharmony_ci */ 16388c2ecf20Sopenharmony_cistruct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) 16398c2ecf20Sopenharmony_ci{ 16408c2ecf20Sopenharmony_ci struct address_space *mapping = page_mapping(hpage); 16418c2ecf20Sopenharmony_ci 16428c2ecf20Sopenharmony_ci if (!mapping) 16438c2ecf20Sopenharmony_ci return mapping; 16448c2ecf20Sopenharmony_ci 16458c2ecf20Sopenharmony_ci if (i_mmap_trylock_write(mapping)) 16468c2ecf20Sopenharmony_ci return mapping; 16478c2ecf20Sopenharmony_ci 16488c2ecf20Sopenharmony_ci return NULL; 16498c2ecf20Sopenharmony_ci} 16508c2ecf20Sopenharmony_ci 16518c2ecf20Sopenharmony_cipgoff_t hugetlb_basepage_index(struct page *page) 16528c2ecf20Sopenharmony_ci{ 16538c2ecf20Sopenharmony_ci struct page *page_head = compound_head(page); 16548c2ecf20Sopenharmony_ci pgoff_t index = page_index(page_head); 16558c2ecf20Sopenharmony_ci unsigned long compound_idx; 16568c2ecf20Sopenharmony_ci 16578c2ecf20Sopenharmony_ci if (compound_order(page_head) >= MAX_ORDER) 16588c2ecf20Sopenharmony_ci compound_idx = page_to_pfn(page) - page_to_pfn(page_head); 16598c2ecf20Sopenharmony_ci else 16608c2ecf20Sopenharmony_ci compound_idx = page - page_head; 16618c2ecf20Sopenharmony_ci 16628c2ecf20Sopenharmony_ci return (index << compound_order(page_head)) + compound_idx; 16638c2ecf20Sopenharmony_ci} 16648c2ecf20Sopenharmony_ci 16658c2ecf20Sopenharmony_cistatic struct page *alloc_buddy_huge_page(struct hstate *h, 16668c2ecf20Sopenharmony_ci gfp_t gfp_mask, int nid, nodemask_t *nmask, 16678c2ecf20Sopenharmony_ci nodemask_t *node_alloc_noretry) 16688c2ecf20Sopenharmony_ci{ 16698c2ecf20Sopenharmony_ci int order = huge_page_order(h); 16708c2ecf20Sopenharmony_ci struct page *page; 16718c2ecf20Sopenharmony_ci bool alloc_try_hard = true; 16728c2ecf20Sopenharmony_ci 16738c2ecf20Sopenharmony_ci /* 16748c2ecf20Sopenharmony_ci * By default we always try hard to allocate the page with 16758c2ecf20Sopenharmony_ci * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in 16768c2ecf20Sopenharmony_ci * a loop (to adjust global huge page counts) and previous allocation 16778c2ecf20Sopenharmony_ci * failed, do not continue to try hard on the same node. Use the 16788c2ecf20Sopenharmony_ci * node_alloc_noretry bitmap to manage this state information. 16798c2ecf20Sopenharmony_ci */ 16808c2ecf20Sopenharmony_ci if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) 16818c2ecf20Sopenharmony_ci alloc_try_hard = false; 16828c2ecf20Sopenharmony_ci gfp_mask |= __GFP_COMP|__GFP_NOWARN; 16838c2ecf20Sopenharmony_ci if (alloc_try_hard) 16848c2ecf20Sopenharmony_ci gfp_mask |= __GFP_RETRY_MAYFAIL; 16858c2ecf20Sopenharmony_ci if (nid == NUMA_NO_NODE) 16868c2ecf20Sopenharmony_ci nid = numa_mem_id(); 16878c2ecf20Sopenharmony_ci page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); 16888c2ecf20Sopenharmony_ci if (page) 16898c2ecf20Sopenharmony_ci __count_vm_event(HTLB_BUDDY_PGALLOC); 16908c2ecf20Sopenharmony_ci else 16918c2ecf20Sopenharmony_ci __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 16928c2ecf20Sopenharmony_ci 16938c2ecf20Sopenharmony_ci /* 16948c2ecf20Sopenharmony_ci * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this 16958c2ecf20Sopenharmony_ci * indicates an overall state change. Clear bit so that we resume 16968c2ecf20Sopenharmony_ci * normal 'try hard' allocations. 16978c2ecf20Sopenharmony_ci */ 16988c2ecf20Sopenharmony_ci if (node_alloc_noretry && page && !alloc_try_hard) 16998c2ecf20Sopenharmony_ci node_clear(nid, *node_alloc_noretry); 17008c2ecf20Sopenharmony_ci 17018c2ecf20Sopenharmony_ci /* 17028c2ecf20Sopenharmony_ci * If we tried hard to get a page but failed, set bit so that 17038c2ecf20Sopenharmony_ci * subsequent attempts will not try as hard until there is an 17048c2ecf20Sopenharmony_ci * overall state change. 17058c2ecf20Sopenharmony_ci */ 17068c2ecf20Sopenharmony_ci if (node_alloc_noretry && !page && alloc_try_hard) 17078c2ecf20Sopenharmony_ci node_set(nid, *node_alloc_noretry); 17088c2ecf20Sopenharmony_ci 17098c2ecf20Sopenharmony_ci return page; 17108c2ecf20Sopenharmony_ci} 17118c2ecf20Sopenharmony_ci 17128c2ecf20Sopenharmony_ci/* 17138c2ecf20Sopenharmony_ci * Common helper to allocate a fresh hugetlb page. All specific allocators 17148c2ecf20Sopenharmony_ci * should use this function to get new hugetlb pages 17158c2ecf20Sopenharmony_ci */ 17168c2ecf20Sopenharmony_cistatic struct page *alloc_fresh_huge_page(struct hstate *h, 17178c2ecf20Sopenharmony_ci gfp_t gfp_mask, int nid, nodemask_t *nmask, 17188c2ecf20Sopenharmony_ci nodemask_t *node_alloc_noretry) 17198c2ecf20Sopenharmony_ci{ 17208c2ecf20Sopenharmony_ci struct page *page; 17218c2ecf20Sopenharmony_ci 17228c2ecf20Sopenharmony_ci if (hstate_is_gigantic(h)) 17238c2ecf20Sopenharmony_ci page = alloc_gigantic_page(h, gfp_mask, nid, nmask); 17248c2ecf20Sopenharmony_ci else 17258c2ecf20Sopenharmony_ci page = alloc_buddy_huge_page(h, gfp_mask, 17268c2ecf20Sopenharmony_ci nid, nmask, node_alloc_noretry); 17278c2ecf20Sopenharmony_ci if (!page) 17288c2ecf20Sopenharmony_ci return NULL; 17298c2ecf20Sopenharmony_ci 17308c2ecf20Sopenharmony_ci if (hstate_is_gigantic(h)) 17318c2ecf20Sopenharmony_ci prep_compound_gigantic_page(page, huge_page_order(h)); 17328c2ecf20Sopenharmony_ci prep_new_huge_page(h, page, page_to_nid(page)); 17338c2ecf20Sopenharmony_ci 17348c2ecf20Sopenharmony_ci return page; 17358c2ecf20Sopenharmony_ci} 17368c2ecf20Sopenharmony_ci 17378c2ecf20Sopenharmony_ci/* 17388c2ecf20Sopenharmony_ci * Allocates a fresh page to the hugetlb allocator pool in the node interleaved 17398c2ecf20Sopenharmony_ci * manner. 17408c2ecf20Sopenharmony_ci */ 17418c2ecf20Sopenharmony_cistatic int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, 17428c2ecf20Sopenharmony_ci nodemask_t *node_alloc_noretry) 17438c2ecf20Sopenharmony_ci{ 17448c2ecf20Sopenharmony_ci struct page *page; 17458c2ecf20Sopenharmony_ci int nr_nodes, node; 17468c2ecf20Sopenharmony_ci gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 17478c2ecf20Sopenharmony_ci 17488c2ecf20Sopenharmony_ci for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 17498c2ecf20Sopenharmony_ci page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, 17508c2ecf20Sopenharmony_ci node_alloc_noretry); 17518c2ecf20Sopenharmony_ci if (page) 17528c2ecf20Sopenharmony_ci break; 17538c2ecf20Sopenharmony_ci } 17548c2ecf20Sopenharmony_ci 17558c2ecf20Sopenharmony_ci if (!page) 17568c2ecf20Sopenharmony_ci return 0; 17578c2ecf20Sopenharmony_ci 17588c2ecf20Sopenharmony_ci put_page(page); /* free it into the hugepage allocator */ 17598c2ecf20Sopenharmony_ci 17608c2ecf20Sopenharmony_ci return 1; 17618c2ecf20Sopenharmony_ci} 17628c2ecf20Sopenharmony_ci 17638c2ecf20Sopenharmony_ci/* 17648c2ecf20Sopenharmony_ci * Free huge page from pool from next node to free. 17658c2ecf20Sopenharmony_ci * Attempt to keep persistent huge pages more or less 17668c2ecf20Sopenharmony_ci * balanced over allowed nodes. 17678c2ecf20Sopenharmony_ci * Called with hugetlb_lock locked. 17688c2ecf20Sopenharmony_ci */ 17698c2ecf20Sopenharmony_cistatic int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, 17708c2ecf20Sopenharmony_ci bool acct_surplus) 17718c2ecf20Sopenharmony_ci{ 17728c2ecf20Sopenharmony_ci int nr_nodes, node; 17738c2ecf20Sopenharmony_ci int ret = 0; 17748c2ecf20Sopenharmony_ci 17758c2ecf20Sopenharmony_ci for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 17768c2ecf20Sopenharmony_ci /* 17778c2ecf20Sopenharmony_ci * If we're returning unused surplus pages, only examine 17788c2ecf20Sopenharmony_ci * nodes with surplus pages. 17798c2ecf20Sopenharmony_ci */ 17808c2ecf20Sopenharmony_ci if ((!acct_surplus || h->surplus_huge_pages_node[node]) && 17818c2ecf20Sopenharmony_ci !list_empty(&h->hugepage_freelists[node])) { 17828c2ecf20Sopenharmony_ci struct page *page = 17838c2ecf20Sopenharmony_ci list_entry(h->hugepage_freelists[node].next, 17848c2ecf20Sopenharmony_ci struct page, lru); 17858c2ecf20Sopenharmony_ci list_del(&page->lru); 17868c2ecf20Sopenharmony_ci h->free_huge_pages--; 17878c2ecf20Sopenharmony_ci h->free_huge_pages_node[node]--; 17888c2ecf20Sopenharmony_ci if (acct_surplus) { 17898c2ecf20Sopenharmony_ci h->surplus_huge_pages--; 17908c2ecf20Sopenharmony_ci h->surplus_huge_pages_node[node]--; 17918c2ecf20Sopenharmony_ci } 17928c2ecf20Sopenharmony_ci update_and_free_page(h, page); 17938c2ecf20Sopenharmony_ci ret = 1; 17948c2ecf20Sopenharmony_ci break; 17958c2ecf20Sopenharmony_ci } 17968c2ecf20Sopenharmony_ci } 17978c2ecf20Sopenharmony_ci 17988c2ecf20Sopenharmony_ci return ret; 17998c2ecf20Sopenharmony_ci} 18008c2ecf20Sopenharmony_ci 18018c2ecf20Sopenharmony_ci/* 18028c2ecf20Sopenharmony_ci * Dissolve a given free hugepage into free buddy pages. This function does 18038c2ecf20Sopenharmony_ci * nothing for in-use hugepages and non-hugepages. 18048c2ecf20Sopenharmony_ci * This function returns values like below: 18058c2ecf20Sopenharmony_ci * 18068c2ecf20Sopenharmony_ci * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use 18078c2ecf20Sopenharmony_ci * (allocated or reserved.) 18088c2ecf20Sopenharmony_ci * 0: successfully dissolved free hugepages or the page is not a 18098c2ecf20Sopenharmony_ci * hugepage (considered as already dissolved) 18108c2ecf20Sopenharmony_ci */ 18118c2ecf20Sopenharmony_ciint dissolve_free_huge_page(struct page *page) 18128c2ecf20Sopenharmony_ci{ 18138c2ecf20Sopenharmony_ci int rc = -EBUSY; 18148c2ecf20Sopenharmony_ci 18158c2ecf20Sopenharmony_ciretry: 18168c2ecf20Sopenharmony_ci /* Not to disrupt normal path by vainly holding hugetlb_lock */ 18178c2ecf20Sopenharmony_ci if (!PageHuge(page)) 18188c2ecf20Sopenharmony_ci return 0; 18198c2ecf20Sopenharmony_ci 18208c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 18218c2ecf20Sopenharmony_ci if (!PageHuge(page)) { 18228c2ecf20Sopenharmony_ci rc = 0; 18238c2ecf20Sopenharmony_ci goto out; 18248c2ecf20Sopenharmony_ci } 18258c2ecf20Sopenharmony_ci 18268c2ecf20Sopenharmony_ci if (!page_count(page)) { 18278c2ecf20Sopenharmony_ci struct page *head = compound_head(page); 18288c2ecf20Sopenharmony_ci struct hstate *h = page_hstate(head); 18298c2ecf20Sopenharmony_ci int nid = page_to_nid(head); 18308c2ecf20Sopenharmony_ci if (h->free_huge_pages - h->resv_huge_pages == 0) 18318c2ecf20Sopenharmony_ci goto out; 18328c2ecf20Sopenharmony_ci 18338c2ecf20Sopenharmony_ci /* 18348c2ecf20Sopenharmony_ci * We should make sure that the page is already on the free list 18358c2ecf20Sopenharmony_ci * when it is dissolved. 18368c2ecf20Sopenharmony_ci */ 18378c2ecf20Sopenharmony_ci if (unlikely(!PageHugeFreed(head))) { 18388c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 18398c2ecf20Sopenharmony_ci cond_resched(); 18408c2ecf20Sopenharmony_ci 18418c2ecf20Sopenharmony_ci /* 18428c2ecf20Sopenharmony_ci * Theoretically, we should return -EBUSY when we 18438c2ecf20Sopenharmony_ci * encounter this race. In fact, we have a chance 18448c2ecf20Sopenharmony_ci * to successfully dissolve the page if we do a 18458c2ecf20Sopenharmony_ci * retry. Because the race window is quite small. 18468c2ecf20Sopenharmony_ci * If we seize this opportunity, it is an optimization 18478c2ecf20Sopenharmony_ci * for increasing the success rate of dissolving page. 18488c2ecf20Sopenharmony_ci */ 18498c2ecf20Sopenharmony_ci goto retry; 18508c2ecf20Sopenharmony_ci } 18518c2ecf20Sopenharmony_ci 18528c2ecf20Sopenharmony_ci /* 18538c2ecf20Sopenharmony_ci * Move PageHWPoison flag from head page to the raw error page, 18548c2ecf20Sopenharmony_ci * which makes any subpages rather than the error page reusable. 18558c2ecf20Sopenharmony_ci */ 18568c2ecf20Sopenharmony_ci if (PageHWPoison(head) && page != head) { 18578c2ecf20Sopenharmony_ci SetPageHWPoison(page); 18588c2ecf20Sopenharmony_ci ClearPageHWPoison(head); 18598c2ecf20Sopenharmony_ci } 18608c2ecf20Sopenharmony_ci list_del(&head->lru); 18618c2ecf20Sopenharmony_ci h->free_huge_pages--; 18628c2ecf20Sopenharmony_ci h->free_huge_pages_node[nid]--; 18638c2ecf20Sopenharmony_ci h->max_huge_pages--; 18648c2ecf20Sopenharmony_ci update_and_free_page(h, head); 18658c2ecf20Sopenharmony_ci rc = 0; 18668c2ecf20Sopenharmony_ci } 18678c2ecf20Sopenharmony_ciout: 18688c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 18698c2ecf20Sopenharmony_ci return rc; 18708c2ecf20Sopenharmony_ci} 18718c2ecf20Sopenharmony_ci 18728c2ecf20Sopenharmony_ci/* 18738c2ecf20Sopenharmony_ci * Dissolve free hugepages in a given pfn range. Used by memory hotplug to 18748c2ecf20Sopenharmony_ci * make specified memory blocks removable from the system. 18758c2ecf20Sopenharmony_ci * Note that this will dissolve a free gigantic hugepage completely, if any 18768c2ecf20Sopenharmony_ci * part of it lies within the given range. 18778c2ecf20Sopenharmony_ci * Also note that if dissolve_free_huge_page() returns with an error, all 18788c2ecf20Sopenharmony_ci * free hugepages that were dissolved before that error are lost. 18798c2ecf20Sopenharmony_ci */ 18808c2ecf20Sopenharmony_ciint dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) 18818c2ecf20Sopenharmony_ci{ 18828c2ecf20Sopenharmony_ci unsigned long pfn; 18838c2ecf20Sopenharmony_ci struct page *page; 18848c2ecf20Sopenharmony_ci int rc = 0; 18858c2ecf20Sopenharmony_ci 18868c2ecf20Sopenharmony_ci if (!hugepages_supported()) 18878c2ecf20Sopenharmony_ci return rc; 18888c2ecf20Sopenharmony_ci 18898c2ecf20Sopenharmony_ci for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) { 18908c2ecf20Sopenharmony_ci page = pfn_to_page(pfn); 18918c2ecf20Sopenharmony_ci rc = dissolve_free_huge_page(page); 18928c2ecf20Sopenharmony_ci if (rc) 18938c2ecf20Sopenharmony_ci break; 18948c2ecf20Sopenharmony_ci } 18958c2ecf20Sopenharmony_ci 18968c2ecf20Sopenharmony_ci return rc; 18978c2ecf20Sopenharmony_ci} 18988c2ecf20Sopenharmony_ci 18998c2ecf20Sopenharmony_ci/* 19008c2ecf20Sopenharmony_ci * Allocates a fresh surplus page from the page allocator. 19018c2ecf20Sopenharmony_ci */ 19028c2ecf20Sopenharmony_cistatic struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, 19038c2ecf20Sopenharmony_ci int nid, nodemask_t *nmask) 19048c2ecf20Sopenharmony_ci{ 19058c2ecf20Sopenharmony_ci struct page *page = NULL; 19068c2ecf20Sopenharmony_ci 19078c2ecf20Sopenharmony_ci if (hstate_is_gigantic(h)) 19088c2ecf20Sopenharmony_ci return NULL; 19098c2ecf20Sopenharmony_ci 19108c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 19118c2ecf20Sopenharmony_ci if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) 19128c2ecf20Sopenharmony_ci goto out_unlock; 19138c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 19148c2ecf20Sopenharmony_ci 19158c2ecf20Sopenharmony_ci page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); 19168c2ecf20Sopenharmony_ci if (!page) 19178c2ecf20Sopenharmony_ci return NULL; 19188c2ecf20Sopenharmony_ci 19198c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 19208c2ecf20Sopenharmony_ci /* 19218c2ecf20Sopenharmony_ci * We could have raced with the pool size change. 19228c2ecf20Sopenharmony_ci * Double check that and simply deallocate the new page 19238c2ecf20Sopenharmony_ci * if we would end up overcommiting the surpluses. Abuse 19248c2ecf20Sopenharmony_ci * temporary page to workaround the nasty free_huge_page 19258c2ecf20Sopenharmony_ci * codeflow 19268c2ecf20Sopenharmony_ci */ 19278c2ecf20Sopenharmony_ci if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { 19288c2ecf20Sopenharmony_ci SetPageHugeTemporary(page); 19298c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 19308c2ecf20Sopenharmony_ci put_page(page); 19318c2ecf20Sopenharmony_ci return NULL; 19328c2ecf20Sopenharmony_ci } else { 19338c2ecf20Sopenharmony_ci h->surplus_huge_pages++; 19348c2ecf20Sopenharmony_ci h->surplus_huge_pages_node[page_to_nid(page)]++; 19358c2ecf20Sopenharmony_ci } 19368c2ecf20Sopenharmony_ci 19378c2ecf20Sopenharmony_ciout_unlock: 19388c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 19398c2ecf20Sopenharmony_ci 19408c2ecf20Sopenharmony_ci return page; 19418c2ecf20Sopenharmony_ci} 19428c2ecf20Sopenharmony_ci 19438c2ecf20Sopenharmony_cistatic struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, 19448c2ecf20Sopenharmony_ci int nid, nodemask_t *nmask) 19458c2ecf20Sopenharmony_ci{ 19468c2ecf20Sopenharmony_ci struct page *page; 19478c2ecf20Sopenharmony_ci 19488c2ecf20Sopenharmony_ci if (hstate_is_gigantic(h)) 19498c2ecf20Sopenharmony_ci return NULL; 19508c2ecf20Sopenharmony_ci 19518c2ecf20Sopenharmony_ci page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); 19528c2ecf20Sopenharmony_ci if (!page) 19538c2ecf20Sopenharmony_ci return NULL; 19548c2ecf20Sopenharmony_ci 19558c2ecf20Sopenharmony_ci /* 19568c2ecf20Sopenharmony_ci * We do not account these pages as surplus because they are only 19578c2ecf20Sopenharmony_ci * temporary and will be released properly on the last reference 19588c2ecf20Sopenharmony_ci */ 19598c2ecf20Sopenharmony_ci SetPageHugeTemporary(page); 19608c2ecf20Sopenharmony_ci 19618c2ecf20Sopenharmony_ci return page; 19628c2ecf20Sopenharmony_ci} 19638c2ecf20Sopenharmony_ci 19648c2ecf20Sopenharmony_ci/* 19658c2ecf20Sopenharmony_ci * Use the VMA's mpolicy to allocate a huge page from the buddy. 19668c2ecf20Sopenharmony_ci */ 19678c2ecf20Sopenharmony_cistatic 19688c2ecf20Sopenharmony_cistruct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, 19698c2ecf20Sopenharmony_ci struct vm_area_struct *vma, unsigned long addr) 19708c2ecf20Sopenharmony_ci{ 19718c2ecf20Sopenharmony_ci struct page *page; 19728c2ecf20Sopenharmony_ci struct mempolicy *mpol; 19738c2ecf20Sopenharmony_ci gfp_t gfp_mask = htlb_alloc_mask(h); 19748c2ecf20Sopenharmony_ci int nid; 19758c2ecf20Sopenharmony_ci nodemask_t *nodemask; 19768c2ecf20Sopenharmony_ci 19778c2ecf20Sopenharmony_ci nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); 19788c2ecf20Sopenharmony_ci page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); 19798c2ecf20Sopenharmony_ci mpol_cond_put(mpol); 19808c2ecf20Sopenharmony_ci 19818c2ecf20Sopenharmony_ci return page; 19828c2ecf20Sopenharmony_ci} 19838c2ecf20Sopenharmony_ci 19848c2ecf20Sopenharmony_ci/* page migration callback function */ 19858c2ecf20Sopenharmony_cistruct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, 19868c2ecf20Sopenharmony_ci nodemask_t *nmask, gfp_t gfp_mask) 19878c2ecf20Sopenharmony_ci{ 19888c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 19898c2ecf20Sopenharmony_ci if (h->free_huge_pages - h->resv_huge_pages > 0) { 19908c2ecf20Sopenharmony_ci struct page *page; 19918c2ecf20Sopenharmony_ci 19928c2ecf20Sopenharmony_ci page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); 19938c2ecf20Sopenharmony_ci if (page) { 19948c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 19958c2ecf20Sopenharmony_ci return page; 19968c2ecf20Sopenharmony_ci } 19978c2ecf20Sopenharmony_ci } 19988c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 19998c2ecf20Sopenharmony_ci 20008c2ecf20Sopenharmony_ci return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); 20018c2ecf20Sopenharmony_ci} 20028c2ecf20Sopenharmony_ci 20038c2ecf20Sopenharmony_ci/* mempolicy aware migration callback */ 20048c2ecf20Sopenharmony_cistruct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, 20058c2ecf20Sopenharmony_ci unsigned long address) 20068c2ecf20Sopenharmony_ci{ 20078c2ecf20Sopenharmony_ci struct mempolicy *mpol; 20088c2ecf20Sopenharmony_ci nodemask_t *nodemask; 20098c2ecf20Sopenharmony_ci struct page *page; 20108c2ecf20Sopenharmony_ci gfp_t gfp_mask; 20118c2ecf20Sopenharmony_ci int node; 20128c2ecf20Sopenharmony_ci 20138c2ecf20Sopenharmony_ci gfp_mask = htlb_alloc_mask(h); 20148c2ecf20Sopenharmony_ci node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); 20158c2ecf20Sopenharmony_ci page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask); 20168c2ecf20Sopenharmony_ci mpol_cond_put(mpol); 20178c2ecf20Sopenharmony_ci 20188c2ecf20Sopenharmony_ci return page; 20198c2ecf20Sopenharmony_ci} 20208c2ecf20Sopenharmony_ci 20218c2ecf20Sopenharmony_ci/* 20228c2ecf20Sopenharmony_ci * Increase the hugetlb pool such that it can accommodate a reservation 20238c2ecf20Sopenharmony_ci * of size 'delta'. 20248c2ecf20Sopenharmony_ci */ 20258c2ecf20Sopenharmony_cistatic int gather_surplus_pages(struct hstate *h, long delta) 20268c2ecf20Sopenharmony_ci __must_hold(&hugetlb_lock) 20278c2ecf20Sopenharmony_ci{ 20288c2ecf20Sopenharmony_ci struct list_head surplus_list; 20298c2ecf20Sopenharmony_ci struct page *page, *tmp; 20308c2ecf20Sopenharmony_ci int ret; 20318c2ecf20Sopenharmony_ci long i; 20328c2ecf20Sopenharmony_ci long needed, allocated; 20338c2ecf20Sopenharmony_ci bool alloc_ok = true; 20348c2ecf20Sopenharmony_ci 20358c2ecf20Sopenharmony_ci needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 20368c2ecf20Sopenharmony_ci if (needed <= 0) { 20378c2ecf20Sopenharmony_ci h->resv_huge_pages += delta; 20388c2ecf20Sopenharmony_ci return 0; 20398c2ecf20Sopenharmony_ci } 20408c2ecf20Sopenharmony_ci 20418c2ecf20Sopenharmony_ci allocated = 0; 20428c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&surplus_list); 20438c2ecf20Sopenharmony_ci 20448c2ecf20Sopenharmony_ci ret = -ENOMEM; 20458c2ecf20Sopenharmony_ciretry: 20468c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 20478c2ecf20Sopenharmony_ci for (i = 0; i < needed; i++) { 20488c2ecf20Sopenharmony_ci page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), 20498c2ecf20Sopenharmony_ci NUMA_NO_NODE, NULL); 20508c2ecf20Sopenharmony_ci if (!page) { 20518c2ecf20Sopenharmony_ci alloc_ok = false; 20528c2ecf20Sopenharmony_ci break; 20538c2ecf20Sopenharmony_ci } 20548c2ecf20Sopenharmony_ci list_add(&page->lru, &surplus_list); 20558c2ecf20Sopenharmony_ci cond_resched(); 20568c2ecf20Sopenharmony_ci } 20578c2ecf20Sopenharmony_ci allocated += i; 20588c2ecf20Sopenharmony_ci 20598c2ecf20Sopenharmony_ci /* 20608c2ecf20Sopenharmony_ci * After retaking hugetlb_lock, we need to recalculate 'needed' 20618c2ecf20Sopenharmony_ci * because either resv_huge_pages or free_huge_pages may have changed. 20628c2ecf20Sopenharmony_ci */ 20638c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 20648c2ecf20Sopenharmony_ci needed = (h->resv_huge_pages + delta) - 20658c2ecf20Sopenharmony_ci (h->free_huge_pages + allocated); 20668c2ecf20Sopenharmony_ci if (needed > 0) { 20678c2ecf20Sopenharmony_ci if (alloc_ok) 20688c2ecf20Sopenharmony_ci goto retry; 20698c2ecf20Sopenharmony_ci /* 20708c2ecf20Sopenharmony_ci * We were not able to allocate enough pages to 20718c2ecf20Sopenharmony_ci * satisfy the entire reservation so we free what 20728c2ecf20Sopenharmony_ci * we've allocated so far. 20738c2ecf20Sopenharmony_ci */ 20748c2ecf20Sopenharmony_ci goto free; 20758c2ecf20Sopenharmony_ci } 20768c2ecf20Sopenharmony_ci /* 20778c2ecf20Sopenharmony_ci * The surplus_list now contains _at_least_ the number of extra pages 20788c2ecf20Sopenharmony_ci * needed to accommodate the reservation. Add the appropriate number 20798c2ecf20Sopenharmony_ci * of pages to the hugetlb pool and free the extras back to the buddy 20808c2ecf20Sopenharmony_ci * allocator. Commit the entire reservation here to prevent another 20818c2ecf20Sopenharmony_ci * process from stealing the pages as they are added to the pool but 20828c2ecf20Sopenharmony_ci * before they are reserved. 20838c2ecf20Sopenharmony_ci */ 20848c2ecf20Sopenharmony_ci needed += allocated; 20858c2ecf20Sopenharmony_ci h->resv_huge_pages += delta; 20868c2ecf20Sopenharmony_ci ret = 0; 20878c2ecf20Sopenharmony_ci 20888c2ecf20Sopenharmony_ci /* Free the needed pages to the hugetlb pool */ 20898c2ecf20Sopenharmony_ci list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 20908c2ecf20Sopenharmony_ci if ((--needed) < 0) 20918c2ecf20Sopenharmony_ci break; 20928c2ecf20Sopenharmony_ci /* 20938c2ecf20Sopenharmony_ci * This page is now managed by the hugetlb allocator and has 20948c2ecf20Sopenharmony_ci * no users -- drop the buddy allocator's reference. 20958c2ecf20Sopenharmony_ci */ 20968c2ecf20Sopenharmony_ci put_page_testzero(page); 20978c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(page_count(page), page); 20988c2ecf20Sopenharmony_ci enqueue_huge_page(h, page); 20998c2ecf20Sopenharmony_ci } 21008c2ecf20Sopenharmony_cifree: 21018c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 21028c2ecf20Sopenharmony_ci 21038c2ecf20Sopenharmony_ci /* Free unnecessary surplus pages to the buddy allocator */ 21048c2ecf20Sopenharmony_ci list_for_each_entry_safe(page, tmp, &surplus_list, lru) 21058c2ecf20Sopenharmony_ci put_page(page); 21068c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 21078c2ecf20Sopenharmony_ci 21088c2ecf20Sopenharmony_ci return ret; 21098c2ecf20Sopenharmony_ci} 21108c2ecf20Sopenharmony_ci 21118c2ecf20Sopenharmony_ci/* 21128c2ecf20Sopenharmony_ci * This routine has two main purposes: 21138c2ecf20Sopenharmony_ci * 1) Decrement the reservation count (resv_huge_pages) by the value passed 21148c2ecf20Sopenharmony_ci * in unused_resv_pages. This corresponds to the prior adjustments made 21158c2ecf20Sopenharmony_ci * to the associated reservation map. 21168c2ecf20Sopenharmony_ci * 2) Free any unused surplus pages that may have been allocated to satisfy 21178c2ecf20Sopenharmony_ci * the reservation. As many as unused_resv_pages may be freed. 21188c2ecf20Sopenharmony_ci * 21198c2ecf20Sopenharmony_ci * Called with hugetlb_lock held. However, the lock could be dropped (and 21208c2ecf20Sopenharmony_ci * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, 21218c2ecf20Sopenharmony_ci * we must make sure nobody else can claim pages we are in the process of 21228c2ecf20Sopenharmony_ci * freeing. Do this by ensuring resv_huge_page always is greater than the 21238c2ecf20Sopenharmony_ci * number of huge pages we plan to free when dropping the lock. 21248c2ecf20Sopenharmony_ci */ 21258c2ecf20Sopenharmony_cistatic void return_unused_surplus_pages(struct hstate *h, 21268c2ecf20Sopenharmony_ci unsigned long unused_resv_pages) 21278c2ecf20Sopenharmony_ci{ 21288c2ecf20Sopenharmony_ci unsigned long nr_pages; 21298c2ecf20Sopenharmony_ci 21308c2ecf20Sopenharmony_ci /* Cannot return gigantic pages currently */ 21318c2ecf20Sopenharmony_ci if (hstate_is_gigantic(h)) 21328c2ecf20Sopenharmony_ci goto out; 21338c2ecf20Sopenharmony_ci 21348c2ecf20Sopenharmony_ci /* 21358c2ecf20Sopenharmony_ci * Part (or even all) of the reservation could have been backed 21368c2ecf20Sopenharmony_ci * by pre-allocated pages. Only free surplus pages. 21378c2ecf20Sopenharmony_ci */ 21388c2ecf20Sopenharmony_ci nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 21398c2ecf20Sopenharmony_ci 21408c2ecf20Sopenharmony_ci /* 21418c2ecf20Sopenharmony_ci * We want to release as many surplus pages as possible, spread 21428c2ecf20Sopenharmony_ci * evenly across all nodes with memory. Iterate across these nodes 21438c2ecf20Sopenharmony_ci * until we can no longer free unreserved surplus pages. This occurs 21448c2ecf20Sopenharmony_ci * when the nodes with surplus pages have no free pages. 21458c2ecf20Sopenharmony_ci * free_pool_huge_page() will balance the freed pages across the 21468c2ecf20Sopenharmony_ci * on-line nodes with memory and will handle the hstate accounting. 21478c2ecf20Sopenharmony_ci * 21488c2ecf20Sopenharmony_ci * Note that we decrement resv_huge_pages as we free the pages. If 21498c2ecf20Sopenharmony_ci * we drop the lock, resv_huge_pages will still be sufficiently large 21508c2ecf20Sopenharmony_ci * to cover subsequent pages we may free. 21518c2ecf20Sopenharmony_ci */ 21528c2ecf20Sopenharmony_ci while (nr_pages--) { 21538c2ecf20Sopenharmony_ci h->resv_huge_pages--; 21548c2ecf20Sopenharmony_ci unused_resv_pages--; 21558c2ecf20Sopenharmony_ci if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) 21568c2ecf20Sopenharmony_ci goto out; 21578c2ecf20Sopenharmony_ci cond_resched_lock(&hugetlb_lock); 21588c2ecf20Sopenharmony_ci } 21598c2ecf20Sopenharmony_ci 21608c2ecf20Sopenharmony_ciout: 21618c2ecf20Sopenharmony_ci /* Fully uncommit the reservation */ 21628c2ecf20Sopenharmony_ci h->resv_huge_pages -= unused_resv_pages; 21638c2ecf20Sopenharmony_ci} 21648c2ecf20Sopenharmony_ci 21658c2ecf20Sopenharmony_ci 21668c2ecf20Sopenharmony_ci/* 21678c2ecf20Sopenharmony_ci * vma_needs_reservation, vma_commit_reservation and vma_end_reservation 21688c2ecf20Sopenharmony_ci * are used by the huge page allocation routines to manage reservations. 21698c2ecf20Sopenharmony_ci * 21708c2ecf20Sopenharmony_ci * vma_needs_reservation is called to determine if the huge page at addr 21718c2ecf20Sopenharmony_ci * within the vma has an associated reservation. If a reservation is 21728c2ecf20Sopenharmony_ci * needed, the value 1 is returned. The caller is then responsible for 21738c2ecf20Sopenharmony_ci * managing the global reservation and subpool usage counts. After 21748c2ecf20Sopenharmony_ci * the huge page has been allocated, vma_commit_reservation is called 21758c2ecf20Sopenharmony_ci * to add the page to the reservation map. If the page allocation fails, 21768c2ecf20Sopenharmony_ci * the reservation must be ended instead of committed. vma_end_reservation 21778c2ecf20Sopenharmony_ci * is called in such cases. 21788c2ecf20Sopenharmony_ci * 21798c2ecf20Sopenharmony_ci * In the normal case, vma_commit_reservation returns the same value 21808c2ecf20Sopenharmony_ci * as the preceding vma_needs_reservation call. The only time this 21818c2ecf20Sopenharmony_ci * is not the case is if a reserve map was changed between calls. It 21828c2ecf20Sopenharmony_ci * is the responsibility of the caller to notice the difference and 21838c2ecf20Sopenharmony_ci * take appropriate action. 21848c2ecf20Sopenharmony_ci * 21858c2ecf20Sopenharmony_ci * vma_add_reservation is used in error paths where a reservation must 21868c2ecf20Sopenharmony_ci * be restored when a newly allocated huge page must be freed. It is 21878c2ecf20Sopenharmony_ci * to be called after calling vma_needs_reservation to determine if a 21888c2ecf20Sopenharmony_ci * reservation exists. 21898c2ecf20Sopenharmony_ci */ 21908c2ecf20Sopenharmony_cienum vma_resv_mode { 21918c2ecf20Sopenharmony_ci VMA_NEEDS_RESV, 21928c2ecf20Sopenharmony_ci VMA_COMMIT_RESV, 21938c2ecf20Sopenharmony_ci VMA_END_RESV, 21948c2ecf20Sopenharmony_ci VMA_ADD_RESV, 21958c2ecf20Sopenharmony_ci}; 21968c2ecf20Sopenharmony_cistatic long __vma_reservation_common(struct hstate *h, 21978c2ecf20Sopenharmony_ci struct vm_area_struct *vma, unsigned long addr, 21988c2ecf20Sopenharmony_ci enum vma_resv_mode mode) 21998c2ecf20Sopenharmony_ci{ 22008c2ecf20Sopenharmony_ci struct resv_map *resv; 22018c2ecf20Sopenharmony_ci pgoff_t idx; 22028c2ecf20Sopenharmony_ci long ret; 22038c2ecf20Sopenharmony_ci long dummy_out_regions_needed; 22048c2ecf20Sopenharmony_ci 22058c2ecf20Sopenharmony_ci resv = vma_resv_map(vma); 22068c2ecf20Sopenharmony_ci if (!resv) 22078c2ecf20Sopenharmony_ci return 1; 22088c2ecf20Sopenharmony_ci 22098c2ecf20Sopenharmony_ci idx = vma_hugecache_offset(h, vma, addr); 22108c2ecf20Sopenharmony_ci switch (mode) { 22118c2ecf20Sopenharmony_ci case VMA_NEEDS_RESV: 22128c2ecf20Sopenharmony_ci ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); 22138c2ecf20Sopenharmony_ci /* We assume that vma_reservation_* routines always operate on 22148c2ecf20Sopenharmony_ci * 1 page, and that adding to resv map a 1 page entry can only 22158c2ecf20Sopenharmony_ci * ever require 1 region. 22168c2ecf20Sopenharmony_ci */ 22178c2ecf20Sopenharmony_ci VM_BUG_ON(dummy_out_regions_needed != 1); 22188c2ecf20Sopenharmony_ci break; 22198c2ecf20Sopenharmony_ci case VMA_COMMIT_RESV: 22208c2ecf20Sopenharmony_ci ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 22218c2ecf20Sopenharmony_ci /* region_add calls of range 1 should never fail. */ 22228c2ecf20Sopenharmony_ci VM_BUG_ON(ret < 0); 22238c2ecf20Sopenharmony_ci break; 22248c2ecf20Sopenharmony_ci case VMA_END_RESV: 22258c2ecf20Sopenharmony_ci region_abort(resv, idx, idx + 1, 1); 22268c2ecf20Sopenharmony_ci ret = 0; 22278c2ecf20Sopenharmony_ci break; 22288c2ecf20Sopenharmony_ci case VMA_ADD_RESV: 22298c2ecf20Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE) { 22308c2ecf20Sopenharmony_ci ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 22318c2ecf20Sopenharmony_ci /* region_add calls of range 1 should never fail. */ 22328c2ecf20Sopenharmony_ci VM_BUG_ON(ret < 0); 22338c2ecf20Sopenharmony_ci } else { 22348c2ecf20Sopenharmony_ci region_abort(resv, idx, idx + 1, 1); 22358c2ecf20Sopenharmony_ci ret = region_del(resv, idx, idx + 1); 22368c2ecf20Sopenharmony_ci } 22378c2ecf20Sopenharmony_ci break; 22388c2ecf20Sopenharmony_ci default: 22398c2ecf20Sopenharmony_ci BUG(); 22408c2ecf20Sopenharmony_ci } 22418c2ecf20Sopenharmony_ci 22428c2ecf20Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE) 22438c2ecf20Sopenharmony_ci return ret; 22448c2ecf20Sopenharmony_ci else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) { 22458c2ecf20Sopenharmony_ci /* 22468c2ecf20Sopenharmony_ci * In most cases, reserves always exist for private mappings. 22478c2ecf20Sopenharmony_ci * However, a file associated with mapping could have been 22488c2ecf20Sopenharmony_ci * hole punched or truncated after reserves were consumed. 22498c2ecf20Sopenharmony_ci * As subsequent fault on such a range will not use reserves. 22508c2ecf20Sopenharmony_ci * Subtle - The reserve map for private mappings has the 22518c2ecf20Sopenharmony_ci * opposite meaning than that of shared mappings. If NO 22528c2ecf20Sopenharmony_ci * entry is in the reserve map, it means a reservation exists. 22538c2ecf20Sopenharmony_ci * If an entry exists in the reserve map, it means the 22548c2ecf20Sopenharmony_ci * reservation has already been consumed. As a result, the 22558c2ecf20Sopenharmony_ci * return value of this routine is the opposite of the 22568c2ecf20Sopenharmony_ci * value returned from reserve map manipulation routines above. 22578c2ecf20Sopenharmony_ci */ 22588c2ecf20Sopenharmony_ci if (ret) 22598c2ecf20Sopenharmony_ci return 0; 22608c2ecf20Sopenharmony_ci else 22618c2ecf20Sopenharmony_ci return 1; 22628c2ecf20Sopenharmony_ci } 22638c2ecf20Sopenharmony_ci else 22648c2ecf20Sopenharmony_ci return ret < 0 ? ret : 0; 22658c2ecf20Sopenharmony_ci} 22668c2ecf20Sopenharmony_ci 22678c2ecf20Sopenharmony_cistatic long vma_needs_reservation(struct hstate *h, 22688c2ecf20Sopenharmony_ci struct vm_area_struct *vma, unsigned long addr) 22698c2ecf20Sopenharmony_ci{ 22708c2ecf20Sopenharmony_ci return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); 22718c2ecf20Sopenharmony_ci} 22728c2ecf20Sopenharmony_ci 22738c2ecf20Sopenharmony_cistatic long vma_commit_reservation(struct hstate *h, 22748c2ecf20Sopenharmony_ci struct vm_area_struct *vma, unsigned long addr) 22758c2ecf20Sopenharmony_ci{ 22768c2ecf20Sopenharmony_ci return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); 22778c2ecf20Sopenharmony_ci} 22788c2ecf20Sopenharmony_ci 22798c2ecf20Sopenharmony_cistatic void vma_end_reservation(struct hstate *h, 22808c2ecf20Sopenharmony_ci struct vm_area_struct *vma, unsigned long addr) 22818c2ecf20Sopenharmony_ci{ 22828c2ecf20Sopenharmony_ci (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); 22838c2ecf20Sopenharmony_ci} 22848c2ecf20Sopenharmony_ci 22858c2ecf20Sopenharmony_cistatic long vma_add_reservation(struct hstate *h, 22868c2ecf20Sopenharmony_ci struct vm_area_struct *vma, unsigned long addr) 22878c2ecf20Sopenharmony_ci{ 22888c2ecf20Sopenharmony_ci return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); 22898c2ecf20Sopenharmony_ci} 22908c2ecf20Sopenharmony_ci 22918c2ecf20Sopenharmony_ci/* 22928c2ecf20Sopenharmony_ci * This routine is called to restore a reservation on error paths. In the 22938c2ecf20Sopenharmony_ci * specific error paths, a huge page was allocated (via alloc_huge_page) 22948c2ecf20Sopenharmony_ci * and is about to be freed. If a reservation for the page existed, 22958c2ecf20Sopenharmony_ci * alloc_huge_page would have consumed the reservation and set PagePrivate 22968c2ecf20Sopenharmony_ci * in the newly allocated page. When the page is freed via free_huge_page, 22978c2ecf20Sopenharmony_ci * the global reservation count will be incremented if PagePrivate is set. 22988c2ecf20Sopenharmony_ci * However, free_huge_page can not adjust the reserve map. Adjust the 22998c2ecf20Sopenharmony_ci * reserve map here to be consistent with global reserve count adjustments 23008c2ecf20Sopenharmony_ci * to be made by free_huge_page. 23018c2ecf20Sopenharmony_ci */ 23028c2ecf20Sopenharmony_cistatic void restore_reserve_on_error(struct hstate *h, 23038c2ecf20Sopenharmony_ci struct vm_area_struct *vma, unsigned long address, 23048c2ecf20Sopenharmony_ci struct page *page) 23058c2ecf20Sopenharmony_ci{ 23068c2ecf20Sopenharmony_ci if (unlikely(PagePrivate(page))) { 23078c2ecf20Sopenharmony_ci long rc = vma_needs_reservation(h, vma, address); 23088c2ecf20Sopenharmony_ci 23098c2ecf20Sopenharmony_ci if (unlikely(rc < 0)) { 23108c2ecf20Sopenharmony_ci /* 23118c2ecf20Sopenharmony_ci * Rare out of memory condition in reserve map 23128c2ecf20Sopenharmony_ci * manipulation. Clear PagePrivate so that 23138c2ecf20Sopenharmony_ci * global reserve count will not be incremented 23148c2ecf20Sopenharmony_ci * by free_huge_page. This will make it appear 23158c2ecf20Sopenharmony_ci * as though the reservation for this page was 23168c2ecf20Sopenharmony_ci * consumed. This may prevent the task from 23178c2ecf20Sopenharmony_ci * faulting in the page at a later time. This 23188c2ecf20Sopenharmony_ci * is better than inconsistent global huge page 23198c2ecf20Sopenharmony_ci * accounting of reserve counts. 23208c2ecf20Sopenharmony_ci */ 23218c2ecf20Sopenharmony_ci ClearPagePrivate(page); 23228c2ecf20Sopenharmony_ci } else if (rc) { 23238c2ecf20Sopenharmony_ci rc = vma_add_reservation(h, vma, address); 23248c2ecf20Sopenharmony_ci if (unlikely(rc < 0)) 23258c2ecf20Sopenharmony_ci /* 23268c2ecf20Sopenharmony_ci * See above comment about rare out of 23278c2ecf20Sopenharmony_ci * memory condition. 23288c2ecf20Sopenharmony_ci */ 23298c2ecf20Sopenharmony_ci ClearPagePrivate(page); 23308c2ecf20Sopenharmony_ci } else 23318c2ecf20Sopenharmony_ci vma_end_reservation(h, vma, address); 23328c2ecf20Sopenharmony_ci } 23338c2ecf20Sopenharmony_ci} 23348c2ecf20Sopenharmony_ci 23358c2ecf20Sopenharmony_cistruct page *alloc_huge_page(struct vm_area_struct *vma, 23368c2ecf20Sopenharmony_ci unsigned long addr, int avoid_reserve) 23378c2ecf20Sopenharmony_ci{ 23388c2ecf20Sopenharmony_ci struct hugepage_subpool *spool = subpool_vma(vma); 23398c2ecf20Sopenharmony_ci struct hstate *h = hstate_vma(vma); 23408c2ecf20Sopenharmony_ci struct page *page; 23418c2ecf20Sopenharmony_ci long map_chg, map_commit; 23428c2ecf20Sopenharmony_ci long gbl_chg; 23438c2ecf20Sopenharmony_ci int ret, idx; 23448c2ecf20Sopenharmony_ci struct hugetlb_cgroup *h_cg; 23458c2ecf20Sopenharmony_ci bool deferred_reserve; 23468c2ecf20Sopenharmony_ci 23478c2ecf20Sopenharmony_ci idx = hstate_index(h); 23488c2ecf20Sopenharmony_ci /* 23498c2ecf20Sopenharmony_ci * Examine the region/reserve map to determine if the process 23508c2ecf20Sopenharmony_ci * has a reservation for the page to be allocated. A return 23518c2ecf20Sopenharmony_ci * code of zero indicates a reservation exists (no change). 23528c2ecf20Sopenharmony_ci */ 23538c2ecf20Sopenharmony_ci map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); 23548c2ecf20Sopenharmony_ci if (map_chg < 0) 23558c2ecf20Sopenharmony_ci return ERR_PTR(-ENOMEM); 23568c2ecf20Sopenharmony_ci 23578c2ecf20Sopenharmony_ci /* 23588c2ecf20Sopenharmony_ci * Processes that did not create the mapping will have no 23598c2ecf20Sopenharmony_ci * reserves as indicated by the region/reserve map. Check 23608c2ecf20Sopenharmony_ci * that the allocation will not exceed the subpool limit. 23618c2ecf20Sopenharmony_ci * Allocations for MAP_NORESERVE mappings also need to be 23628c2ecf20Sopenharmony_ci * checked against any subpool limit. 23638c2ecf20Sopenharmony_ci */ 23648c2ecf20Sopenharmony_ci if (map_chg || avoid_reserve) { 23658c2ecf20Sopenharmony_ci gbl_chg = hugepage_subpool_get_pages(spool, 1); 23668c2ecf20Sopenharmony_ci if (gbl_chg < 0) { 23678c2ecf20Sopenharmony_ci vma_end_reservation(h, vma, addr); 23688c2ecf20Sopenharmony_ci return ERR_PTR(-ENOSPC); 23698c2ecf20Sopenharmony_ci } 23708c2ecf20Sopenharmony_ci 23718c2ecf20Sopenharmony_ci /* 23728c2ecf20Sopenharmony_ci * Even though there was no reservation in the region/reserve 23738c2ecf20Sopenharmony_ci * map, there could be reservations associated with the 23748c2ecf20Sopenharmony_ci * subpool that can be used. This would be indicated if the 23758c2ecf20Sopenharmony_ci * return value of hugepage_subpool_get_pages() is zero. 23768c2ecf20Sopenharmony_ci * However, if avoid_reserve is specified we still avoid even 23778c2ecf20Sopenharmony_ci * the subpool reservations. 23788c2ecf20Sopenharmony_ci */ 23798c2ecf20Sopenharmony_ci if (avoid_reserve) 23808c2ecf20Sopenharmony_ci gbl_chg = 1; 23818c2ecf20Sopenharmony_ci } 23828c2ecf20Sopenharmony_ci 23838c2ecf20Sopenharmony_ci /* If this allocation is not consuming a reservation, charge it now. 23848c2ecf20Sopenharmony_ci */ 23858c2ecf20Sopenharmony_ci deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma); 23868c2ecf20Sopenharmony_ci if (deferred_reserve) { 23878c2ecf20Sopenharmony_ci ret = hugetlb_cgroup_charge_cgroup_rsvd( 23888c2ecf20Sopenharmony_ci idx, pages_per_huge_page(h), &h_cg); 23898c2ecf20Sopenharmony_ci if (ret) 23908c2ecf20Sopenharmony_ci goto out_subpool_put; 23918c2ecf20Sopenharmony_ci } 23928c2ecf20Sopenharmony_ci 23938c2ecf20Sopenharmony_ci ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 23948c2ecf20Sopenharmony_ci if (ret) 23958c2ecf20Sopenharmony_ci goto out_uncharge_cgroup_reservation; 23968c2ecf20Sopenharmony_ci 23978c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 23988c2ecf20Sopenharmony_ci /* 23998c2ecf20Sopenharmony_ci * glb_chg is passed to indicate whether or not a page must be taken 24008c2ecf20Sopenharmony_ci * from the global free pool (global change). gbl_chg == 0 indicates 24018c2ecf20Sopenharmony_ci * a reservation exists for the allocation. 24028c2ecf20Sopenharmony_ci */ 24038c2ecf20Sopenharmony_ci page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); 24048c2ecf20Sopenharmony_ci if (!page) { 24058c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 24068c2ecf20Sopenharmony_ci page = alloc_buddy_huge_page_with_mpol(h, vma, addr); 24078c2ecf20Sopenharmony_ci if (!page) 24088c2ecf20Sopenharmony_ci goto out_uncharge_cgroup; 24098c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 24108c2ecf20Sopenharmony_ci if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { 24118c2ecf20Sopenharmony_ci SetPagePrivate(page); 24128c2ecf20Sopenharmony_ci h->resv_huge_pages--; 24138c2ecf20Sopenharmony_ci } 24148c2ecf20Sopenharmony_ci list_add(&page->lru, &h->hugepage_activelist); 24158c2ecf20Sopenharmony_ci /* Fall through */ 24168c2ecf20Sopenharmony_ci } 24178c2ecf20Sopenharmony_ci hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); 24188c2ecf20Sopenharmony_ci /* If allocation is not consuming a reservation, also store the 24198c2ecf20Sopenharmony_ci * hugetlb_cgroup pointer on the page. 24208c2ecf20Sopenharmony_ci */ 24218c2ecf20Sopenharmony_ci if (deferred_reserve) { 24228c2ecf20Sopenharmony_ci hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), 24238c2ecf20Sopenharmony_ci h_cg, page); 24248c2ecf20Sopenharmony_ci } 24258c2ecf20Sopenharmony_ci 24268c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 24278c2ecf20Sopenharmony_ci 24288c2ecf20Sopenharmony_ci set_page_private(page, (unsigned long)spool); 24298c2ecf20Sopenharmony_ci 24308c2ecf20Sopenharmony_ci map_commit = vma_commit_reservation(h, vma, addr); 24318c2ecf20Sopenharmony_ci if (unlikely(map_chg > map_commit)) { 24328c2ecf20Sopenharmony_ci /* 24338c2ecf20Sopenharmony_ci * The page was added to the reservation map between 24348c2ecf20Sopenharmony_ci * vma_needs_reservation and vma_commit_reservation. 24358c2ecf20Sopenharmony_ci * This indicates a race with hugetlb_reserve_pages. 24368c2ecf20Sopenharmony_ci * Adjust for the subpool count incremented above AND 24378c2ecf20Sopenharmony_ci * in hugetlb_reserve_pages for the same page. Also, 24388c2ecf20Sopenharmony_ci * the reservation count added in hugetlb_reserve_pages 24398c2ecf20Sopenharmony_ci * no longer applies. 24408c2ecf20Sopenharmony_ci */ 24418c2ecf20Sopenharmony_ci long rsv_adjust; 24428c2ecf20Sopenharmony_ci 24438c2ecf20Sopenharmony_ci rsv_adjust = hugepage_subpool_put_pages(spool, 1); 24448c2ecf20Sopenharmony_ci hugetlb_acct_memory(h, -rsv_adjust); 24458c2ecf20Sopenharmony_ci if (deferred_reserve) 24468c2ecf20Sopenharmony_ci hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), 24478c2ecf20Sopenharmony_ci pages_per_huge_page(h), page); 24488c2ecf20Sopenharmony_ci } 24498c2ecf20Sopenharmony_ci return page; 24508c2ecf20Sopenharmony_ci 24518c2ecf20Sopenharmony_ciout_uncharge_cgroup: 24528c2ecf20Sopenharmony_ci hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); 24538c2ecf20Sopenharmony_ciout_uncharge_cgroup_reservation: 24548c2ecf20Sopenharmony_ci if (deferred_reserve) 24558c2ecf20Sopenharmony_ci hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), 24568c2ecf20Sopenharmony_ci h_cg); 24578c2ecf20Sopenharmony_ciout_subpool_put: 24588c2ecf20Sopenharmony_ci if (map_chg || avoid_reserve) 24598c2ecf20Sopenharmony_ci hugepage_subpool_put_pages(spool, 1); 24608c2ecf20Sopenharmony_ci vma_end_reservation(h, vma, addr); 24618c2ecf20Sopenharmony_ci return ERR_PTR(-ENOSPC); 24628c2ecf20Sopenharmony_ci} 24638c2ecf20Sopenharmony_ci 24648c2ecf20Sopenharmony_ciint alloc_bootmem_huge_page(struct hstate *h) 24658c2ecf20Sopenharmony_ci __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); 24668c2ecf20Sopenharmony_ciint __alloc_bootmem_huge_page(struct hstate *h) 24678c2ecf20Sopenharmony_ci{ 24688c2ecf20Sopenharmony_ci struct huge_bootmem_page *m; 24698c2ecf20Sopenharmony_ci int nr_nodes, node; 24708c2ecf20Sopenharmony_ci 24718c2ecf20Sopenharmony_ci for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 24728c2ecf20Sopenharmony_ci void *addr; 24738c2ecf20Sopenharmony_ci 24748c2ecf20Sopenharmony_ci addr = memblock_alloc_try_nid_raw( 24758c2ecf20Sopenharmony_ci huge_page_size(h), huge_page_size(h), 24768c2ecf20Sopenharmony_ci 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); 24778c2ecf20Sopenharmony_ci if (addr) { 24788c2ecf20Sopenharmony_ci /* 24798c2ecf20Sopenharmony_ci * Use the beginning of the huge page to store the 24808c2ecf20Sopenharmony_ci * huge_bootmem_page struct (until gather_bootmem 24818c2ecf20Sopenharmony_ci * puts them into the mem_map). 24828c2ecf20Sopenharmony_ci */ 24838c2ecf20Sopenharmony_ci m = addr; 24848c2ecf20Sopenharmony_ci goto found; 24858c2ecf20Sopenharmony_ci } 24868c2ecf20Sopenharmony_ci } 24878c2ecf20Sopenharmony_ci return 0; 24888c2ecf20Sopenharmony_ci 24898c2ecf20Sopenharmony_cifound: 24908c2ecf20Sopenharmony_ci BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); 24918c2ecf20Sopenharmony_ci /* Put them into a private list first because mem_map is not up yet */ 24928c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&m->list); 24938c2ecf20Sopenharmony_ci list_add(&m->list, &huge_boot_pages); 24948c2ecf20Sopenharmony_ci m->hstate = h; 24958c2ecf20Sopenharmony_ci return 1; 24968c2ecf20Sopenharmony_ci} 24978c2ecf20Sopenharmony_ci 24988c2ecf20Sopenharmony_ci/* 24998c2ecf20Sopenharmony_ci * Put bootmem huge pages into the standard lists after mem_map is up. 25008c2ecf20Sopenharmony_ci * Note: This only applies to gigantic (order > MAX_ORDER) pages. 25018c2ecf20Sopenharmony_ci */ 25028c2ecf20Sopenharmony_cistatic void __init gather_bootmem_prealloc(void) 25038c2ecf20Sopenharmony_ci{ 25048c2ecf20Sopenharmony_ci struct huge_bootmem_page *m; 25058c2ecf20Sopenharmony_ci 25068c2ecf20Sopenharmony_ci list_for_each_entry(m, &huge_boot_pages, list) { 25078c2ecf20Sopenharmony_ci struct page *page = virt_to_page(m); 25088c2ecf20Sopenharmony_ci struct hstate *h = m->hstate; 25098c2ecf20Sopenharmony_ci 25108c2ecf20Sopenharmony_ci VM_BUG_ON(!hstate_is_gigantic(h)); 25118c2ecf20Sopenharmony_ci WARN_ON(page_count(page) != 1); 25128c2ecf20Sopenharmony_ci prep_compound_gigantic_page(page, huge_page_order(h)); 25138c2ecf20Sopenharmony_ci WARN_ON(PageReserved(page)); 25148c2ecf20Sopenharmony_ci prep_new_huge_page(h, page, page_to_nid(page)); 25158c2ecf20Sopenharmony_ci put_page(page); /* free it into the hugepage allocator */ 25168c2ecf20Sopenharmony_ci 25178c2ecf20Sopenharmony_ci /* 25188c2ecf20Sopenharmony_ci * We need to restore the 'stolen' pages to totalram_pages 25198c2ecf20Sopenharmony_ci * in order to fix confusing memory reports from free(1) and 25208c2ecf20Sopenharmony_ci * other side-effects, like CommitLimit going negative. 25218c2ecf20Sopenharmony_ci */ 25228c2ecf20Sopenharmony_ci adjust_managed_page_count(page, pages_per_huge_page(h)); 25238c2ecf20Sopenharmony_ci cond_resched(); 25248c2ecf20Sopenharmony_ci } 25258c2ecf20Sopenharmony_ci} 25268c2ecf20Sopenharmony_ci 25278c2ecf20Sopenharmony_cistatic void __init hugetlb_hstate_alloc_pages(struct hstate *h) 25288c2ecf20Sopenharmony_ci{ 25298c2ecf20Sopenharmony_ci unsigned long i; 25308c2ecf20Sopenharmony_ci nodemask_t *node_alloc_noretry; 25318c2ecf20Sopenharmony_ci 25328c2ecf20Sopenharmony_ci if (!hstate_is_gigantic(h)) { 25338c2ecf20Sopenharmony_ci /* 25348c2ecf20Sopenharmony_ci * Bit mask controlling how hard we retry per-node allocations. 25358c2ecf20Sopenharmony_ci * Ignore errors as lower level routines can deal with 25368c2ecf20Sopenharmony_ci * node_alloc_noretry == NULL. If this kmalloc fails at boot 25378c2ecf20Sopenharmony_ci * time, we are likely in bigger trouble. 25388c2ecf20Sopenharmony_ci */ 25398c2ecf20Sopenharmony_ci node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), 25408c2ecf20Sopenharmony_ci GFP_KERNEL); 25418c2ecf20Sopenharmony_ci } else { 25428c2ecf20Sopenharmony_ci /* allocations done at boot time */ 25438c2ecf20Sopenharmony_ci node_alloc_noretry = NULL; 25448c2ecf20Sopenharmony_ci } 25458c2ecf20Sopenharmony_ci 25468c2ecf20Sopenharmony_ci /* bit mask controlling how hard we retry per-node allocations */ 25478c2ecf20Sopenharmony_ci if (node_alloc_noretry) 25488c2ecf20Sopenharmony_ci nodes_clear(*node_alloc_noretry); 25498c2ecf20Sopenharmony_ci 25508c2ecf20Sopenharmony_ci for (i = 0; i < h->max_huge_pages; ++i) { 25518c2ecf20Sopenharmony_ci if (hstate_is_gigantic(h)) { 25528c2ecf20Sopenharmony_ci if (hugetlb_cma_size) { 25538c2ecf20Sopenharmony_ci pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); 25548c2ecf20Sopenharmony_ci goto free; 25558c2ecf20Sopenharmony_ci } 25568c2ecf20Sopenharmony_ci if (!alloc_bootmem_huge_page(h)) 25578c2ecf20Sopenharmony_ci break; 25588c2ecf20Sopenharmony_ci } else if (!alloc_pool_huge_page(h, 25598c2ecf20Sopenharmony_ci &node_states[N_MEMORY], 25608c2ecf20Sopenharmony_ci node_alloc_noretry)) 25618c2ecf20Sopenharmony_ci break; 25628c2ecf20Sopenharmony_ci cond_resched(); 25638c2ecf20Sopenharmony_ci } 25648c2ecf20Sopenharmony_ci if (i < h->max_huge_pages) { 25658c2ecf20Sopenharmony_ci char buf[32]; 25668c2ecf20Sopenharmony_ci 25678c2ecf20Sopenharmony_ci string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 25688c2ecf20Sopenharmony_ci pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", 25698c2ecf20Sopenharmony_ci h->max_huge_pages, buf, i); 25708c2ecf20Sopenharmony_ci h->max_huge_pages = i; 25718c2ecf20Sopenharmony_ci } 25728c2ecf20Sopenharmony_cifree: 25738c2ecf20Sopenharmony_ci kfree(node_alloc_noretry); 25748c2ecf20Sopenharmony_ci} 25758c2ecf20Sopenharmony_ci 25768c2ecf20Sopenharmony_cistatic void __init hugetlb_init_hstates(void) 25778c2ecf20Sopenharmony_ci{ 25788c2ecf20Sopenharmony_ci struct hstate *h; 25798c2ecf20Sopenharmony_ci 25808c2ecf20Sopenharmony_ci for_each_hstate(h) { 25818c2ecf20Sopenharmony_ci if (minimum_order > huge_page_order(h)) 25828c2ecf20Sopenharmony_ci minimum_order = huge_page_order(h); 25838c2ecf20Sopenharmony_ci 25848c2ecf20Sopenharmony_ci /* oversize hugepages were init'ed in early boot */ 25858c2ecf20Sopenharmony_ci if (!hstate_is_gigantic(h)) 25868c2ecf20Sopenharmony_ci hugetlb_hstate_alloc_pages(h); 25878c2ecf20Sopenharmony_ci } 25888c2ecf20Sopenharmony_ci VM_BUG_ON(minimum_order == UINT_MAX); 25898c2ecf20Sopenharmony_ci} 25908c2ecf20Sopenharmony_ci 25918c2ecf20Sopenharmony_cistatic void __init report_hugepages(void) 25928c2ecf20Sopenharmony_ci{ 25938c2ecf20Sopenharmony_ci struct hstate *h; 25948c2ecf20Sopenharmony_ci 25958c2ecf20Sopenharmony_ci for_each_hstate(h) { 25968c2ecf20Sopenharmony_ci char buf[32]; 25978c2ecf20Sopenharmony_ci 25988c2ecf20Sopenharmony_ci string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); 25998c2ecf20Sopenharmony_ci pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", 26008c2ecf20Sopenharmony_ci buf, h->free_huge_pages); 26018c2ecf20Sopenharmony_ci } 26028c2ecf20Sopenharmony_ci} 26038c2ecf20Sopenharmony_ci 26048c2ecf20Sopenharmony_ci#ifdef CONFIG_HIGHMEM 26058c2ecf20Sopenharmony_cistatic void try_to_free_low(struct hstate *h, unsigned long count, 26068c2ecf20Sopenharmony_ci nodemask_t *nodes_allowed) 26078c2ecf20Sopenharmony_ci{ 26088c2ecf20Sopenharmony_ci int i; 26098c2ecf20Sopenharmony_ci 26108c2ecf20Sopenharmony_ci if (hstate_is_gigantic(h)) 26118c2ecf20Sopenharmony_ci return; 26128c2ecf20Sopenharmony_ci 26138c2ecf20Sopenharmony_ci for_each_node_mask(i, *nodes_allowed) { 26148c2ecf20Sopenharmony_ci struct page *page, *next; 26158c2ecf20Sopenharmony_ci struct list_head *freel = &h->hugepage_freelists[i]; 26168c2ecf20Sopenharmony_ci list_for_each_entry_safe(page, next, freel, lru) { 26178c2ecf20Sopenharmony_ci if (count >= h->nr_huge_pages) 26188c2ecf20Sopenharmony_ci return; 26198c2ecf20Sopenharmony_ci if (PageHighMem(page)) 26208c2ecf20Sopenharmony_ci continue; 26218c2ecf20Sopenharmony_ci list_del(&page->lru); 26228c2ecf20Sopenharmony_ci update_and_free_page(h, page); 26238c2ecf20Sopenharmony_ci h->free_huge_pages--; 26248c2ecf20Sopenharmony_ci h->free_huge_pages_node[page_to_nid(page)]--; 26258c2ecf20Sopenharmony_ci } 26268c2ecf20Sopenharmony_ci } 26278c2ecf20Sopenharmony_ci} 26288c2ecf20Sopenharmony_ci#else 26298c2ecf20Sopenharmony_cistatic inline void try_to_free_low(struct hstate *h, unsigned long count, 26308c2ecf20Sopenharmony_ci nodemask_t *nodes_allowed) 26318c2ecf20Sopenharmony_ci{ 26328c2ecf20Sopenharmony_ci} 26338c2ecf20Sopenharmony_ci#endif 26348c2ecf20Sopenharmony_ci 26358c2ecf20Sopenharmony_ci/* 26368c2ecf20Sopenharmony_ci * Increment or decrement surplus_huge_pages. Keep node-specific counters 26378c2ecf20Sopenharmony_ci * balanced by operating on them in a round-robin fashion. 26388c2ecf20Sopenharmony_ci * Returns 1 if an adjustment was made. 26398c2ecf20Sopenharmony_ci */ 26408c2ecf20Sopenharmony_cistatic int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 26418c2ecf20Sopenharmony_ci int delta) 26428c2ecf20Sopenharmony_ci{ 26438c2ecf20Sopenharmony_ci int nr_nodes, node; 26448c2ecf20Sopenharmony_ci 26458c2ecf20Sopenharmony_ci VM_BUG_ON(delta != -1 && delta != 1); 26468c2ecf20Sopenharmony_ci 26478c2ecf20Sopenharmony_ci if (delta < 0) { 26488c2ecf20Sopenharmony_ci for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 26498c2ecf20Sopenharmony_ci if (h->surplus_huge_pages_node[node]) 26508c2ecf20Sopenharmony_ci goto found; 26518c2ecf20Sopenharmony_ci } 26528c2ecf20Sopenharmony_ci } else { 26538c2ecf20Sopenharmony_ci for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 26548c2ecf20Sopenharmony_ci if (h->surplus_huge_pages_node[node] < 26558c2ecf20Sopenharmony_ci h->nr_huge_pages_node[node]) 26568c2ecf20Sopenharmony_ci goto found; 26578c2ecf20Sopenharmony_ci } 26588c2ecf20Sopenharmony_ci } 26598c2ecf20Sopenharmony_ci return 0; 26608c2ecf20Sopenharmony_ci 26618c2ecf20Sopenharmony_cifound: 26628c2ecf20Sopenharmony_ci h->surplus_huge_pages += delta; 26638c2ecf20Sopenharmony_ci h->surplus_huge_pages_node[node] += delta; 26648c2ecf20Sopenharmony_ci return 1; 26658c2ecf20Sopenharmony_ci} 26668c2ecf20Sopenharmony_ci 26678c2ecf20Sopenharmony_ci#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 26688c2ecf20Sopenharmony_cistatic int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, 26698c2ecf20Sopenharmony_ci nodemask_t *nodes_allowed) 26708c2ecf20Sopenharmony_ci{ 26718c2ecf20Sopenharmony_ci unsigned long min_count, ret; 26728c2ecf20Sopenharmony_ci NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); 26738c2ecf20Sopenharmony_ci 26748c2ecf20Sopenharmony_ci /* 26758c2ecf20Sopenharmony_ci * Bit mask controlling how hard we retry per-node allocations. 26768c2ecf20Sopenharmony_ci * If we can not allocate the bit mask, do not attempt to allocate 26778c2ecf20Sopenharmony_ci * the requested huge pages. 26788c2ecf20Sopenharmony_ci */ 26798c2ecf20Sopenharmony_ci if (node_alloc_noretry) 26808c2ecf20Sopenharmony_ci nodes_clear(*node_alloc_noretry); 26818c2ecf20Sopenharmony_ci else 26828c2ecf20Sopenharmony_ci return -ENOMEM; 26838c2ecf20Sopenharmony_ci 26848c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 26858c2ecf20Sopenharmony_ci 26868c2ecf20Sopenharmony_ci /* 26878c2ecf20Sopenharmony_ci * Check for a node specific request. 26888c2ecf20Sopenharmony_ci * Changing node specific huge page count may require a corresponding 26898c2ecf20Sopenharmony_ci * change to the global count. In any case, the passed node mask 26908c2ecf20Sopenharmony_ci * (nodes_allowed) will restrict alloc/free to the specified node. 26918c2ecf20Sopenharmony_ci */ 26928c2ecf20Sopenharmony_ci if (nid != NUMA_NO_NODE) { 26938c2ecf20Sopenharmony_ci unsigned long old_count = count; 26948c2ecf20Sopenharmony_ci 26958c2ecf20Sopenharmony_ci count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 26968c2ecf20Sopenharmony_ci /* 26978c2ecf20Sopenharmony_ci * User may have specified a large count value which caused the 26988c2ecf20Sopenharmony_ci * above calculation to overflow. In this case, they wanted 26998c2ecf20Sopenharmony_ci * to allocate as many huge pages as possible. Set count to 27008c2ecf20Sopenharmony_ci * largest possible value to align with their intention. 27018c2ecf20Sopenharmony_ci */ 27028c2ecf20Sopenharmony_ci if (count < old_count) 27038c2ecf20Sopenharmony_ci count = ULONG_MAX; 27048c2ecf20Sopenharmony_ci } 27058c2ecf20Sopenharmony_ci 27068c2ecf20Sopenharmony_ci /* 27078c2ecf20Sopenharmony_ci * Gigantic pages runtime allocation depend on the capability for large 27088c2ecf20Sopenharmony_ci * page range allocation. 27098c2ecf20Sopenharmony_ci * If the system does not provide this feature, return an error when 27108c2ecf20Sopenharmony_ci * the user tries to allocate gigantic pages but let the user free the 27118c2ecf20Sopenharmony_ci * boottime allocated gigantic pages. 27128c2ecf20Sopenharmony_ci */ 27138c2ecf20Sopenharmony_ci if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { 27148c2ecf20Sopenharmony_ci if (count > persistent_huge_pages(h)) { 27158c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 27168c2ecf20Sopenharmony_ci NODEMASK_FREE(node_alloc_noretry); 27178c2ecf20Sopenharmony_ci return -EINVAL; 27188c2ecf20Sopenharmony_ci } 27198c2ecf20Sopenharmony_ci /* Fall through to decrease pool */ 27208c2ecf20Sopenharmony_ci } 27218c2ecf20Sopenharmony_ci 27228c2ecf20Sopenharmony_ci /* 27238c2ecf20Sopenharmony_ci * Increase the pool size 27248c2ecf20Sopenharmony_ci * First take pages out of surplus state. Then make up the 27258c2ecf20Sopenharmony_ci * remaining difference by allocating fresh huge pages. 27268c2ecf20Sopenharmony_ci * 27278c2ecf20Sopenharmony_ci * We might race with alloc_surplus_huge_page() here and be unable 27288c2ecf20Sopenharmony_ci * to convert a surplus huge page to a normal huge page. That is 27298c2ecf20Sopenharmony_ci * not critical, though, it just means the overall size of the 27308c2ecf20Sopenharmony_ci * pool might be one hugepage larger than it needs to be, but 27318c2ecf20Sopenharmony_ci * within all the constraints specified by the sysctls. 27328c2ecf20Sopenharmony_ci */ 27338c2ecf20Sopenharmony_ci while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 27348c2ecf20Sopenharmony_ci if (!adjust_pool_surplus(h, nodes_allowed, -1)) 27358c2ecf20Sopenharmony_ci break; 27368c2ecf20Sopenharmony_ci } 27378c2ecf20Sopenharmony_ci 27388c2ecf20Sopenharmony_ci while (count > persistent_huge_pages(h)) { 27398c2ecf20Sopenharmony_ci /* 27408c2ecf20Sopenharmony_ci * If this allocation races such that we no longer need the 27418c2ecf20Sopenharmony_ci * page, free_huge_page will handle it by freeing the page 27428c2ecf20Sopenharmony_ci * and reducing the surplus. 27438c2ecf20Sopenharmony_ci */ 27448c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 27458c2ecf20Sopenharmony_ci 27468c2ecf20Sopenharmony_ci /* yield cpu to avoid soft lockup */ 27478c2ecf20Sopenharmony_ci cond_resched(); 27488c2ecf20Sopenharmony_ci 27498c2ecf20Sopenharmony_ci ret = alloc_pool_huge_page(h, nodes_allowed, 27508c2ecf20Sopenharmony_ci node_alloc_noretry); 27518c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 27528c2ecf20Sopenharmony_ci if (!ret) 27538c2ecf20Sopenharmony_ci goto out; 27548c2ecf20Sopenharmony_ci 27558c2ecf20Sopenharmony_ci /* Bail for signals. Probably ctrl-c from user */ 27568c2ecf20Sopenharmony_ci if (signal_pending(current)) 27578c2ecf20Sopenharmony_ci goto out; 27588c2ecf20Sopenharmony_ci } 27598c2ecf20Sopenharmony_ci 27608c2ecf20Sopenharmony_ci /* 27618c2ecf20Sopenharmony_ci * Decrease the pool size 27628c2ecf20Sopenharmony_ci * First return free pages to the buddy allocator (being careful 27638c2ecf20Sopenharmony_ci * to keep enough around to satisfy reservations). Then place 27648c2ecf20Sopenharmony_ci * pages into surplus state as needed so the pool will shrink 27658c2ecf20Sopenharmony_ci * to the desired size as pages become free. 27668c2ecf20Sopenharmony_ci * 27678c2ecf20Sopenharmony_ci * By placing pages into the surplus state independent of the 27688c2ecf20Sopenharmony_ci * overcommit value, we are allowing the surplus pool size to 27698c2ecf20Sopenharmony_ci * exceed overcommit. There are few sane options here. Since 27708c2ecf20Sopenharmony_ci * alloc_surplus_huge_page() is checking the global counter, 27718c2ecf20Sopenharmony_ci * though, we'll note that we're not allowed to exceed surplus 27728c2ecf20Sopenharmony_ci * and won't grow the pool anywhere else. Not until one of the 27738c2ecf20Sopenharmony_ci * sysctls are changed, or the surplus pages go out of use. 27748c2ecf20Sopenharmony_ci */ 27758c2ecf20Sopenharmony_ci min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 27768c2ecf20Sopenharmony_ci min_count = max(count, min_count); 27778c2ecf20Sopenharmony_ci try_to_free_low(h, min_count, nodes_allowed); 27788c2ecf20Sopenharmony_ci while (min_count < persistent_huge_pages(h)) { 27798c2ecf20Sopenharmony_ci if (!free_pool_huge_page(h, nodes_allowed, 0)) 27808c2ecf20Sopenharmony_ci break; 27818c2ecf20Sopenharmony_ci cond_resched_lock(&hugetlb_lock); 27828c2ecf20Sopenharmony_ci } 27838c2ecf20Sopenharmony_ci while (count < persistent_huge_pages(h)) { 27848c2ecf20Sopenharmony_ci if (!adjust_pool_surplus(h, nodes_allowed, 1)) 27858c2ecf20Sopenharmony_ci break; 27868c2ecf20Sopenharmony_ci } 27878c2ecf20Sopenharmony_ciout: 27888c2ecf20Sopenharmony_ci h->max_huge_pages = persistent_huge_pages(h); 27898c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 27908c2ecf20Sopenharmony_ci 27918c2ecf20Sopenharmony_ci NODEMASK_FREE(node_alloc_noretry); 27928c2ecf20Sopenharmony_ci 27938c2ecf20Sopenharmony_ci return 0; 27948c2ecf20Sopenharmony_ci} 27958c2ecf20Sopenharmony_ci 27968c2ecf20Sopenharmony_ci#define HSTATE_ATTR_RO(_name) \ 27978c2ecf20Sopenharmony_ci static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 27988c2ecf20Sopenharmony_ci 27998c2ecf20Sopenharmony_ci#define HSTATE_ATTR(_name) \ 28008c2ecf20Sopenharmony_ci static struct kobj_attribute _name##_attr = \ 28018c2ecf20Sopenharmony_ci __ATTR(_name, 0644, _name##_show, _name##_store) 28028c2ecf20Sopenharmony_ci 28038c2ecf20Sopenharmony_cistatic struct kobject *hugepages_kobj; 28048c2ecf20Sopenharmony_cistatic struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 28058c2ecf20Sopenharmony_ci 28068c2ecf20Sopenharmony_cistatic struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); 28078c2ecf20Sopenharmony_ci 28088c2ecf20Sopenharmony_cistatic struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) 28098c2ecf20Sopenharmony_ci{ 28108c2ecf20Sopenharmony_ci int i; 28118c2ecf20Sopenharmony_ci 28128c2ecf20Sopenharmony_ci for (i = 0; i < HUGE_MAX_HSTATE; i++) 28138c2ecf20Sopenharmony_ci if (hstate_kobjs[i] == kobj) { 28148c2ecf20Sopenharmony_ci if (nidp) 28158c2ecf20Sopenharmony_ci *nidp = NUMA_NO_NODE; 28168c2ecf20Sopenharmony_ci return &hstates[i]; 28178c2ecf20Sopenharmony_ci } 28188c2ecf20Sopenharmony_ci 28198c2ecf20Sopenharmony_ci return kobj_to_node_hstate(kobj, nidp); 28208c2ecf20Sopenharmony_ci} 28218c2ecf20Sopenharmony_ci 28228c2ecf20Sopenharmony_cistatic ssize_t nr_hugepages_show_common(struct kobject *kobj, 28238c2ecf20Sopenharmony_ci struct kobj_attribute *attr, char *buf) 28248c2ecf20Sopenharmony_ci{ 28258c2ecf20Sopenharmony_ci struct hstate *h; 28268c2ecf20Sopenharmony_ci unsigned long nr_huge_pages; 28278c2ecf20Sopenharmony_ci int nid; 28288c2ecf20Sopenharmony_ci 28298c2ecf20Sopenharmony_ci h = kobj_to_hstate(kobj, &nid); 28308c2ecf20Sopenharmony_ci if (nid == NUMA_NO_NODE) 28318c2ecf20Sopenharmony_ci nr_huge_pages = h->nr_huge_pages; 28328c2ecf20Sopenharmony_ci else 28338c2ecf20Sopenharmony_ci nr_huge_pages = h->nr_huge_pages_node[nid]; 28348c2ecf20Sopenharmony_ci 28358c2ecf20Sopenharmony_ci return sprintf(buf, "%lu\n", nr_huge_pages); 28368c2ecf20Sopenharmony_ci} 28378c2ecf20Sopenharmony_ci 28388c2ecf20Sopenharmony_cistatic ssize_t __nr_hugepages_store_common(bool obey_mempolicy, 28398c2ecf20Sopenharmony_ci struct hstate *h, int nid, 28408c2ecf20Sopenharmony_ci unsigned long count, size_t len) 28418c2ecf20Sopenharmony_ci{ 28428c2ecf20Sopenharmony_ci int err; 28438c2ecf20Sopenharmony_ci nodemask_t nodes_allowed, *n_mask; 28448c2ecf20Sopenharmony_ci 28458c2ecf20Sopenharmony_ci if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 28468c2ecf20Sopenharmony_ci return -EINVAL; 28478c2ecf20Sopenharmony_ci 28488c2ecf20Sopenharmony_ci if (nid == NUMA_NO_NODE) { 28498c2ecf20Sopenharmony_ci /* 28508c2ecf20Sopenharmony_ci * global hstate attribute 28518c2ecf20Sopenharmony_ci */ 28528c2ecf20Sopenharmony_ci if (!(obey_mempolicy && 28538c2ecf20Sopenharmony_ci init_nodemask_of_mempolicy(&nodes_allowed))) 28548c2ecf20Sopenharmony_ci n_mask = &node_states[N_MEMORY]; 28558c2ecf20Sopenharmony_ci else 28568c2ecf20Sopenharmony_ci n_mask = &nodes_allowed; 28578c2ecf20Sopenharmony_ci } else { 28588c2ecf20Sopenharmony_ci /* 28598c2ecf20Sopenharmony_ci * Node specific request. count adjustment happens in 28608c2ecf20Sopenharmony_ci * set_max_huge_pages() after acquiring hugetlb_lock. 28618c2ecf20Sopenharmony_ci */ 28628c2ecf20Sopenharmony_ci init_nodemask_of_node(&nodes_allowed, nid); 28638c2ecf20Sopenharmony_ci n_mask = &nodes_allowed; 28648c2ecf20Sopenharmony_ci } 28658c2ecf20Sopenharmony_ci 28668c2ecf20Sopenharmony_ci err = set_max_huge_pages(h, count, nid, n_mask); 28678c2ecf20Sopenharmony_ci 28688c2ecf20Sopenharmony_ci return err ? err : len; 28698c2ecf20Sopenharmony_ci} 28708c2ecf20Sopenharmony_ci 28718c2ecf20Sopenharmony_cistatic ssize_t nr_hugepages_store_common(bool obey_mempolicy, 28728c2ecf20Sopenharmony_ci struct kobject *kobj, const char *buf, 28738c2ecf20Sopenharmony_ci size_t len) 28748c2ecf20Sopenharmony_ci{ 28758c2ecf20Sopenharmony_ci struct hstate *h; 28768c2ecf20Sopenharmony_ci unsigned long count; 28778c2ecf20Sopenharmony_ci int nid; 28788c2ecf20Sopenharmony_ci int err; 28798c2ecf20Sopenharmony_ci 28808c2ecf20Sopenharmony_ci err = kstrtoul(buf, 10, &count); 28818c2ecf20Sopenharmony_ci if (err) 28828c2ecf20Sopenharmony_ci return err; 28838c2ecf20Sopenharmony_ci 28848c2ecf20Sopenharmony_ci h = kobj_to_hstate(kobj, &nid); 28858c2ecf20Sopenharmony_ci return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); 28868c2ecf20Sopenharmony_ci} 28878c2ecf20Sopenharmony_ci 28888c2ecf20Sopenharmony_cistatic ssize_t nr_hugepages_show(struct kobject *kobj, 28898c2ecf20Sopenharmony_ci struct kobj_attribute *attr, char *buf) 28908c2ecf20Sopenharmony_ci{ 28918c2ecf20Sopenharmony_ci return nr_hugepages_show_common(kobj, attr, buf); 28928c2ecf20Sopenharmony_ci} 28938c2ecf20Sopenharmony_ci 28948c2ecf20Sopenharmony_cistatic ssize_t nr_hugepages_store(struct kobject *kobj, 28958c2ecf20Sopenharmony_ci struct kobj_attribute *attr, const char *buf, size_t len) 28968c2ecf20Sopenharmony_ci{ 28978c2ecf20Sopenharmony_ci return nr_hugepages_store_common(false, kobj, buf, len); 28988c2ecf20Sopenharmony_ci} 28998c2ecf20Sopenharmony_ciHSTATE_ATTR(nr_hugepages); 29008c2ecf20Sopenharmony_ci 29018c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA 29028c2ecf20Sopenharmony_ci 29038c2ecf20Sopenharmony_ci/* 29048c2ecf20Sopenharmony_ci * hstate attribute for optionally mempolicy-based constraint on persistent 29058c2ecf20Sopenharmony_ci * huge page alloc/free. 29068c2ecf20Sopenharmony_ci */ 29078c2ecf20Sopenharmony_cistatic ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, 29088c2ecf20Sopenharmony_ci struct kobj_attribute *attr, char *buf) 29098c2ecf20Sopenharmony_ci{ 29108c2ecf20Sopenharmony_ci return nr_hugepages_show_common(kobj, attr, buf); 29118c2ecf20Sopenharmony_ci} 29128c2ecf20Sopenharmony_ci 29138c2ecf20Sopenharmony_cistatic ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, 29148c2ecf20Sopenharmony_ci struct kobj_attribute *attr, const char *buf, size_t len) 29158c2ecf20Sopenharmony_ci{ 29168c2ecf20Sopenharmony_ci return nr_hugepages_store_common(true, kobj, buf, len); 29178c2ecf20Sopenharmony_ci} 29188c2ecf20Sopenharmony_ciHSTATE_ATTR(nr_hugepages_mempolicy); 29198c2ecf20Sopenharmony_ci#endif 29208c2ecf20Sopenharmony_ci 29218c2ecf20Sopenharmony_ci 29228c2ecf20Sopenharmony_cistatic ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 29238c2ecf20Sopenharmony_ci struct kobj_attribute *attr, char *buf) 29248c2ecf20Sopenharmony_ci{ 29258c2ecf20Sopenharmony_ci struct hstate *h = kobj_to_hstate(kobj, NULL); 29268c2ecf20Sopenharmony_ci return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 29278c2ecf20Sopenharmony_ci} 29288c2ecf20Sopenharmony_ci 29298c2ecf20Sopenharmony_cistatic ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 29308c2ecf20Sopenharmony_ci struct kobj_attribute *attr, const char *buf, size_t count) 29318c2ecf20Sopenharmony_ci{ 29328c2ecf20Sopenharmony_ci int err; 29338c2ecf20Sopenharmony_ci unsigned long input; 29348c2ecf20Sopenharmony_ci struct hstate *h = kobj_to_hstate(kobj, NULL); 29358c2ecf20Sopenharmony_ci 29368c2ecf20Sopenharmony_ci if (hstate_is_gigantic(h)) 29378c2ecf20Sopenharmony_ci return -EINVAL; 29388c2ecf20Sopenharmony_ci 29398c2ecf20Sopenharmony_ci err = kstrtoul(buf, 10, &input); 29408c2ecf20Sopenharmony_ci if (err) 29418c2ecf20Sopenharmony_ci return err; 29428c2ecf20Sopenharmony_ci 29438c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 29448c2ecf20Sopenharmony_ci h->nr_overcommit_huge_pages = input; 29458c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 29468c2ecf20Sopenharmony_ci 29478c2ecf20Sopenharmony_ci return count; 29488c2ecf20Sopenharmony_ci} 29498c2ecf20Sopenharmony_ciHSTATE_ATTR(nr_overcommit_hugepages); 29508c2ecf20Sopenharmony_ci 29518c2ecf20Sopenharmony_cistatic ssize_t free_hugepages_show(struct kobject *kobj, 29528c2ecf20Sopenharmony_ci struct kobj_attribute *attr, char *buf) 29538c2ecf20Sopenharmony_ci{ 29548c2ecf20Sopenharmony_ci struct hstate *h; 29558c2ecf20Sopenharmony_ci unsigned long free_huge_pages; 29568c2ecf20Sopenharmony_ci int nid; 29578c2ecf20Sopenharmony_ci 29588c2ecf20Sopenharmony_ci h = kobj_to_hstate(kobj, &nid); 29598c2ecf20Sopenharmony_ci if (nid == NUMA_NO_NODE) 29608c2ecf20Sopenharmony_ci free_huge_pages = h->free_huge_pages; 29618c2ecf20Sopenharmony_ci else 29628c2ecf20Sopenharmony_ci free_huge_pages = h->free_huge_pages_node[nid]; 29638c2ecf20Sopenharmony_ci 29648c2ecf20Sopenharmony_ci return sprintf(buf, "%lu\n", free_huge_pages); 29658c2ecf20Sopenharmony_ci} 29668c2ecf20Sopenharmony_ciHSTATE_ATTR_RO(free_hugepages); 29678c2ecf20Sopenharmony_ci 29688c2ecf20Sopenharmony_cistatic ssize_t resv_hugepages_show(struct kobject *kobj, 29698c2ecf20Sopenharmony_ci struct kobj_attribute *attr, char *buf) 29708c2ecf20Sopenharmony_ci{ 29718c2ecf20Sopenharmony_ci struct hstate *h = kobj_to_hstate(kobj, NULL); 29728c2ecf20Sopenharmony_ci return sprintf(buf, "%lu\n", h->resv_huge_pages); 29738c2ecf20Sopenharmony_ci} 29748c2ecf20Sopenharmony_ciHSTATE_ATTR_RO(resv_hugepages); 29758c2ecf20Sopenharmony_ci 29768c2ecf20Sopenharmony_cistatic ssize_t surplus_hugepages_show(struct kobject *kobj, 29778c2ecf20Sopenharmony_ci struct kobj_attribute *attr, char *buf) 29788c2ecf20Sopenharmony_ci{ 29798c2ecf20Sopenharmony_ci struct hstate *h; 29808c2ecf20Sopenharmony_ci unsigned long surplus_huge_pages; 29818c2ecf20Sopenharmony_ci int nid; 29828c2ecf20Sopenharmony_ci 29838c2ecf20Sopenharmony_ci h = kobj_to_hstate(kobj, &nid); 29848c2ecf20Sopenharmony_ci if (nid == NUMA_NO_NODE) 29858c2ecf20Sopenharmony_ci surplus_huge_pages = h->surplus_huge_pages; 29868c2ecf20Sopenharmony_ci else 29878c2ecf20Sopenharmony_ci surplus_huge_pages = h->surplus_huge_pages_node[nid]; 29888c2ecf20Sopenharmony_ci 29898c2ecf20Sopenharmony_ci return sprintf(buf, "%lu\n", surplus_huge_pages); 29908c2ecf20Sopenharmony_ci} 29918c2ecf20Sopenharmony_ciHSTATE_ATTR_RO(surplus_hugepages); 29928c2ecf20Sopenharmony_ci 29938c2ecf20Sopenharmony_cistatic struct attribute *hstate_attrs[] = { 29948c2ecf20Sopenharmony_ci &nr_hugepages_attr.attr, 29958c2ecf20Sopenharmony_ci &nr_overcommit_hugepages_attr.attr, 29968c2ecf20Sopenharmony_ci &free_hugepages_attr.attr, 29978c2ecf20Sopenharmony_ci &resv_hugepages_attr.attr, 29988c2ecf20Sopenharmony_ci &surplus_hugepages_attr.attr, 29998c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA 30008c2ecf20Sopenharmony_ci &nr_hugepages_mempolicy_attr.attr, 30018c2ecf20Sopenharmony_ci#endif 30028c2ecf20Sopenharmony_ci NULL, 30038c2ecf20Sopenharmony_ci}; 30048c2ecf20Sopenharmony_ci 30058c2ecf20Sopenharmony_cistatic const struct attribute_group hstate_attr_group = { 30068c2ecf20Sopenharmony_ci .attrs = hstate_attrs, 30078c2ecf20Sopenharmony_ci}; 30088c2ecf20Sopenharmony_ci 30098c2ecf20Sopenharmony_cistatic int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 30108c2ecf20Sopenharmony_ci struct kobject **hstate_kobjs, 30118c2ecf20Sopenharmony_ci const struct attribute_group *hstate_attr_group) 30128c2ecf20Sopenharmony_ci{ 30138c2ecf20Sopenharmony_ci int retval; 30148c2ecf20Sopenharmony_ci int hi = hstate_index(h); 30158c2ecf20Sopenharmony_ci 30168c2ecf20Sopenharmony_ci hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 30178c2ecf20Sopenharmony_ci if (!hstate_kobjs[hi]) 30188c2ecf20Sopenharmony_ci return -ENOMEM; 30198c2ecf20Sopenharmony_ci 30208c2ecf20Sopenharmony_ci retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); 30218c2ecf20Sopenharmony_ci if (retval) { 30228c2ecf20Sopenharmony_ci kobject_put(hstate_kobjs[hi]); 30238c2ecf20Sopenharmony_ci hstate_kobjs[hi] = NULL; 30248c2ecf20Sopenharmony_ci } 30258c2ecf20Sopenharmony_ci 30268c2ecf20Sopenharmony_ci return retval; 30278c2ecf20Sopenharmony_ci} 30288c2ecf20Sopenharmony_ci 30298c2ecf20Sopenharmony_cistatic void __init hugetlb_sysfs_init(void) 30308c2ecf20Sopenharmony_ci{ 30318c2ecf20Sopenharmony_ci struct hstate *h; 30328c2ecf20Sopenharmony_ci int err; 30338c2ecf20Sopenharmony_ci 30348c2ecf20Sopenharmony_ci hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); 30358c2ecf20Sopenharmony_ci if (!hugepages_kobj) 30368c2ecf20Sopenharmony_ci return; 30378c2ecf20Sopenharmony_ci 30388c2ecf20Sopenharmony_ci for_each_hstate(h) { 30398c2ecf20Sopenharmony_ci err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 30408c2ecf20Sopenharmony_ci hstate_kobjs, &hstate_attr_group); 30418c2ecf20Sopenharmony_ci if (err) 30428c2ecf20Sopenharmony_ci pr_err("HugeTLB: Unable to add hstate %s", h->name); 30438c2ecf20Sopenharmony_ci } 30448c2ecf20Sopenharmony_ci} 30458c2ecf20Sopenharmony_ci 30468c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA 30478c2ecf20Sopenharmony_ci 30488c2ecf20Sopenharmony_ci/* 30498c2ecf20Sopenharmony_ci * node_hstate/s - associate per node hstate attributes, via their kobjects, 30508c2ecf20Sopenharmony_ci * with node devices in node_devices[] using a parallel array. The array 30518c2ecf20Sopenharmony_ci * index of a node device or _hstate == node id. 30528c2ecf20Sopenharmony_ci * This is here to avoid any static dependency of the node device driver, in 30538c2ecf20Sopenharmony_ci * the base kernel, on the hugetlb module. 30548c2ecf20Sopenharmony_ci */ 30558c2ecf20Sopenharmony_cistruct node_hstate { 30568c2ecf20Sopenharmony_ci struct kobject *hugepages_kobj; 30578c2ecf20Sopenharmony_ci struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 30588c2ecf20Sopenharmony_ci}; 30598c2ecf20Sopenharmony_cistatic struct node_hstate node_hstates[MAX_NUMNODES]; 30608c2ecf20Sopenharmony_ci 30618c2ecf20Sopenharmony_ci/* 30628c2ecf20Sopenharmony_ci * A subset of global hstate attributes for node devices 30638c2ecf20Sopenharmony_ci */ 30648c2ecf20Sopenharmony_cistatic struct attribute *per_node_hstate_attrs[] = { 30658c2ecf20Sopenharmony_ci &nr_hugepages_attr.attr, 30668c2ecf20Sopenharmony_ci &free_hugepages_attr.attr, 30678c2ecf20Sopenharmony_ci &surplus_hugepages_attr.attr, 30688c2ecf20Sopenharmony_ci NULL, 30698c2ecf20Sopenharmony_ci}; 30708c2ecf20Sopenharmony_ci 30718c2ecf20Sopenharmony_cistatic const struct attribute_group per_node_hstate_attr_group = { 30728c2ecf20Sopenharmony_ci .attrs = per_node_hstate_attrs, 30738c2ecf20Sopenharmony_ci}; 30748c2ecf20Sopenharmony_ci 30758c2ecf20Sopenharmony_ci/* 30768c2ecf20Sopenharmony_ci * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. 30778c2ecf20Sopenharmony_ci * Returns node id via non-NULL nidp. 30788c2ecf20Sopenharmony_ci */ 30798c2ecf20Sopenharmony_cistatic struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 30808c2ecf20Sopenharmony_ci{ 30818c2ecf20Sopenharmony_ci int nid; 30828c2ecf20Sopenharmony_ci 30838c2ecf20Sopenharmony_ci for (nid = 0; nid < nr_node_ids; nid++) { 30848c2ecf20Sopenharmony_ci struct node_hstate *nhs = &node_hstates[nid]; 30858c2ecf20Sopenharmony_ci int i; 30868c2ecf20Sopenharmony_ci for (i = 0; i < HUGE_MAX_HSTATE; i++) 30878c2ecf20Sopenharmony_ci if (nhs->hstate_kobjs[i] == kobj) { 30888c2ecf20Sopenharmony_ci if (nidp) 30898c2ecf20Sopenharmony_ci *nidp = nid; 30908c2ecf20Sopenharmony_ci return &hstates[i]; 30918c2ecf20Sopenharmony_ci } 30928c2ecf20Sopenharmony_ci } 30938c2ecf20Sopenharmony_ci 30948c2ecf20Sopenharmony_ci BUG(); 30958c2ecf20Sopenharmony_ci return NULL; 30968c2ecf20Sopenharmony_ci} 30978c2ecf20Sopenharmony_ci 30988c2ecf20Sopenharmony_ci/* 30998c2ecf20Sopenharmony_ci * Unregister hstate attributes from a single node device. 31008c2ecf20Sopenharmony_ci * No-op if no hstate attributes attached. 31018c2ecf20Sopenharmony_ci */ 31028c2ecf20Sopenharmony_cistatic void hugetlb_unregister_node(struct node *node) 31038c2ecf20Sopenharmony_ci{ 31048c2ecf20Sopenharmony_ci struct hstate *h; 31058c2ecf20Sopenharmony_ci struct node_hstate *nhs = &node_hstates[node->dev.id]; 31068c2ecf20Sopenharmony_ci 31078c2ecf20Sopenharmony_ci if (!nhs->hugepages_kobj) 31088c2ecf20Sopenharmony_ci return; /* no hstate attributes */ 31098c2ecf20Sopenharmony_ci 31108c2ecf20Sopenharmony_ci for_each_hstate(h) { 31118c2ecf20Sopenharmony_ci int idx = hstate_index(h); 31128c2ecf20Sopenharmony_ci if (nhs->hstate_kobjs[idx]) { 31138c2ecf20Sopenharmony_ci kobject_put(nhs->hstate_kobjs[idx]); 31148c2ecf20Sopenharmony_ci nhs->hstate_kobjs[idx] = NULL; 31158c2ecf20Sopenharmony_ci } 31168c2ecf20Sopenharmony_ci } 31178c2ecf20Sopenharmony_ci 31188c2ecf20Sopenharmony_ci kobject_put(nhs->hugepages_kobj); 31198c2ecf20Sopenharmony_ci nhs->hugepages_kobj = NULL; 31208c2ecf20Sopenharmony_ci} 31218c2ecf20Sopenharmony_ci 31228c2ecf20Sopenharmony_ci 31238c2ecf20Sopenharmony_ci/* 31248c2ecf20Sopenharmony_ci * Register hstate attributes for a single node device. 31258c2ecf20Sopenharmony_ci * No-op if attributes already registered. 31268c2ecf20Sopenharmony_ci */ 31278c2ecf20Sopenharmony_cistatic void hugetlb_register_node(struct node *node) 31288c2ecf20Sopenharmony_ci{ 31298c2ecf20Sopenharmony_ci struct hstate *h; 31308c2ecf20Sopenharmony_ci struct node_hstate *nhs = &node_hstates[node->dev.id]; 31318c2ecf20Sopenharmony_ci int err; 31328c2ecf20Sopenharmony_ci 31338c2ecf20Sopenharmony_ci if (nhs->hugepages_kobj) 31348c2ecf20Sopenharmony_ci return; /* already allocated */ 31358c2ecf20Sopenharmony_ci 31368c2ecf20Sopenharmony_ci nhs->hugepages_kobj = kobject_create_and_add("hugepages", 31378c2ecf20Sopenharmony_ci &node->dev.kobj); 31388c2ecf20Sopenharmony_ci if (!nhs->hugepages_kobj) 31398c2ecf20Sopenharmony_ci return; 31408c2ecf20Sopenharmony_ci 31418c2ecf20Sopenharmony_ci for_each_hstate(h) { 31428c2ecf20Sopenharmony_ci err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, 31438c2ecf20Sopenharmony_ci nhs->hstate_kobjs, 31448c2ecf20Sopenharmony_ci &per_node_hstate_attr_group); 31458c2ecf20Sopenharmony_ci if (err) { 31468c2ecf20Sopenharmony_ci pr_err("HugeTLB: Unable to add hstate %s for node %d\n", 31478c2ecf20Sopenharmony_ci h->name, node->dev.id); 31488c2ecf20Sopenharmony_ci hugetlb_unregister_node(node); 31498c2ecf20Sopenharmony_ci break; 31508c2ecf20Sopenharmony_ci } 31518c2ecf20Sopenharmony_ci } 31528c2ecf20Sopenharmony_ci} 31538c2ecf20Sopenharmony_ci 31548c2ecf20Sopenharmony_ci/* 31558c2ecf20Sopenharmony_ci * hugetlb init time: register hstate attributes for all registered node 31568c2ecf20Sopenharmony_ci * devices of nodes that have memory. All on-line nodes should have 31578c2ecf20Sopenharmony_ci * registered their associated device by this time. 31588c2ecf20Sopenharmony_ci */ 31598c2ecf20Sopenharmony_cistatic void __init hugetlb_register_all_nodes(void) 31608c2ecf20Sopenharmony_ci{ 31618c2ecf20Sopenharmony_ci int nid; 31628c2ecf20Sopenharmony_ci 31638c2ecf20Sopenharmony_ci for_each_node_state(nid, N_MEMORY) { 31648c2ecf20Sopenharmony_ci struct node *node = node_devices[nid]; 31658c2ecf20Sopenharmony_ci if (node->dev.id == nid) 31668c2ecf20Sopenharmony_ci hugetlb_register_node(node); 31678c2ecf20Sopenharmony_ci } 31688c2ecf20Sopenharmony_ci 31698c2ecf20Sopenharmony_ci /* 31708c2ecf20Sopenharmony_ci * Let the node device driver know we're here so it can 31718c2ecf20Sopenharmony_ci * [un]register hstate attributes on node hotplug. 31728c2ecf20Sopenharmony_ci */ 31738c2ecf20Sopenharmony_ci register_hugetlbfs_with_node(hugetlb_register_node, 31748c2ecf20Sopenharmony_ci hugetlb_unregister_node); 31758c2ecf20Sopenharmony_ci} 31768c2ecf20Sopenharmony_ci#else /* !CONFIG_NUMA */ 31778c2ecf20Sopenharmony_ci 31788c2ecf20Sopenharmony_cistatic struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 31798c2ecf20Sopenharmony_ci{ 31808c2ecf20Sopenharmony_ci BUG(); 31818c2ecf20Sopenharmony_ci if (nidp) 31828c2ecf20Sopenharmony_ci *nidp = -1; 31838c2ecf20Sopenharmony_ci return NULL; 31848c2ecf20Sopenharmony_ci} 31858c2ecf20Sopenharmony_ci 31868c2ecf20Sopenharmony_cistatic void hugetlb_register_all_nodes(void) { } 31878c2ecf20Sopenharmony_ci 31888c2ecf20Sopenharmony_ci#endif 31898c2ecf20Sopenharmony_ci 31908c2ecf20Sopenharmony_cistatic int __init hugetlb_init(void) 31918c2ecf20Sopenharmony_ci{ 31928c2ecf20Sopenharmony_ci int i; 31938c2ecf20Sopenharmony_ci 31948c2ecf20Sopenharmony_ci if (!hugepages_supported()) { 31958c2ecf20Sopenharmony_ci if (hugetlb_max_hstate || default_hstate_max_huge_pages) 31968c2ecf20Sopenharmony_ci pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); 31978c2ecf20Sopenharmony_ci return 0; 31988c2ecf20Sopenharmony_ci } 31998c2ecf20Sopenharmony_ci 32008c2ecf20Sopenharmony_ci /* 32018c2ecf20Sopenharmony_ci * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some 32028c2ecf20Sopenharmony_ci * architectures depend on setup being done here. 32038c2ecf20Sopenharmony_ci */ 32048c2ecf20Sopenharmony_ci hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 32058c2ecf20Sopenharmony_ci if (!parsed_default_hugepagesz) { 32068c2ecf20Sopenharmony_ci /* 32078c2ecf20Sopenharmony_ci * If we did not parse a default huge page size, set 32088c2ecf20Sopenharmony_ci * default_hstate_idx to HPAGE_SIZE hstate. And, if the 32098c2ecf20Sopenharmony_ci * number of huge pages for this default size was implicitly 32108c2ecf20Sopenharmony_ci * specified, set that here as well. 32118c2ecf20Sopenharmony_ci * Note that the implicit setting will overwrite an explicit 32128c2ecf20Sopenharmony_ci * setting. A warning will be printed in this case. 32138c2ecf20Sopenharmony_ci */ 32148c2ecf20Sopenharmony_ci default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); 32158c2ecf20Sopenharmony_ci if (default_hstate_max_huge_pages) { 32168c2ecf20Sopenharmony_ci if (default_hstate.max_huge_pages) { 32178c2ecf20Sopenharmony_ci char buf[32]; 32188c2ecf20Sopenharmony_ci 32198c2ecf20Sopenharmony_ci string_get_size(huge_page_size(&default_hstate), 32208c2ecf20Sopenharmony_ci 1, STRING_UNITS_2, buf, 32); 32218c2ecf20Sopenharmony_ci pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", 32228c2ecf20Sopenharmony_ci default_hstate.max_huge_pages, buf); 32238c2ecf20Sopenharmony_ci pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", 32248c2ecf20Sopenharmony_ci default_hstate_max_huge_pages); 32258c2ecf20Sopenharmony_ci } 32268c2ecf20Sopenharmony_ci default_hstate.max_huge_pages = 32278c2ecf20Sopenharmony_ci default_hstate_max_huge_pages; 32288c2ecf20Sopenharmony_ci } 32298c2ecf20Sopenharmony_ci } 32308c2ecf20Sopenharmony_ci 32318c2ecf20Sopenharmony_ci hugetlb_cma_check(); 32328c2ecf20Sopenharmony_ci hugetlb_init_hstates(); 32338c2ecf20Sopenharmony_ci gather_bootmem_prealloc(); 32348c2ecf20Sopenharmony_ci report_hugepages(); 32358c2ecf20Sopenharmony_ci 32368c2ecf20Sopenharmony_ci hugetlb_sysfs_init(); 32378c2ecf20Sopenharmony_ci hugetlb_register_all_nodes(); 32388c2ecf20Sopenharmony_ci hugetlb_cgroup_file_init(); 32398c2ecf20Sopenharmony_ci 32408c2ecf20Sopenharmony_ci#ifdef CONFIG_SMP 32418c2ecf20Sopenharmony_ci num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); 32428c2ecf20Sopenharmony_ci#else 32438c2ecf20Sopenharmony_ci num_fault_mutexes = 1; 32448c2ecf20Sopenharmony_ci#endif 32458c2ecf20Sopenharmony_ci hugetlb_fault_mutex_table = 32468c2ecf20Sopenharmony_ci kmalloc_array(num_fault_mutexes, sizeof(struct mutex), 32478c2ecf20Sopenharmony_ci GFP_KERNEL); 32488c2ecf20Sopenharmony_ci BUG_ON(!hugetlb_fault_mutex_table); 32498c2ecf20Sopenharmony_ci 32508c2ecf20Sopenharmony_ci for (i = 0; i < num_fault_mutexes; i++) 32518c2ecf20Sopenharmony_ci mutex_init(&hugetlb_fault_mutex_table[i]); 32528c2ecf20Sopenharmony_ci return 0; 32538c2ecf20Sopenharmony_ci} 32548c2ecf20Sopenharmony_cisubsys_initcall(hugetlb_init); 32558c2ecf20Sopenharmony_ci 32568c2ecf20Sopenharmony_ci/* Overwritten by architectures with more huge page sizes */ 32578c2ecf20Sopenharmony_cibool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) 32588c2ecf20Sopenharmony_ci{ 32598c2ecf20Sopenharmony_ci return size == HPAGE_SIZE; 32608c2ecf20Sopenharmony_ci} 32618c2ecf20Sopenharmony_ci 32628c2ecf20Sopenharmony_civoid __init hugetlb_add_hstate(unsigned int order) 32638c2ecf20Sopenharmony_ci{ 32648c2ecf20Sopenharmony_ci struct hstate *h; 32658c2ecf20Sopenharmony_ci unsigned long i; 32668c2ecf20Sopenharmony_ci 32678c2ecf20Sopenharmony_ci if (size_to_hstate(PAGE_SIZE << order)) { 32688c2ecf20Sopenharmony_ci return; 32698c2ecf20Sopenharmony_ci } 32708c2ecf20Sopenharmony_ci BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); 32718c2ecf20Sopenharmony_ci BUG_ON(order == 0); 32728c2ecf20Sopenharmony_ci h = &hstates[hugetlb_max_hstate++]; 32738c2ecf20Sopenharmony_ci h->order = order; 32748c2ecf20Sopenharmony_ci h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); 32758c2ecf20Sopenharmony_ci h->nr_huge_pages = 0; 32768c2ecf20Sopenharmony_ci h->free_huge_pages = 0; 32778c2ecf20Sopenharmony_ci for (i = 0; i < MAX_NUMNODES; ++i) 32788c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&h->hugepage_freelists[i]); 32798c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&h->hugepage_activelist); 32808c2ecf20Sopenharmony_ci h->next_nid_to_alloc = first_memory_node; 32818c2ecf20Sopenharmony_ci h->next_nid_to_free = first_memory_node; 32828c2ecf20Sopenharmony_ci snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 32838c2ecf20Sopenharmony_ci huge_page_size(h)/1024); 32848c2ecf20Sopenharmony_ci 32858c2ecf20Sopenharmony_ci parsed_hstate = h; 32868c2ecf20Sopenharmony_ci} 32878c2ecf20Sopenharmony_ci 32888c2ecf20Sopenharmony_ci/* 32898c2ecf20Sopenharmony_ci * hugepages command line processing 32908c2ecf20Sopenharmony_ci * hugepages normally follows a valid hugepagsz or default_hugepagsz 32918c2ecf20Sopenharmony_ci * specification. If not, ignore the hugepages value. hugepages can also 32928c2ecf20Sopenharmony_ci * be the first huge page command line option in which case it implicitly 32938c2ecf20Sopenharmony_ci * specifies the number of huge pages for the default size. 32948c2ecf20Sopenharmony_ci */ 32958c2ecf20Sopenharmony_cistatic int __init hugepages_setup(char *s) 32968c2ecf20Sopenharmony_ci{ 32978c2ecf20Sopenharmony_ci unsigned long *mhp; 32988c2ecf20Sopenharmony_ci static unsigned long *last_mhp; 32998c2ecf20Sopenharmony_ci 33008c2ecf20Sopenharmony_ci if (!parsed_valid_hugepagesz) { 33018c2ecf20Sopenharmony_ci pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); 33028c2ecf20Sopenharmony_ci parsed_valid_hugepagesz = true; 33038c2ecf20Sopenharmony_ci return 0; 33048c2ecf20Sopenharmony_ci } 33058c2ecf20Sopenharmony_ci 33068c2ecf20Sopenharmony_ci /* 33078c2ecf20Sopenharmony_ci * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter 33088c2ecf20Sopenharmony_ci * yet, so this hugepages= parameter goes to the "default hstate". 33098c2ecf20Sopenharmony_ci * Otherwise, it goes with the previously parsed hugepagesz or 33108c2ecf20Sopenharmony_ci * default_hugepagesz. 33118c2ecf20Sopenharmony_ci */ 33128c2ecf20Sopenharmony_ci else if (!hugetlb_max_hstate) 33138c2ecf20Sopenharmony_ci mhp = &default_hstate_max_huge_pages; 33148c2ecf20Sopenharmony_ci else 33158c2ecf20Sopenharmony_ci mhp = &parsed_hstate->max_huge_pages; 33168c2ecf20Sopenharmony_ci 33178c2ecf20Sopenharmony_ci if (mhp == last_mhp) { 33188c2ecf20Sopenharmony_ci pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); 33198c2ecf20Sopenharmony_ci return 0; 33208c2ecf20Sopenharmony_ci } 33218c2ecf20Sopenharmony_ci 33228c2ecf20Sopenharmony_ci if (sscanf(s, "%lu", mhp) <= 0) 33238c2ecf20Sopenharmony_ci *mhp = 0; 33248c2ecf20Sopenharmony_ci 33258c2ecf20Sopenharmony_ci /* 33268c2ecf20Sopenharmony_ci * Global state is always initialized later in hugetlb_init. 33278c2ecf20Sopenharmony_ci * But we need to allocate >= MAX_ORDER hstates here early to still 33288c2ecf20Sopenharmony_ci * use the bootmem allocator. 33298c2ecf20Sopenharmony_ci */ 33308c2ecf20Sopenharmony_ci if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) 33318c2ecf20Sopenharmony_ci hugetlb_hstate_alloc_pages(parsed_hstate); 33328c2ecf20Sopenharmony_ci 33338c2ecf20Sopenharmony_ci last_mhp = mhp; 33348c2ecf20Sopenharmony_ci 33358c2ecf20Sopenharmony_ci return 1; 33368c2ecf20Sopenharmony_ci} 33378c2ecf20Sopenharmony_ci__setup("hugepages=", hugepages_setup); 33388c2ecf20Sopenharmony_ci 33398c2ecf20Sopenharmony_ci/* 33408c2ecf20Sopenharmony_ci * hugepagesz command line processing 33418c2ecf20Sopenharmony_ci * A specific huge page size can only be specified once with hugepagesz. 33428c2ecf20Sopenharmony_ci * hugepagesz is followed by hugepages on the command line. The global 33438c2ecf20Sopenharmony_ci * variable 'parsed_valid_hugepagesz' is used to determine if prior 33448c2ecf20Sopenharmony_ci * hugepagesz argument was valid. 33458c2ecf20Sopenharmony_ci */ 33468c2ecf20Sopenharmony_cistatic int __init hugepagesz_setup(char *s) 33478c2ecf20Sopenharmony_ci{ 33488c2ecf20Sopenharmony_ci unsigned long size; 33498c2ecf20Sopenharmony_ci struct hstate *h; 33508c2ecf20Sopenharmony_ci 33518c2ecf20Sopenharmony_ci parsed_valid_hugepagesz = false; 33528c2ecf20Sopenharmony_ci size = (unsigned long)memparse(s, NULL); 33538c2ecf20Sopenharmony_ci 33548c2ecf20Sopenharmony_ci if (!arch_hugetlb_valid_size(size)) { 33558c2ecf20Sopenharmony_ci pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); 33568c2ecf20Sopenharmony_ci return 0; 33578c2ecf20Sopenharmony_ci } 33588c2ecf20Sopenharmony_ci 33598c2ecf20Sopenharmony_ci h = size_to_hstate(size); 33608c2ecf20Sopenharmony_ci if (h) { 33618c2ecf20Sopenharmony_ci /* 33628c2ecf20Sopenharmony_ci * hstate for this size already exists. This is normally 33638c2ecf20Sopenharmony_ci * an error, but is allowed if the existing hstate is the 33648c2ecf20Sopenharmony_ci * default hstate. More specifically, it is only allowed if 33658c2ecf20Sopenharmony_ci * the number of huge pages for the default hstate was not 33668c2ecf20Sopenharmony_ci * previously specified. 33678c2ecf20Sopenharmony_ci */ 33688c2ecf20Sopenharmony_ci if (!parsed_default_hugepagesz || h != &default_hstate || 33698c2ecf20Sopenharmony_ci default_hstate.max_huge_pages) { 33708c2ecf20Sopenharmony_ci pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); 33718c2ecf20Sopenharmony_ci return 0; 33728c2ecf20Sopenharmony_ci } 33738c2ecf20Sopenharmony_ci 33748c2ecf20Sopenharmony_ci /* 33758c2ecf20Sopenharmony_ci * No need to call hugetlb_add_hstate() as hstate already 33768c2ecf20Sopenharmony_ci * exists. But, do set parsed_hstate so that a following 33778c2ecf20Sopenharmony_ci * hugepages= parameter will be applied to this hstate. 33788c2ecf20Sopenharmony_ci */ 33798c2ecf20Sopenharmony_ci parsed_hstate = h; 33808c2ecf20Sopenharmony_ci parsed_valid_hugepagesz = true; 33818c2ecf20Sopenharmony_ci return 1; 33828c2ecf20Sopenharmony_ci } 33838c2ecf20Sopenharmony_ci 33848c2ecf20Sopenharmony_ci hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 33858c2ecf20Sopenharmony_ci parsed_valid_hugepagesz = true; 33868c2ecf20Sopenharmony_ci return 1; 33878c2ecf20Sopenharmony_ci} 33888c2ecf20Sopenharmony_ci__setup("hugepagesz=", hugepagesz_setup); 33898c2ecf20Sopenharmony_ci 33908c2ecf20Sopenharmony_ci/* 33918c2ecf20Sopenharmony_ci * default_hugepagesz command line input 33928c2ecf20Sopenharmony_ci * Only one instance of default_hugepagesz allowed on command line. 33938c2ecf20Sopenharmony_ci */ 33948c2ecf20Sopenharmony_cistatic int __init default_hugepagesz_setup(char *s) 33958c2ecf20Sopenharmony_ci{ 33968c2ecf20Sopenharmony_ci unsigned long size; 33978c2ecf20Sopenharmony_ci 33988c2ecf20Sopenharmony_ci parsed_valid_hugepagesz = false; 33998c2ecf20Sopenharmony_ci if (parsed_default_hugepagesz) { 34008c2ecf20Sopenharmony_ci pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); 34018c2ecf20Sopenharmony_ci return 0; 34028c2ecf20Sopenharmony_ci } 34038c2ecf20Sopenharmony_ci 34048c2ecf20Sopenharmony_ci size = (unsigned long)memparse(s, NULL); 34058c2ecf20Sopenharmony_ci 34068c2ecf20Sopenharmony_ci if (!arch_hugetlb_valid_size(size)) { 34078c2ecf20Sopenharmony_ci pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); 34088c2ecf20Sopenharmony_ci return 0; 34098c2ecf20Sopenharmony_ci } 34108c2ecf20Sopenharmony_ci 34118c2ecf20Sopenharmony_ci hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); 34128c2ecf20Sopenharmony_ci parsed_valid_hugepagesz = true; 34138c2ecf20Sopenharmony_ci parsed_default_hugepagesz = true; 34148c2ecf20Sopenharmony_ci default_hstate_idx = hstate_index(size_to_hstate(size)); 34158c2ecf20Sopenharmony_ci 34168c2ecf20Sopenharmony_ci /* 34178c2ecf20Sopenharmony_ci * The number of default huge pages (for this size) could have been 34188c2ecf20Sopenharmony_ci * specified as the first hugetlb parameter: hugepages=X. If so, 34198c2ecf20Sopenharmony_ci * then default_hstate_max_huge_pages is set. If the default huge 34208c2ecf20Sopenharmony_ci * page size is gigantic (>= MAX_ORDER), then the pages must be 34218c2ecf20Sopenharmony_ci * allocated here from bootmem allocator. 34228c2ecf20Sopenharmony_ci */ 34238c2ecf20Sopenharmony_ci if (default_hstate_max_huge_pages) { 34248c2ecf20Sopenharmony_ci default_hstate.max_huge_pages = default_hstate_max_huge_pages; 34258c2ecf20Sopenharmony_ci if (hstate_is_gigantic(&default_hstate)) 34268c2ecf20Sopenharmony_ci hugetlb_hstate_alloc_pages(&default_hstate); 34278c2ecf20Sopenharmony_ci default_hstate_max_huge_pages = 0; 34288c2ecf20Sopenharmony_ci } 34298c2ecf20Sopenharmony_ci 34308c2ecf20Sopenharmony_ci return 1; 34318c2ecf20Sopenharmony_ci} 34328c2ecf20Sopenharmony_ci__setup("default_hugepagesz=", default_hugepagesz_setup); 34338c2ecf20Sopenharmony_ci 34348c2ecf20Sopenharmony_cistatic unsigned int allowed_mems_nr(struct hstate *h) 34358c2ecf20Sopenharmony_ci{ 34368c2ecf20Sopenharmony_ci int node; 34378c2ecf20Sopenharmony_ci unsigned int nr = 0; 34388c2ecf20Sopenharmony_ci nodemask_t *mpol_allowed; 34398c2ecf20Sopenharmony_ci unsigned int *array = h->free_huge_pages_node; 34408c2ecf20Sopenharmony_ci gfp_t gfp_mask = htlb_alloc_mask(h); 34418c2ecf20Sopenharmony_ci 34428c2ecf20Sopenharmony_ci mpol_allowed = policy_nodemask_current(gfp_mask); 34438c2ecf20Sopenharmony_ci 34448c2ecf20Sopenharmony_ci for_each_node_mask(node, cpuset_current_mems_allowed) { 34458c2ecf20Sopenharmony_ci if (!mpol_allowed || 34468c2ecf20Sopenharmony_ci (mpol_allowed && node_isset(node, *mpol_allowed))) 34478c2ecf20Sopenharmony_ci nr += array[node]; 34488c2ecf20Sopenharmony_ci } 34498c2ecf20Sopenharmony_ci 34508c2ecf20Sopenharmony_ci return nr; 34518c2ecf20Sopenharmony_ci} 34528c2ecf20Sopenharmony_ci 34538c2ecf20Sopenharmony_ci#ifdef CONFIG_SYSCTL 34548c2ecf20Sopenharmony_cistatic int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, 34558c2ecf20Sopenharmony_ci void *buffer, size_t *length, 34568c2ecf20Sopenharmony_ci loff_t *ppos, unsigned long *out) 34578c2ecf20Sopenharmony_ci{ 34588c2ecf20Sopenharmony_ci struct ctl_table dup_table; 34598c2ecf20Sopenharmony_ci 34608c2ecf20Sopenharmony_ci /* 34618c2ecf20Sopenharmony_ci * In order to avoid races with __do_proc_doulongvec_minmax(), we 34628c2ecf20Sopenharmony_ci * can duplicate the @table and alter the duplicate of it. 34638c2ecf20Sopenharmony_ci */ 34648c2ecf20Sopenharmony_ci dup_table = *table; 34658c2ecf20Sopenharmony_ci dup_table.data = out; 34668c2ecf20Sopenharmony_ci 34678c2ecf20Sopenharmony_ci return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); 34688c2ecf20Sopenharmony_ci} 34698c2ecf20Sopenharmony_ci 34708c2ecf20Sopenharmony_cistatic int hugetlb_sysctl_handler_common(bool obey_mempolicy, 34718c2ecf20Sopenharmony_ci struct ctl_table *table, int write, 34728c2ecf20Sopenharmony_ci void *buffer, size_t *length, loff_t *ppos) 34738c2ecf20Sopenharmony_ci{ 34748c2ecf20Sopenharmony_ci struct hstate *h = &default_hstate; 34758c2ecf20Sopenharmony_ci unsigned long tmp = h->max_huge_pages; 34768c2ecf20Sopenharmony_ci int ret; 34778c2ecf20Sopenharmony_ci 34788c2ecf20Sopenharmony_ci if (!hugepages_supported()) 34798c2ecf20Sopenharmony_ci return -EOPNOTSUPP; 34808c2ecf20Sopenharmony_ci 34818c2ecf20Sopenharmony_ci ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, 34828c2ecf20Sopenharmony_ci &tmp); 34838c2ecf20Sopenharmony_ci if (ret) 34848c2ecf20Sopenharmony_ci goto out; 34858c2ecf20Sopenharmony_ci 34868c2ecf20Sopenharmony_ci if (write) 34878c2ecf20Sopenharmony_ci ret = __nr_hugepages_store_common(obey_mempolicy, h, 34888c2ecf20Sopenharmony_ci NUMA_NO_NODE, tmp, *length); 34898c2ecf20Sopenharmony_ciout: 34908c2ecf20Sopenharmony_ci return ret; 34918c2ecf20Sopenharmony_ci} 34928c2ecf20Sopenharmony_ci 34938c2ecf20Sopenharmony_ciint hugetlb_sysctl_handler(struct ctl_table *table, int write, 34948c2ecf20Sopenharmony_ci void *buffer, size_t *length, loff_t *ppos) 34958c2ecf20Sopenharmony_ci{ 34968c2ecf20Sopenharmony_ci 34978c2ecf20Sopenharmony_ci return hugetlb_sysctl_handler_common(false, table, write, 34988c2ecf20Sopenharmony_ci buffer, length, ppos); 34998c2ecf20Sopenharmony_ci} 35008c2ecf20Sopenharmony_ci 35018c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA 35028c2ecf20Sopenharmony_ciint hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, 35038c2ecf20Sopenharmony_ci void *buffer, size_t *length, loff_t *ppos) 35048c2ecf20Sopenharmony_ci{ 35058c2ecf20Sopenharmony_ci return hugetlb_sysctl_handler_common(true, table, write, 35068c2ecf20Sopenharmony_ci buffer, length, ppos); 35078c2ecf20Sopenharmony_ci} 35088c2ecf20Sopenharmony_ci#endif /* CONFIG_NUMA */ 35098c2ecf20Sopenharmony_ci 35108c2ecf20Sopenharmony_ciint hugetlb_overcommit_handler(struct ctl_table *table, int write, 35118c2ecf20Sopenharmony_ci void *buffer, size_t *length, loff_t *ppos) 35128c2ecf20Sopenharmony_ci{ 35138c2ecf20Sopenharmony_ci struct hstate *h = &default_hstate; 35148c2ecf20Sopenharmony_ci unsigned long tmp; 35158c2ecf20Sopenharmony_ci int ret; 35168c2ecf20Sopenharmony_ci 35178c2ecf20Sopenharmony_ci if (!hugepages_supported()) 35188c2ecf20Sopenharmony_ci return -EOPNOTSUPP; 35198c2ecf20Sopenharmony_ci 35208c2ecf20Sopenharmony_ci tmp = h->nr_overcommit_huge_pages; 35218c2ecf20Sopenharmony_ci 35228c2ecf20Sopenharmony_ci if (write && hstate_is_gigantic(h)) 35238c2ecf20Sopenharmony_ci return -EINVAL; 35248c2ecf20Sopenharmony_ci 35258c2ecf20Sopenharmony_ci ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, 35268c2ecf20Sopenharmony_ci &tmp); 35278c2ecf20Sopenharmony_ci if (ret) 35288c2ecf20Sopenharmony_ci goto out; 35298c2ecf20Sopenharmony_ci 35308c2ecf20Sopenharmony_ci if (write) { 35318c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 35328c2ecf20Sopenharmony_ci h->nr_overcommit_huge_pages = tmp; 35338c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 35348c2ecf20Sopenharmony_ci } 35358c2ecf20Sopenharmony_ciout: 35368c2ecf20Sopenharmony_ci return ret; 35378c2ecf20Sopenharmony_ci} 35388c2ecf20Sopenharmony_ci 35398c2ecf20Sopenharmony_ci#endif /* CONFIG_SYSCTL */ 35408c2ecf20Sopenharmony_ci 35418c2ecf20Sopenharmony_civoid hugetlb_report_meminfo(struct seq_file *m) 35428c2ecf20Sopenharmony_ci{ 35438c2ecf20Sopenharmony_ci struct hstate *h; 35448c2ecf20Sopenharmony_ci unsigned long total = 0; 35458c2ecf20Sopenharmony_ci 35468c2ecf20Sopenharmony_ci if (!hugepages_supported()) 35478c2ecf20Sopenharmony_ci return; 35488c2ecf20Sopenharmony_ci 35498c2ecf20Sopenharmony_ci for_each_hstate(h) { 35508c2ecf20Sopenharmony_ci unsigned long count = h->nr_huge_pages; 35518c2ecf20Sopenharmony_ci 35528c2ecf20Sopenharmony_ci total += (PAGE_SIZE << huge_page_order(h)) * count; 35538c2ecf20Sopenharmony_ci 35548c2ecf20Sopenharmony_ci if (h == &default_hstate) 35558c2ecf20Sopenharmony_ci seq_printf(m, 35568c2ecf20Sopenharmony_ci "HugePages_Total: %5lu\n" 35578c2ecf20Sopenharmony_ci "HugePages_Free: %5lu\n" 35588c2ecf20Sopenharmony_ci "HugePages_Rsvd: %5lu\n" 35598c2ecf20Sopenharmony_ci "HugePages_Surp: %5lu\n" 35608c2ecf20Sopenharmony_ci "Hugepagesize: %8lu kB\n", 35618c2ecf20Sopenharmony_ci count, 35628c2ecf20Sopenharmony_ci h->free_huge_pages, 35638c2ecf20Sopenharmony_ci h->resv_huge_pages, 35648c2ecf20Sopenharmony_ci h->surplus_huge_pages, 35658c2ecf20Sopenharmony_ci (PAGE_SIZE << huge_page_order(h)) / 1024); 35668c2ecf20Sopenharmony_ci } 35678c2ecf20Sopenharmony_ci 35688c2ecf20Sopenharmony_ci seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024); 35698c2ecf20Sopenharmony_ci} 35708c2ecf20Sopenharmony_ci 35718c2ecf20Sopenharmony_ciint hugetlb_report_node_meminfo(char *buf, int len, int nid) 35728c2ecf20Sopenharmony_ci{ 35738c2ecf20Sopenharmony_ci struct hstate *h = &default_hstate; 35748c2ecf20Sopenharmony_ci 35758c2ecf20Sopenharmony_ci if (!hugepages_supported()) 35768c2ecf20Sopenharmony_ci return 0; 35778c2ecf20Sopenharmony_ci 35788c2ecf20Sopenharmony_ci return sysfs_emit_at(buf, len, 35798c2ecf20Sopenharmony_ci "Node %d HugePages_Total: %5u\n" 35808c2ecf20Sopenharmony_ci "Node %d HugePages_Free: %5u\n" 35818c2ecf20Sopenharmony_ci "Node %d HugePages_Surp: %5u\n", 35828c2ecf20Sopenharmony_ci nid, h->nr_huge_pages_node[nid], 35838c2ecf20Sopenharmony_ci nid, h->free_huge_pages_node[nid], 35848c2ecf20Sopenharmony_ci nid, h->surplus_huge_pages_node[nid]); 35858c2ecf20Sopenharmony_ci} 35868c2ecf20Sopenharmony_ci 35878c2ecf20Sopenharmony_civoid hugetlb_show_meminfo(void) 35888c2ecf20Sopenharmony_ci{ 35898c2ecf20Sopenharmony_ci struct hstate *h; 35908c2ecf20Sopenharmony_ci int nid; 35918c2ecf20Sopenharmony_ci 35928c2ecf20Sopenharmony_ci if (!hugepages_supported()) 35938c2ecf20Sopenharmony_ci return; 35948c2ecf20Sopenharmony_ci 35958c2ecf20Sopenharmony_ci for_each_node_state(nid, N_MEMORY) 35968c2ecf20Sopenharmony_ci for_each_hstate(h) 35978c2ecf20Sopenharmony_ci pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", 35988c2ecf20Sopenharmony_ci nid, 35998c2ecf20Sopenharmony_ci h->nr_huge_pages_node[nid], 36008c2ecf20Sopenharmony_ci h->free_huge_pages_node[nid], 36018c2ecf20Sopenharmony_ci h->surplus_huge_pages_node[nid], 36028c2ecf20Sopenharmony_ci 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); 36038c2ecf20Sopenharmony_ci} 36048c2ecf20Sopenharmony_ci 36058c2ecf20Sopenharmony_civoid hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) 36068c2ecf20Sopenharmony_ci{ 36078c2ecf20Sopenharmony_ci seq_printf(m, "HugetlbPages:\t%8lu kB\n", 36088c2ecf20Sopenharmony_ci atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); 36098c2ecf20Sopenharmony_ci} 36108c2ecf20Sopenharmony_ci 36118c2ecf20Sopenharmony_ci/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 36128c2ecf20Sopenharmony_ciunsigned long hugetlb_total_pages(void) 36138c2ecf20Sopenharmony_ci{ 36148c2ecf20Sopenharmony_ci struct hstate *h; 36158c2ecf20Sopenharmony_ci unsigned long nr_total_pages = 0; 36168c2ecf20Sopenharmony_ci 36178c2ecf20Sopenharmony_ci for_each_hstate(h) 36188c2ecf20Sopenharmony_ci nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); 36198c2ecf20Sopenharmony_ci return nr_total_pages; 36208c2ecf20Sopenharmony_ci} 36218c2ecf20Sopenharmony_ci 36228c2ecf20Sopenharmony_cistatic int hugetlb_acct_memory(struct hstate *h, long delta) 36238c2ecf20Sopenharmony_ci{ 36248c2ecf20Sopenharmony_ci int ret = -ENOMEM; 36258c2ecf20Sopenharmony_ci 36268c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 36278c2ecf20Sopenharmony_ci /* 36288c2ecf20Sopenharmony_ci * When cpuset is configured, it breaks the strict hugetlb page 36298c2ecf20Sopenharmony_ci * reservation as the accounting is done on a global variable. Such 36308c2ecf20Sopenharmony_ci * reservation is completely rubbish in the presence of cpuset because 36318c2ecf20Sopenharmony_ci * the reservation is not checked against page availability for the 36328c2ecf20Sopenharmony_ci * current cpuset. Application can still potentially OOM'ed by kernel 36338c2ecf20Sopenharmony_ci * with lack of free htlb page in cpuset that the task is in. 36348c2ecf20Sopenharmony_ci * Attempt to enforce strict accounting with cpuset is almost 36358c2ecf20Sopenharmony_ci * impossible (or too ugly) because cpuset is too fluid that 36368c2ecf20Sopenharmony_ci * task or memory node can be dynamically moved between cpusets. 36378c2ecf20Sopenharmony_ci * 36388c2ecf20Sopenharmony_ci * The change of semantics for shared hugetlb mapping with cpuset is 36398c2ecf20Sopenharmony_ci * undesirable. However, in order to preserve some of the semantics, 36408c2ecf20Sopenharmony_ci * we fall back to check against current free page availability as 36418c2ecf20Sopenharmony_ci * a best attempt and hopefully to minimize the impact of changing 36428c2ecf20Sopenharmony_ci * semantics that cpuset has. 36438c2ecf20Sopenharmony_ci * 36448c2ecf20Sopenharmony_ci * Apart from cpuset, we also have memory policy mechanism that 36458c2ecf20Sopenharmony_ci * also determines from which node the kernel will allocate memory 36468c2ecf20Sopenharmony_ci * in a NUMA system. So similar to cpuset, we also should consider 36478c2ecf20Sopenharmony_ci * the memory policy of the current task. Similar to the description 36488c2ecf20Sopenharmony_ci * above. 36498c2ecf20Sopenharmony_ci */ 36508c2ecf20Sopenharmony_ci if (delta > 0) { 36518c2ecf20Sopenharmony_ci if (gather_surplus_pages(h, delta) < 0) 36528c2ecf20Sopenharmony_ci goto out; 36538c2ecf20Sopenharmony_ci 36548c2ecf20Sopenharmony_ci if (delta > allowed_mems_nr(h)) { 36558c2ecf20Sopenharmony_ci return_unused_surplus_pages(h, delta); 36568c2ecf20Sopenharmony_ci goto out; 36578c2ecf20Sopenharmony_ci } 36588c2ecf20Sopenharmony_ci } 36598c2ecf20Sopenharmony_ci 36608c2ecf20Sopenharmony_ci ret = 0; 36618c2ecf20Sopenharmony_ci if (delta < 0) 36628c2ecf20Sopenharmony_ci return_unused_surplus_pages(h, (unsigned long) -delta); 36638c2ecf20Sopenharmony_ci 36648c2ecf20Sopenharmony_ciout: 36658c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 36668c2ecf20Sopenharmony_ci return ret; 36678c2ecf20Sopenharmony_ci} 36688c2ecf20Sopenharmony_ci 36698c2ecf20Sopenharmony_cistatic void hugetlb_vm_op_open(struct vm_area_struct *vma) 36708c2ecf20Sopenharmony_ci{ 36718c2ecf20Sopenharmony_ci struct resv_map *resv = vma_resv_map(vma); 36728c2ecf20Sopenharmony_ci 36738c2ecf20Sopenharmony_ci /* 36748c2ecf20Sopenharmony_ci * This new VMA should share its siblings reservation map if present. 36758c2ecf20Sopenharmony_ci * The VMA will only ever have a valid reservation map pointer where 36768c2ecf20Sopenharmony_ci * it is being copied for another still existing VMA. As that VMA 36778c2ecf20Sopenharmony_ci * has a reference to the reservation map it cannot disappear until 36788c2ecf20Sopenharmony_ci * after this open call completes. It is therefore safe to take a 36798c2ecf20Sopenharmony_ci * new reference here without additional locking. 36808c2ecf20Sopenharmony_ci */ 36818c2ecf20Sopenharmony_ci if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 36828c2ecf20Sopenharmony_ci resv_map_dup_hugetlb_cgroup_uncharge_info(resv); 36838c2ecf20Sopenharmony_ci kref_get(&resv->refs); 36848c2ecf20Sopenharmony_ci } 36858c2ecf20Sopenharmony_ci} 36868c2ecf20Sopenharmony_ci 36878c2ecf20Sopenharmony_cistatic void hugetlb_vm_op_close(struct vm_area_struct *vma) 36888c2ecf20Sopenharmony_ci{ 36898c2ecf20Sopenharmony_ci struct hstate *h = hstate_vma(vma); 36908c2ecf20Sopenharmony_ci struct resv_map *resv = vma_resv_map(vma); 36918c2ecf20Sopenharmony_ci struct hugepage_subpool *spool = subpool_vma(vma); 36928c2ecf20Sopenharmony_ci unsigned long reserve, start, end; 36938c2ecf20Sopenharmony_ci long gbl_reserve; 36948c2ecf20Sopenharmony_ci 36958c2ecf20Sopenharmony_ci if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 36968c2ecf20Sopenharmony_ci return; 36978c2ecf20Sopenharmony_ci 36988c2ecf20Sopenharmony_ci start = vma_hugecache_offset(h, vma, vma->vm_start); 36998c2ecf20Sopenharmony_ci end = vma_hugecache_offset(h, vma, vma->vm_end); 37008c2ecf20Sopenharmony_ci 37018c2ecf20Sopenharmony_ci reserve = (end - start) - region_count(resv, start, end); 37028c2ecf20Sopenharmony_ci hugetlb_cgroup_uncharge_counter(resv, start, end); 37038c2ecf20Sopenharmony_ci if (reserve) { 37048c2ecf20Sopenharmony_ci /* 37058c2ecf20Sopenharmony_ci * Decrement reserve counts. The global reserve count may be 37068c2ecf20Sopenharmony_ci * adjusted if the subpool has a minimum size. 37078c2ecf20Sopenharmony_ci */ 37088c2ecf20Sopenharmony_ci gbl_reserve = hugepage_subpool_put_pages(spool, reserve); 37098c2ecf20Sopenharmony_ci hugetlb_acct_memory(h, -gbl_reserve); 37108c2ecf20Sopenharmony_ci } 37118c2ecf20Sopenharmony_ci 37128c2ecf20Sopenharmony_ci kref_put(&resv->refs, resv_map_release); 37138c2ecf20Sopenharmony_ci} 37148c2ecf20Sopenharmony_ci 37158c2ecf20Sopenharmony_cistatic int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) 37168c2ecf20Sopenharmony_ci{ 37178c2ecf20Sopenharmony_ci if (addr & ~(huge_page_mask(hstate_vma(vma)))) 37188c2ecf20Sopenharmony_ci return -EINVAL; 37198c2ecf20Sopenharmony_ci return 0; 37208c2ecf20Sopenharmony_ci} 37218c2ecf20Sopenharmony_ci 37228c2ecf20Sopenharmony_cistatic unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) 37238c2ecf20Sopenharmony_ci{ 37248c2ecf20Sopenharmony_ci struct hstate *hstate = hstate_vma(vma); 37258c2ecf20Sopenharmony_ci 37268c2ecf20Sopenharmony_ci return 1UL << huge_page_shift(hstate); 37278c2ecf20Sopenharmony_ci} 37288c2ecf20Sopenharmony_ci 37298c2ecf20Sopenharmony_ci/* 37308c2ecf20Sopenharmony_ci * We cannot handle pagefaults against hugetlb pages at all. They cause 37318c2ecf20Sopenharmony_ci * handle_mm_fault() to try to instantiate regular-sized pages in the 37328c2ecf20Sopenharmony_ci * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get 37338c2ecf20Sopenharmony_ci * this far. 37348c2ecf20Sopenharmony_ci */ 37358c2ecf20Sopenharmony_cistatic vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) 37368c2ecf20Sopenharmony_ci{ 37378c2ecf20Sopenharmony_ci BUG(); 37388c2ecf20Sopenharmony_ci return 0; 37398c2ecf20Sopenharmony_ci} 37408c2ecf20Sopenharmony_ci 37418c2ecf20Sopenharmony_ci/* 37428c2ecf20Sopenharmony_ci * When a new function is introduced to vm_operations_struct and added 37438c2ecf20Sopenharmony_ci * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. 37448c2ecf20Sopenharmony_ci * This is because under System V memory model, mappings created via 37458c2ecf20Sopenharmony_ci * shmget/shmat with "huge page" specified are backed by hugetlbfs files, 37468c2ecf20Sopenharmony_ci * their original vm_ops are overwritten with shm_vm_ops. 37478c2ecf20Sopenharmony_ci */ 37488c2ecf20Sopenharmony_ciconst struct vm_operations_struct hugetlb_vm_ops = { 37498c2ecf20Sopenharmony_ci .fault = hugetlb_vm_op_fault, 37508c2ecf20Sopenharmony_ci .open = hugetlb_vm_op_open, 37518c2ecf20Sopenharmony_ci .close = hugetlb_vm_op_close, 37528c2ecf20Sopenharmony_ci .split = hugetlb_vm_op_split, 37538c2ecf20Sopenharmony_ci .pagesize = hugetlb_vm_op_pagesize, 37548c2ecf20Sopenharmony_ci}; 37558c2ecf20Sopenharmony_ci 37568c2ecf20Sopenharmony_cistatic pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 37578c2ecf20Sopenharmony_ci int writable) 37588c2ecf20Sopenharmony_ci{ 37598c2ecf20Sopenharmony_ci pte_t entry; 37608c2ecf20Sopenharmony_ci 37618c2ecf20Sopenharmony_ci if (writable) { 37628c2ecf20Sopenharmony_ci entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, 37638c2ecf20Sopenharmony_ci vma->vm_page_prot))); 37648c2ecf20Sopenharmony_ci } else { 37658c2ecf20Sopenharmony_ci entry = huge_pte_wrprotect(mk_huge_pte(page, 37668c2ecf20Sopenharmony_ci vma->vm_page_prot)); 37678c2ecf20Sopenharmony_ci } 37688c2ecf20Sopenharmony_ci entry = pte_mkyoung(entry); 37698c2ecf20Sopenharmony_ci entry = pte_mkhuge(entry); 37708c2ecf20Sopenharmony_ci entry = arch_make_huge_pte(entry, vma, page, writable); 37718c2ecf20Sopenharmony_ci 37728c2ecf20Sopenharmony_ci return entry; 37738c2ecf20Sopenharmony_ci} 37748c2ecf20Sopenharmony_ci 37758c2ecf20Sopenharmony_cistatic void set_huge_ptep_writable(struct vm_area_struct *vma, 37768c2ecf20Sopenharmony_ci unsigned long address, pte_t *ptep) 37778c2ecf20Sopenharmony_ci{ 37788c2ecf20Sopenharmony_ci pte_t entry; 37798c2ecf20Sopenharmony_ci 37808c2ecf20Sopenharmony_ci entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); 37818c2ecf20Sopenharmony_ci if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) 37828c2ecf20Sopenharmony_ci update_mmu_cache(vma, address, ptep); 37838c2ecf20Sopenharmony_ci} 37848c2ecf20Sopenharmony_ci 37858c2ecf20Sopenharmony_cibool is_hugetlb_entry_migration(pte_t pte) 37868c2ecf20Sopenharmony_ci{ 37878c2ecf20Sopenharmony_ci swp_entry_t swp; 37888c2ecf20Sopenharmony_ci 37898c2ecf20Sopenharmony_ci if (huge_pte_none(pte) || pte_present(pte)) 37908c2ecf20Sopenharmony_ci return false; 37918c2ecf20Sopenharmony_ci swp = pte_to_swp_entry(pte); 37928c2ecf20Sopenharmony_ci if (is_migration_entry(swp)) 37938c2ecf20Sopenharmony_ci return true; 37948c2ecf20Sopenharmony_ci else 37958c2ecf20Sopenharmony_ci return false; 37968c2ecf20Sopenharmony_ci} 37978c2ecf20Sopenharmony_ci 37988c2ecf20Sopenharmony_cistatic bool is_hugetlb_entry_hwpoisoned(pte_t pte) 37998c2ecf20Sopenharmony_ci{ 38008c2ecf20Sopenharmony_ci swp_entry_t swp; 38018c2ecf20Sopenharmony_ci 38028c2ecf20Sopenharmony_ci if (huge_pte_none(pte) || pte_present(pte)) 38038c2ecf20Sopenharmony_ci return false; 38048c2ecf20Sopenharmony_ci swp = pte_to_swp_entry(pte); 38058c2ecf20Sopenharmony_ci if (is_hwpoison_entry(swp)) 38068c2ecf20Sopenharmony_ci return true; 38078c2ecf20Sopenharmony_ci else 38088c2ecf20Sopenharmony_ci return false; 38098c2ecf20Sopenharmony_ci} 38108c2ecf20Sopenharmony_ci 38118c2ecf20Sopenharmony_ciint copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 38128c2ecf20Sopenharmony_ci struct vm_area_struct *vma) 38138c2ecf20Sopenharmony_ci{ 38148c2ecf20Sopenharmony_ci pte_t *src_pte, *dst_pte, entry, dst_entry; 38158c2ecf20Sopenharmony_ci struct page *ptepage; 38168c2ecf20Sopenharmony_ci unsigned long addr; 38178c2ecf20Sopenharmony_ci int cow; 38188c2ecf20Sopenharmony_ci struct hstate *h = hstate_vma(vma); 38198c2ecf20Sopenharmony_ci unsigned long sz = huge_page_size(h); 38208c2ecf20Sopenharmony_ci struct address_space *mapping = vma->vm_file->f_mapping; 38218c2ecf20Sopenharmony_ci struct mmu_notifier_range range; 38228c2ecf20Sopenharmony_ci int ret = 0; 38238c2ecf20Sopenharmony_ci 38248c2ecf20Sopenharmony_ci cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 38258c2ecf20Sopenharmony_ci 38268c2ecf20Sopenharmony_ci if (cow) { 38278c2ecf20Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, 38288c2ecf20Sopenharmony_ci vma->vm_start, 38298c2ecf20Sopenharmony_ci vma->vm_end); 38308c2ecf20Sopenharmony_ci mmu_notifier_invalidate_range_start(&range); 38318c2ecf20Sopenharmony_ci } else { 38328c2ecf20Sopenharmony_ci /* 38338c2ecf20Sopenharmony_ci * For shared mappings i_mmap_rwsem must be held to call 38348c2ecf20Sopenharmony_ci * huge_pte_alloc, otherwise the returned ptep could go 38358c2ecf20Sopenharmony_ci * away if part of a shared pmd and another thread calls 38368c2ecf20Sopenharmony_ci * huge_pmd_unshare. 38378c2ecf20Sopenharmony_ci */ 38388c2ecf20Sopenharmony_ci i_mmap_lock_read(mapping); 38398c2ecf20Sopenharmony_ci } 38408c2ecf20Sopenharmony_ci 38418c2ecf20Sopenharmony_ci for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 38428c2ecf20Sopenharmony_ci spinlock_t *src_ptl, *dst_ptl; 38438c2ecf20Sopenharmony_ci src_pte = huge_pte_offset(src, addr, sz); 38448c2ecf20Sopenharmony_ci if (!src_pte) 38458c2ecf20Sopenharmony_ci continue; 38468c2ecf20Sopenharmony_ci dst_pte = huge_pte_alloc(dst, addr, sz); 38478c2ecf20Sopenharmony_ci if (!dst_pte) { 38488c2ecf20Sopenharmony_ci ret = -ENOMEM; 38498c2ecf20Sopenharmony_ci break; 38508c2ecf20Sopenharmony_ci } 38518c2ecf20Sopenharmony_ci 38528c2ecf20Sopenharmony_ci /* 38538c2ecf20Sopenharmony_ci * If the pagetables are shared don't copy or take references. 38548c2ecf20Sopenharmony_ci * dst_pte == src_pte is the common case of src/dest sharing. 38558c2ecf20Sopenharmony_ci * 38568c2ecf20Sopenharmony_ci * However, src could have 'unshared' and dst shares with 38578c2ecf20Sopenharmony_ci * another vma. If dst_pte !none, this implies sharing. 38588c2ecf20Sopenharmony_ci * Check here before taking page table lock, and once again 38598c2ecf20Sopenharmony_ci * after taking the lock below. 38608c2ecf20Sopenharmony_ci */ 38618c2ecf20Sopenharmony_ci dst_entry = huge_ptep_get(dst_pte); 38628c2ecf20Sopenharmony_ci if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) 38638c2ecf20Sopenharmony_ci continue; 38648c2ecf20Sopenharmony_ci 38658c2ecf20Sopenharmony_ci dst_ptl = huge_pte_lock(h, dst, dst_pte); 38668c2ecf20Sopenharmony_ci src_ptl = huge_pte_lockptr(h, src, src_pte); 38678c2ecf20Sopenharmony_ci spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 38688c2ecf20Sopenharmony_ci entry = huge_ptep_get(src_pte); 38698c2ecf20Sopenharmony_ci dst_entry = huge_ptep_get(dst_pte); 38708c2ecf20Sopenharmony_ci if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { 38718c2ecf20Sopenharmony_ci /* 38728c2ecf20Sopenharmony_ci * Skip if src entry none. Also, skip in the 38738c2ecf20Sopenharmony_ci * unlikely case dst entry !none as this implies 38748c2ecf20Sopenharmony_ci * sharing with another vma. 38758c2ecf20Sopenharmony_ci */ 38768c2ecf20Sopenharmony_ci ; 38778c2ecf20Sopenharmony_ci } else if (unlikely(is_hugetlb_entry_migration(entry) || 38788c2ecf20Sopenharmony_ci is_hugetlb_entry_hwpoisoned(entry))) { 38798c2ecf20Sopenharmony_ci swp_entry_t swp_entry = pte_to_swp_entry(entry); 38808c2ecf20Sopenharmony_ci 38818c2ecf20Sopenharmony_ci if (is_write_migration_entry(swp_entry) && cow) { 38828c2ecf20Sopenharmony_ci /* 38838c2ecf20Sopenharmony_ci * COW mappings require pages in both 38848c2ecf20Sopenharmony_ci * parent and child to be set to read. 38858c2ecf20Sopenharmony_ci */ 38868c2ecf20Sopenharmony_ci make_migration_entry_read(&swp_entry); 38878c2ecf20Sopenharmony_ci entry = swp_entry_to_pte(swp_entry); 38888c2ecf20Sopenharmony_ci set_huge_swap_pte_at(src, addr, src_pte, 38898c2ecf20Sopenharmony_ci entry, sz); 38908c2ecf20Sopenharmony_ci } 38918c2ecf20Sopenharmony_ci set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); 38928c2ecf20Sopenharmony_ci } else { 38938c2ecf20Sopenharmony_ci if (cow) { 38948c2ecf20Sopenharmony_ci /* 38958c2ecf20Sopenharmony_ci * No need to notify as we are downgrading page 38968c2ecf20Sopenharmony_ci * table protection not changing it to point 38978c2ecf20Sopenharmony_ci * to a new page. 38988c2ecf20Sopenharmony_ci * 38998c2ecf20Sopenharmony_ci * See Documentation/vm/mmu_notifier.rst 39008c2ecf20Sopenharmony_ci */ 39018c2ecf20Sopenharmony_ci huge_ptep_set_wrprotect(src, addr, src_pte); 39028c2ecf20Sopenharmony_ci } 39038c2ecf20Sopenharmony_ci entry = huge_ptep_get(src_pte); 39048c2ecf20Sopenharmony_ci ptepage = pte_page(entry); 39058c2ecf20Sopenharmony_ci get_page(ptepage); 39068c2ecf20Sopenharmony_ci page_dup_rmap(ptepage, true); 39078c2ecf20Sopenharmony_ci set_huge_pte_at(dst, addr, dst_pte, entry); 39088c2ecf20Sopenharmony_ci hugetlb_count_add(pages_per_huge_page(h), dst); 39098c2ecf20Sopenharmony_ci } 39108c2ecf20Sopenharmony_ci spin_unlock(src_ptl); 39118c2ecf20Sopenharmony_ci spin_unlock(dst_ptl); 39128c2ecf20Sopenharmony_ci } 39138c2ecf20Sopenharmony_ci 39148c2ecf20Sopenharmony_ci if (cow) 39158c2ecf20Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 39168c2ecf20Sopenharmony_ci else 39178c2ecf20Sopenharmony_ci i_mmap_unlock_read(mapping); 39188c2ecf20Sopenharmony_ci 39198c2ecf20Sopenharmony_ci return ret; 39208c2ecf20Sopenharmony_ci} 39218c2ecf20Sopenharmony_ci 39228c2ecf20Sopenharmony_civoid __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 39238c2ecf20Sopenharmony_ci unsigned long start, unsigned long end, 39248c2ecf20Sopenharmony_ci struct page *ref_page) 39258c2ecf20Sopenharmony_ci{ 39268c2ecf20Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 39278c2ecf20Sopenharmony_ci unsigned long address; 39288c2ecf20Sopenharmony_ci pte_t *ptep; 39298c2ecf20Sopenharmony_ci pte_t pte; 39308c2ecf20Sopenharmony_ci spinlock_t *ptl; 39318c2ecf20Sopenharmony_ci struct page *page; 39328c2ecf20Sopenharmony_ci struct hstate *h = hstate_vma(vma); 39338c2ecf20Sopenharmony_ci unsigned long sz = huge_page_size(h); 39348c2ecf20Sopenharmony_ci struct mmu_notifier_range range; 39358c2ecf20Sopenharmony_ci bool force_flush = false; 39368c2ecf20Sopenharmony_ci 39378c2ecf20Sopenharmony_ci WARN_ON(!is_vm_hugetlb_page(vma)); 39388c2ecf20Sopenharmony_ci BUG_ON(start & ~huge_page_mask(h)); 39398c2ecf20Sopenharmony_ci BUG_ON(end & ~huge_page_mask(h)); 39408c2ecf20Sopenharmony_ci 39418c2ecf20Sopenharmony_ci /* 39428c2ecf20Sopenharmony_ci * This is a hugetlb vma, all the pte entries should point 39438c2ecf20Sopenharmony_ci * to huge page. 39448c2ecf20Sopenharmony_ci */ 39458c2ecf20Sopenharmony_ci tlb_change_page_size(tlb, sz); 39468c2ecf20Sopenharmony_ci tlb_start_vma(tlb, vma); 39478c2ecf20Sopenharmony_ci 39488c2ecf20Sopenharmony_ci /* 39498c2ecf20Sopenharmony_ci * If sharing possible, alert mmu notifiers of worst case. 39508c2ecf20Sopenharmony_ci */ 39518c2ecf20Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, 39528c2ecf20Sopenharmony_ci end); 39538c2ecf20Sopenharmony_ci adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 39548c2ecf20Sopenharmony_ci mmu_notifier_invalidate_range_start(&range); 39558c2ecf20Sopenharmony_ci address = start; 39568c2ecf20Sopenharmony_ci for (; address < end; address += sz) { 39578c2ecf20Sopenharmony_ci ptep = huge_pte_offset(mm, address, sz); 39588c2ecf20Sopenharmony_ci if (!ptep) 39598c2ecf20Sopenharmony_ci continue; 39608c2ecf20Sopenharmony_ci 39618c2ecf20Sopenharmony_ci ptl = huge_pte_lock(h, mm, ptep); 39628c2ecf20Sopenharmony_ci if (huge_pmd_unshare(mm, vma, &address, ptep)) { 39638c2ecf20Sopenharmony_ci spin_unlock(ptl); 39648c2ecf20Sopenharmony_ci tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); 39658c2ecf20Sopenharmony_ci force_flush = true; 39668c2ecf20Sopenharmony_ci continue; 39678c2ecf20Sopenharmony_ci } 39688c2ecf20Sopenharmony_ci 39698c2ecf20Sopenharmony_ci pte = huge_ptep_get(ptep); 39708c2ecf20Sopenharmony_ci if (huge_pte_none(pte)) { 39718c2ecf20Sopenharmony_ci spin_unlock(ptl); 39728c2ecf20Sopenharmony_ci continue; 39738c2ecf20Sopenharmony_ci } 39748c2ecf20Sopenharmony_ci 39758c2ecf20Sopenharmony_ci /* 39768c2ecf20Sopenharmony_ci * Migrating hugepage or HWPoisoned hugepage is already 39778c2ecf20Sopenharmony_ci * unmapped and its refcount is dropped, so just clear pte here. 39788c2ecf20Sopenharmony_ci */ 39798c2ecf20Sopenharmony_ci if (unlikely(!pte_present(pte))) { 39808c2ecf20Sopenharmony_ci huge_pte_clear(mm, address, ptep, sz); 39818c2ecf20Sopenharmony_ci spin_unlock(ptl); 39828c2ecf20Sopenharmony_ci continue; 39838c2ecf20Sopenharmony_ci } 39848c2ecf20Sopenharmony_ci 39858c2ecf20Sopenharmony_ci page = pte_page(pte); 39868c2ecf20Sopenharmony_ci /* 39878c2ecf20Sopenharmony_ci * If a reference page is supplied, it is because a specific 39888c2ecf20Sopenharmony_ci * page is being unmapped, not a range. Ensure the page we 39898c2ecf20Sopenharmony_ci * are about to unmap is the actual page of interest. 39908c2ecf20Sopenharmony_ci */ 39918c2ecf20Sopenharmony_ci if (ref_page) { 39928c2ecf20Sopenharmony_ci if (page != ref_page) { 39938c2ecf20Sopenharmony_ci spin_unlock(ptl); 39948c2ecf20Sopenharmony_ci continue; 39958c2ecf20Sopenharmony_ci } 39968c2ecf20Sopenharmony_ci /* 39978c2ecf20Sopenharmony_ci * Mark the VMA as having unmapped its page so that 39988c2ecf20Sopenharmony_ci * future faults in this VMA will fail rather than 39998c2ecf20Sopenharmony_ci * looking like data was lost 40008c2ecf20Sopenharmony_ci */ 40018c2ecf20Sopenharmony_ci set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); 40028c2ecf20Sopenharmony_ci } 40038c2ecf20Sopenharmony_ci 40048c2ecf20Sopenharmony_ci pte = huge_ptep_get_and_clear(mm, address, ptep); 40058c2ecf20Sopenharmony_ci tlb_remove_huge_tlb_entry(h, tlb, ptep, address); 40068c2ecf20Sopenharmony_ci if (huge_pte_dirty(pte)) 40078c2ecf20Sopenharmony_ci set_page_dirty(page); 40088c2ecf20Sopenharmony_ci 40098c2ecf20Sopenharmony_ci hugetlb_count_sub(pages_per_huge_page(h), mm); 40108c2ecf20Sopenharmony_ci page_remove_rmap(page, true); 40118c2ecf20Sopenharmony_ci 40128c2ecf20Sopenharmony_ci spin_unlock(ptl); 40138c2ecf20Sopenharmony_ci tlb_remove_page_size(tlb, page, huge_page_size(h)); 40148c2ecf20Sopenharmony_ci /* 40158c2ecf20Sopenharmony_ci * Bail out after unmapping reference page if supplied 40168c2ecf20Sopenharmony_ci */ 40178c2ecf20Sopenharmony_ci if (ref_page) 40188c2ecf20Sopenharmony_ci break; 40198c2ecf20Sopenharmony_ci } 40208c2ecf20Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 40218c2ecf20Sopenharmony_ci tlb_end_vma(tlb, vma); 40228c2ecf20Sopenharmony_ci 40238c2ecf20Sopenharmony_ci /* 40248c2ecf20Sopenharmony_ci * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We 40258c2ecf20Sopenharmony_ci * could defer the flush until now, since by holding i_mmap_rwsem we 40268c2ecf20Sopenharmony_ci * guaranteed that the last refernece would not be dropped. But we must 40278c2ecf20Sopenharmony_ci * do the flushing before we return, as otherwise i_mmap_rwsem will be 40288c2ecf20Sopenharmony_ci * dropped and the last reference to the shared PMDs page might be 40298c2ecf20Sopenharmony_ci * dropped as well. 40308c2ecf20Sopenharmony_ci * 40318c2ecf20Sopenharmony_ci * In theory we could defer the freeing of the PMD pages as well, but 40328c2ecf20Sopenharmony_ci * huge_pmd_unshare() relies on the exact page_count for the PMD page to 40338c2ecf20Sopenharmony_ci * detect sharing, so we cannot defer the release of the page either. 40348c2ecf20Sopenharmony_ci * Instead, do flush now. 40358c2ecf20Sopenharmony_ci */ 40368c2ecf20Sopenharmony_ci if (force_flush) 40378c2ecf20Sopenharmony_ci tlb_flush_mmu_tlbonly(tlb); 40388c2ecf20Sopenharmony_ci} 40398c2ecf20Sopenharmony_ci 40408c2ecf20Sopenharmony_civoid __unmap_hugepage_range_final(struct mmu_gather *tlb, 40418c2ecf20Sopenharmony_ci struct vm_area_struct *vma, unsigned long start, 40428c2ecf20Sopenharmony_ci unsigned long end, struct page *ref_page) 40438c2ecf20Sopenharmony_ci{ 40448c2ecf20Sopenharmony_ci __unmap_hugepage_range(tlb, vma, start, end, ref_page); 40458c2ecf20Sopenharmony_ci 40468c2ecf20Sopenharmony_ci /* 40478c2ecf20Sopenharmony_ci * Clear this flag so that x86's huge_pmd_share page_table_shareable 40488c2ecf20Sopenharmony_ci * test will fail on a vma being torn down, and not grab a page table 40498c2ecf20Sopenharmony_ci * on its way out. We're lucky that the flag has such an appropriate 40508c2ecf20Sopenharmony_ci * name, and can in fact be safely cleared here. We could clear it 40518c2ecf20Sopenharmony_ci * before the __unmap_hugepage_range above, but all that's necessary 40528c2ecf20Sopenharmony_ci * is to clear it before releasing the i_mmap_rwsem. This works 40538c2ecf20Sopenharmony_ci * because in the context this is called, the VMA is about to be 40548c2ecf20Sopenharmony_ci * destroyed and the i_mmap_rwsem is held. 40558c2ecf20Sopenharmony_ci */ 40568c2ecf20Sopenharmony_ci vma->vm_flags &= ~VM_MAYSHARE; 40578c2ecf20Sopenharmony_ci} 40588c2ecf20Sopenharmony_ci 40598c2ecf20Sopenharmony_civoid unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 40608c2ecf20Sopenharmony_ci unsigned long end, struct page *ref_page) 40618c2ecf20Sopenharmony_ci{ 40628c2ecf20Sopenharmony_ci struct mm_struct *mm; 40638c2ecf20Sopenharmony_ci struct mmu_gather tlb; 40648c2ecf20Sopenharmony_ci unsigned long tlb_start = start; 40658c2ecf20Sopenharmony_ci unsigned long tlb_end = end; 40668c2ecf20Sopenharmony_ci 40678c2ecf20Sopenharmony_ci /* 40688c2ecf20Sopenharmony_ci * If shared PMDs were possibly used within this vma range, adjust 40698c2ecf20Sopenharmony_ci * start/end for worst case tlb flushing. 40708c2ecf20Sopenharmony_ci * Note that we can not be sure if PMDs are shared until we try to 40718c2ecf20Sopenharmony_ci * unmap pages. However, we want to make sure TLB flushing covers 40728c2ecf20Sopenharmony_ci * the largest possible range. 40738c2ecf20Sopenharmony_ci */ 40748c2ecf20Sopenharmony_ci adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end); 40758c2ecf20Sopenharmony_ci 40768c2ecf20Sopenharmony_ci mm = vma->vm_mm; 40778c2ecf20Sopenharmony_ci 40788c2ecf20Sopenharmony_ci tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end); 40798c2ecf20Sopenharmony_ci __unmap_hugepage_range(&tlb, vma, start, end, ref_page); 40808c2ecf20Sopenharmony_ci tlb_finish_mmu(&tlb, tlb_start, tlb_end); 40818c2ecf20Sopenharmony_ci} 40828c2ecf20Sopenharmony_ci 40838c2ecf20Sopenharmony_ci/* 40848c2ecf20Sopenharmony_ci * This is called when the original mapper is failing to COW a MAP_PRIVATE 40858c2ecf20Sopenharmony_ci * mappping it owns the reserve page for. The intention is to unmap the page 40868c2ecf20Sopenharmony_ci * from other VMAs and let the children be SIGKILLed if they are faulting the 40878c2ecf20Sopenharmony_ci * same region. 40888c2ecf20Sopenharmony_ci */ 40898c2ecf20Sopenharmony_cistatic void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 40908c2ecf20Sopenharmony_ci struct page *page, unsigned long address) 40918c2ecf20Sopenharmony_ci{ 40928c2ecf20Sopenharmony_ci struct hstate *h = hstate_vma(vma); 40938c2ecf20Sopenharmony_ci struct vm_area_struct *iter_vma; 40948c2ecf20Sopenharmony_ci struct address_space *mapping; 40958c2ecf20Sopenharmony_ci pgoff_t pgoff; 40968c2ecf20Sopenharmony_ci 40978c2ecf20Sopenharmony_ci /* 40988c2ecf20Sopenharmony_ci * vm_pgoff is in PAGE_SIZE units, hence the different calculation 40998c2ecf20Sopenharmony_ci * from page cache lookup which is in HPAGE_SIZE units. 41008c2ecf20Sopenharmony_ci */ 41018c2ecf20Sopenharmony_ci address = address & huge_page_mask(h); 41028c2ecf20Sopenharmony_ci pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + 41038c2ecf20Sopenharmony_ci vma->vm_pgoff; 41048c2ecf20Sopenharmony_ci mapping = vma->vm_file->f_mapping; 41058c2ecf20Sopenharmony_ci 41068c2ecf20Sopenharmony_ci /* 41078c2ecf20Sopenharmony_ci * Take the mapping lock for the duration of the table walk. As 41088c2ecf20Sopenharmony_ci * this mapping should be shared between all the VMAs, 41098c2ecf20Sopenharmony_ci * __unmap_hugepage_range() is called as the lock is already held 41108c2ecf20Sopenharmony_ci */ 41118c2ecf20Sopenharmony_ci i_mmap_lock_write(mapping); 41128c2ecf20Sopenharmony_ci vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { 41138c2ecf20Sopenharmony_ci /* Do not unmap the current VMA */ 41148c2ecf20Sopenharmony_ci if (iter_vma == vma) 41158c2ecf20Sopenharmony_ci continue; 41168c2ecf20Sopenharmony_ci 41178c2ecf20Sopenharmony_ci /* 41188c2ecf20Sopenharmony_ci * Shared VMAs have their own reserves and do not affect 41198c2ecf20Sopenharmony_ci * MAP_PRIVATE accounting but it is possible that a shared 41208c2ecf20Sopenharmony_ci * VMA is using the same page so check and skip such VMAs. 41218c2ecf20Sopenharmony_ci */ 41228c2ecf20Sopenharmony_ci if (iter_vma->vm_flags & VM_MAYSHARE) 41238c2ecf20Sopenharmony_ci continue; 41248c2ecf20Sopenharmony_ci 41258c2ecf20Sopenharmony_ci /* 41268c2ecf20Sopenharmony_ci * Unmap the page from other VMAs without their own reserves. 41278c2ecf20Sopenharmony_ci * They get marked to be SIGKILLed if they fault in these 41288c2ecf20Sopenharmony_ci * areas. This is because a future no-page fault on this VMA 41298c2ecf20Sopenharmony_ci * could insert a zeroed page instead of the data existing 41308c2ecf20Sopenharmony_ci * from the time of fork. This would look like data corruption 41318c2ecf20Sopenharmony_ci */ 41328c2ecf20Sopenharmony_ci if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 41338c2ecf20Sopenharmony_ci unmap_hugepage_range(iter_vma, address, 41348c2ecf20Sopenharmony_ci address + huge_page_size(h), page); 41358c2ecf20Sopenharmony_ci } 41368c2ecf20Sopenharmony_ci i_mmap_unlock_write(mapping); 41378c2ecf20Sopenharmony_ci} 41388c2ecf20Sopenharmony_ci 41398c2ecf20Sopenharmony_ci/* 41408c2ecf20Sopenharmony_ci * Hugetlb_cow() should be called with page lock of the original hugepage held. 41418c2ecf20Sopenharmony_ci * Called with hugetlb_instantiation_mutex held and pte_page locked so we 41428c2ecf20Sopenharmony_ci * cannot race with other handlers or page migration. 41438c2ecf20Sopenharmony_ci * Keep the pte_same checks anyway to make transition from the mutex easier. 41448c2ecf20Sopenharmony_ci */ 41458c2ecf20Sopenharmony_cistatic vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 41468c2ecf20Sopenharmony_ci unsigned long address, pte_t *ptep, 41478c2ecf20Sopenharmony_ci struct page *pagecache_page, spinlock_t *ptl) 41488c2ecf20Sopenharmony_ci{ 41498c2ecf20Sopenharmony_ci pte_t pte; 41508c2ecf20Sopenharmony_ci struct hstate *h = hstate_vma(vma); 41518c2ecf20Sopenharmony_ci struct page *old_page, *new_page; 41528c2ecf20Sopenharmony_ci int outside_reserve = 0; 41538c2ecf20Sopenharmony_ci vm_fault_t ret = 0; 41548c2ecf20Sopenharmony_ci unsigned long haddr = address & huge_page_mask(h); 41558c2ecf20Sopenharmony_ci struct mmu_notifier_range range; 41568c2ecf20Sopenharmony_ci 41578c2ecf20Sopenharmony_ci pte = huge_ptep_get(ptep); 41588c2ecf20Sopenharmony_ci old_page = pte_page(pte); 41598c2ecf20Sopenharmony_ci 41608c2ecf20Sopenharmony_ciretry_avoidcopy: 41618c2ecf20Sopenharmony_ci /* If no-one else is actually using this page, avoid the copy 41628c2ecf20Sopenharmony_ci * and just make the page writable */ 41638c2ecf20Sopenharmony_ci if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { 41648c2ecf20Sopenharmony_ci page_move_anon_rmap(old_page, vma); 41658c2ecf20Sopenharmony_ci set_huge_ptep_writable(vma, haddr, ptep); 41668c2ecf20Sopenharmony_ci return 0; 41678c2ecf20Sopenharmony_ci } 41688c2ecf20Sopenharmony_ci 41698c2ecf20Sopenharmony_ci /* 41708c2ecf20Sopenharmony_ci * If the process that created a MAP_PRIVATE mapping is about to 41718c2ecf20Sopenharmony_ci * perform a COW due to a shared page count, attempt to satisfy 41728c2ecf20Sopenharmony_ci * the allocation without using the existing reserves. The pagecache 41738c2ecf20Sopenharmony_ci * page is used to determine if the reserve at this address was 41748c2ecf20Sopenharmony_ci * consumed or not. If reserves were used, a partial faulted mapping 41758c2ecf20Sopenharmony_ci * at the time of fork() could consume its reserves on COW instead 41768c2ecf20Sopenharmony_ci * of the full address range. 41778c2ecf20Sopenharmony_ci */ 41788c2ecf20Sopenharmony_ci if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && 41798c2ecf20Sopenharmony_ci old_page != pagecache_page) 41808c2ecf20Sopenharmony_ci outside_reserve = 1; 41818c2ecf20Sopenharmony_ci 41828c2ecf20Sopenharmony_ci get_page(old_page); 41838c2ecf20Sopenharmony_ci 41848c2ecf20Sopenharmony_ci /* 41858c2ecf20Sopenharmony_ci * Drop page table lock as buddy allocator may be called. It will 41868c2ecf20Sopenharmony_ci * be acquired again before returning to the caller, as expected. 41878c2ecf20Sopenharmony_ci */ 41888c2ecf20Sopenharmony_ci spin_unlock(ptl); 41898c2ecf20Sopenharmony_ci new_page = alloc_huge_page(vma, haddr, outside_reserve); 41908c2ecf20Sopenharmony_ci 41918c2ecf20Sopenharmony_ci if (IS_ERR(new_page)) { 41928c2ecf20Sopenharmony_ci /* 41938c2ecf20Sopenharmony_ci * If a process owning a MAP_PRIVATE mapping fails to COW, 41948c2ecf20Sopenharmony_ci * it is due to references held by a child and an insufficient 41958c2ecf20Sopenharmony_ci * huge page pool. To guarantee the original mappers 41968c2ecf20Sopenharmony_ci * reliability, unmap the page from child processes. The child 41978c2ecf20Sopenharmony_ci * may get SIGKILLed if it later faults. 41988c2ecf20Sopenharmony_ci */ 41998c2ecf20Sopenharmony_ci if (outside_reserve) { 42008c2ecf20Sopenharmony_ci struct address_space *mapping = vma->vm_file->f_mapping; 42018c2ecf20Sopenharmony_ci pgoff_t idx; 42028c2ecf20Sopenharmony_ci u32 hash; 42038c2ecf20Sopenharmony_ci 42048c2ecf20Sopenharmony_ci put_page(old_page); 42058c2ecf20Sopenharmony_ci BUG_ON(huge_pte_none(pte)); 42068c2ecf20Sopenharmony_ci /* 42078c2ecf20Sopenharmony_ci * Drop hugetlb_fault_mutex and i_mmap_rwsem before 42088c2ecf20Sopenharmony_ci * unmapping. unmapping needs to hold i_mmap_rwsem 42098c2ecf20Sopenharmony_ci * in write mode. Dropping i_mmap_rwsem in read mode 42108c2ecf20Sopenharmony_ci * here is OK as COW mappings do not interact with 42118c2ecf20Sopenharmony_ci * PMD sharing. 42128c2ecf20Sopenharmony_ci * 42138c2ecf20Sopenharmony_ci * Reacquire both after unmap operation. 42148c2ecf20Sopenharmony_ci */ 42158c2ecf20Sopenharmony_ci idx = vma_hugecache_offset(h, vma, haddr); 42168c2ecf20Sopenharmony_ci hash = hugetlb_fault_mutex_hash(mapping, idx); 42178c2ecf20Sopenharmony_ci mutex_unlock(&hugetlb_fault_mutex_table[hash]); 42188c2ecf20Sopenharmony_ci i_mmap_unlock_read(mapping); 42198c2ecf20Sopenharmony_ci 42208c2ecf20Sopenharmony_ci unmap_ref_private(mm, vma, old_page, haddr); 42218c2ecf20Sopenharmony_ci 42228c2ecf20Sopenharmony_ci i_mmap_lock_read(mapping); 42238c2ecf20Sopenharmony_ci mutex_lock(&hugetlb_fault_mutex_table[hash]); 42248c2ecf20Sopenharmony_ci spin_lock(ptl); 42258c2ecf20Sopenharmony_ci ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 42268c2ecf20Sopenharmony_ci if (likely(ptep && 42278c2ecf20Sopenharmony_ci pte_same(huge_ptep_get(ptep), pte))) 42288c2ecf20Sopenharmony_ci goto retry_avoidcopy; 42298c2ecf20Sopenharmony_ci /* 42308c2ecf20Sopenharmony_ci * race occurs while re-acquiring page table 42318c2ecf20Sopenharmony_ci * lock, and our job is done. 42328c2ecf20Sopenharmony_ci */ 42338c2ecf20Sopenharmony_ci return 0; 42348c2ecf20Sopenharmony_ci } 42358c2ecf20Sopenharmony_ci 42368c2ecf20Sopenharmony_ci ret = vmf_error(PTR_ERR(new_page)); 42378c2ecf20Sopenharmony_ci goto out_release_old; 42388c2ecf20Sopenharmony_ci } 42398c2ecf20Sopenharmony_ci 42408c2ecf20Sopenharmony_ci /* 42418c2ecf20Sopenharmony_ci * When the original hugepage is shared one, it does not have 42428c2ecf20Sopenharmony_ci * anon_vma prepared. 42438c2ecf20Sopenharmony_ci */ 42448c2ecf20Sopenharmony_ci if (unlikely(anon_vma_prepare(vma))) { 42458c2ecf20Sopenharmony_ci ret = VM_FAULT_OOM; 42468c2ecf20Sopenharmony_ci goto out_release_all; 42478c2ecf20Sopenharmony_ci } 42488c2ecf20Sopenharmony_ci 42498c2ecf20Sopenharmony_ci copy_user_huge_page(new_page, old_page, address, vma, 42508c2ecf20Sopenharmony_ci pages_per_huge_page(h)); 42518c2ecf20Sopenharmony_ci __SetPageUptodate(new_page); 42528c2ecf20Sopenharmony_ci 42538c2ecf20Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, 42548c2ecf20Sopenharmony_ci haddr + huge_page_size(h)); 42558c2ecf20Sopenharmony_ci mmu_notifier_invalidate_range_start(&range); 42568c2ecf20Sopenharmony_ci 42578c2ecf20Sopenharmony_ci /* 42588c2ecf20Sopenharmony_ci * Retake the page table lock to check for racing updates 42598c2ecf20Sopenharmony_ci * before the page tables are altered 42608c2ecf20Sopenharmony_ci */ 42618c2ecf20Sopenharmony_ci spin_lock(ptl); 42628c2ecf20Sopenharmony_ci ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 42638c2ecf20Sopenharmony_ci if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { 42648c2ecf20Sopenharmony_ci ClearPagePrivate(new_page); 42658c2ecf20Sopenharmony_ci 42668c2ecf20Sopenharmony_ci /* Break COW */ 42678c2ecf20Sopenharmony_ci huge_ptep_clear_flush(vma, haddr, ptep); 42688c2ecf20Sopenharmony_ci mmu_notifier_invalidate_range(mm, range.start, range.end); 42698c2ecf20Sopenharmony_ci set_huge_pte_at(mm, haddr, ptep, 42708c2ecf20Sopenharmony_ci make_huge_pte(vma, new_page, 1)); 42718c2ecf20Sopenharmony_ci page_remove_rmap(old_page, true); 42728c2ecf20Sopenharmony_ci hugepage_add_new_anon_rmap(new_page, vma, haddr); 42738c2ecf20Sopenharmony_ci set_page_huge_active(new_page); 42748c2ecf20Sopenharmony_ci /* Make the old page be freed below */ 42758c2ecf20Sopenharmony_ci new_page = old_page; 42768c2ecf20Sopenharmony_ci } 42778c2ecf20Sopenharmony_ci spin_unlock(ptl); 42788c2ecf20Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 42798c2ecf20Sopenharmony_ciout_release_all: 42808c2ecf20Sopenharmony_ci restore_reserve_on_error(h, vma, haddr, new_page); 42818c2ecf20Sopenharmony_ci put_page(new_page); 42828c2ecf20Sopenharmony_ciout_release_old: 42838c2ecf20Sopenharmony_ci put_page(old_page); 42848c2ecf20Sopenharmony_ci 42858c2ecf20Sopenharmony_ci spin_lock(ptl); /* Caller expects lock to be held */ 42868c2ecf20Sopenharmony_ci return ret; 42878c2ecf20Sopenharmony_ci} 42888c2ecf20Sopenharmony_ci 42898c2ecf20Sopenharmony_ci/* Return the pagecache page at a given address within a VMA */ 42908c2ecf20Sopenharmony_cistatic struct page *hugetlbfs_pagecache_page(struct hstate *h, 42918c2ecf20Sopenharmony_ci struct vm_area_struct *vma, unsigned long address) 42928c2ecf20Sopenharmony_ci{ 42938c2ecf20Sopenharmony_ci struct address_space *mapping; 42948c2ecf20Sopenharmony_ci pgoff_t idx; 42958c2ecf20Sopenharmony_ci 42968c2ecf20Sopenharmony_ci mapping = vma->vm_file->f_mapping; 42978c2ecf20Sopenharmony_ci idx = vma_hugecache_offset(h, vma, address); 42988c2ecf20Sopenharmony_ci 42998c2ecf20Sopenharmony_ci return find_lock_page(mapping, idx); 43008c2ecf20Sopenharmony_ci} 43018c2ecf20Sopenharmony_ci 43028c2ecf20Sopenharmony_ci/* 43038c2ecf20Sopenharmony_ci * Return whether there is a pagecache page to back given address within VMA. 43048c2ecf20Sopenharmony_ci * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. 43058c2ecf20Sopenharmony_ci */ 43068c2ecf20Sopenharmony_cistatic bool hugetlbfs_pagecache_present(struct hstate *h, 43078c2ecf20Sopenharmony_ci struct vm_area_struct *vma, unsigned long address) 43088c2ecf20Sopenharmony_ci{ 43098c2ecf20Sopenharmony_ci struct address_space *mapping; 43108c2ecf20Sopenharmony_ci pgoff_t idx; 43118c2ecf20Sopenharmony_ci struct page *page; 43128c2ecf20Sopenharmony_ci 43138c2ecf20Sopenharmony_ci mapping = vma->vm_file->f_mapping; 43148c2ecf20Sopenharmony_ci idx = vma_hugecache_offset(h, vma, address); 43158c2ecf20Sopenharmony_ci 43168c2ecf20Sopenharmony_ci page = find_get_page(mapping, idx); 43178c2ecf20Sopenharmony_ci if (page) 43188c2ecf20Sopenharmony_ci put_page(page); 43198c2ecf20Sopenharmony_ci return page != NULL; 43208c2ecf20Sopenharmony_ci} 43218c2ecf20Sopenharmony_ci 43228c2ecf20Sopenharmony_ciint huge_add_to_page_cache(struct page *page, struct address_space *mapping, 43238c2ecf20Sopenharmony_ci pgoff_t idx) 43248c2ecf20Sopenharmony_ci{ 43258c2ecf20Sopenharmony_ci struct inode *inode = mapping->host; 43268c2ecf20Sopenharmony_ci struct hstate *h = hstate_inode(inode); 43278c2ecf20Sopenharmony_ci int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 43288c2ecf20Sopenharmony_ci 43298c2ecf20Sopenharmony_ci if (err) 43308c2ecf20Sopenharmony_ci return err; 43318c2ecf20Sopenharmony_ci ClearPagePrivate(page); 43328c2ecf20Sopenharmony_ci 43338c2ecf20Sopenharmony_ci /* 43348c2ecf20Sopenharmony_ci * set page dirty so that it will not be removed from cache/file 43358c2ecf20Sopenharmony_ci * by non-hugetlbfs specific code paths. 43368c2ecf20Sopenharmony_ci */ 43378c2ecf20Sopenharmony_ci set_page_dirty(page); 43388c2ecf20Sopenharmony_ci 43398c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 43408c2ecf20Sopenharmony_ci inode->i_blocks += blocks_per_huge_page(h); 43418c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 43428c2ecf20Sopenharmony_ci return 0; 43438c2ecf20Sopenharmony_ci} 43448c2ecf20Sopenharmony_ci 43458c2ecf20Sopenharmony_cistatic vm_fault_t hugetlb_no_page(struct mm_struct *mm, 43468c2ecf20Sopenharmony_ci struct vm_area_struct *vma, 43478c2ecf20Sopenharmony_ci struct address_space *mapping, pgoff_t idx, 43488c2ecf20Sopenharmony_ci unsigned long address, pte_t *ptep, unsigned int flags) 43498c2ecf20Sopenharmony_ci{ 43508c2ecf20Sopenharmony_ci struct hstate *h = hstate_vma(vma); 43518c2ecf20Sopenharmony_ci vm_fault_t ret = VM_FAULT_SIGBUS; 43528c2ecf20Sopenharmony_ci int anon_rmap = 0; 43538c2ecf20Sopenharmony_ci unsigned long size; 43548c2ecf20Sopenharmony_ci struct page *page; 43558c2ecf20Sopenharmony_ci pte_t new_pte; 43568c2ecf20Sopenharmony_ci spinlock_t *ptl; 43578c2ecf20Sopenharmony_ci unsigned long haddr = address & huge_page_mask(h); 43588c2ecf20Sopenharmony_ci bool new_page = false; 43598c2ecf20Sopenharmony_ci u32 hash = hugetlb_fault_mutex_hash(mapping, idx); 43608c2ecf20Sopenharmony_ci 43618c2ecf20Sopenharmony_ci /* 43628c2ecf20Sopenharmony_ci * Currently, we are forced to kill the process in the event the 43638c2ecf20Sopenharmony_ci * original mapper has unmapped pages from the child due to a failed 43648c2ecf20Sopenharmony_ci * COW. Warn that such a situation has occurred as it may not be obvious 43658c2ecf20Sopenharmony_ci */ 43668c2ecf20Sopenharmony_ci if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 43678c2ecf20Sopenharmony_ci pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", 43688c2ecf20Sopenharmony_ci current->pid); 43698c2ecf20Sopenharmony_ci goto out; 43708c2ecf20Sopenharmony_ci } 43718c2ecf20Sopenharmony_ci 43728c2ecf20Sopenharmony_ci /* 43738c2ecf20Sopenharmony_ci * We can not race with truncation due to holding i_mmap_rwsem. 43748c2ecf20Sopenharmony_ci * i_size is modified when holding i_mmap_rwsem, so check here 43758c2ecf20Sopenharmony_ci * once for faults beyond end of file. 43768c2ecf20Sopenharmony_ci */ 43778c2ecf20Sopenharmony_ci size = i_size_read(mapping->host) >> huge_page_shift(h); 43788c2ecf20Sopenharmony_ci if (idx >= size) 43798c2ecf20Sopenharmony_ci goto out; 43808c2ecf20Sopenharmony_ci 43818c2ecf20Sopenharmony_ciretry: 43828c2ecf20Sopenharmony_ci page = find_lock_page(mapping, idx); 43838c2ecf20Sopenharmony_ci if (!page) { 43848c2ecf20Sopenharmony_ci /* 43858c2ecf20Sopenharmony_ci * Check for page in userfault range 43868c2ecf20Sopenharmony_ci */ 43878c2ecf20Sopenharmony_ci if (userfaultfd_missing(vma)) { 43888c2ecf20Sopenharmony_ci struct vm_fault vmf = { 43898c2ecf20Sopenharmony_ci .vma = vma, 43908c2ecf20Sopenharmony_ci .address = haddr, 43918c2ecf20Sopenharmony_ci .flags = flags, 43928c2ecf20Sopenharmony_ci /* 43938c2ecf20Sopenharmony_ci * Hard to debug if it ends up being 43948c2ecf20Sopenharmony_ci * used by a callee that assumes 43958c2ecf20Sopenharmony_ci * something about the other 43968c2ecf20Sopenharmony_ci * uninitialized fields... same as in 43978c2ecf20Sopenharmony_ci * memory.c 43988c2ecf20Sopenharmony_ci */ 43998c2ecf20Sopenharmony_ci }; 44008c2ecf20Sopenharmony_ci 44018c2ecf20Sopenharmony_ci /* 44028c2ecf20Sopenharmony_ci * vma_lock and hugetlb_fault_mutex must be dropped 44038c2ecf20Sopenharmony_ci * before handling userfault. Also mmap_lock will 44048c2ecf20Sopenharmony_ci * be dropped during handling userfault, any vma 44058c2ecf20Sopenharmony_ci * operation should be careful from here. 44068c2ecf20Sopenharmony_ci */ 44078c2ecf20Sopenharmony_ci mutex_unlock(&hugetlb_fault_mutex_table[hash]); 44088c2ecf20Sopenharmony_ci i_mmap_unlock_read(mapping); 44098c2ecf20Sopenharmony_ci return handle_userfault(&vmf, VM_UFFD_MISSING); 44108c2ecf20Sopenharmony_ci } 44118c2ecf20Sopenharmony_ci 44128c2ecf20Sopenharmony_ci page = alloc_huge_page(vma, haddr, 0); 44138c2ecf20Sopenharmony_ci if (IS_ERR(page)) { 44148c2ecf20Sopenharmony_ci /* 44158c2ecf20Sopenharmony_ci * Returning error will result in faulting task being 44168c2ecf20Sopenharmony_ci * sent SIGBUS. The hugetlb fault mutex prevents two 44178c2ecf20Sopenharmony_ci * tasks from racing to fault in the same page which 44188c2ecf20Sopenharmony_ci * could result in false unable to allocate errors. 44198c2ecf20Sopenharmony_ci * Page migration does not take the fault mutex, but 44208c2ecf20Sopenharmony_ci * does a clear then write of pte's under page table 44218c2ecf20Sopenharmony_ci * lock. Page fault code could race with migration, 44228c2ecf20Sopenharmony_ci * notice the clear pte and try to allocate a page 44238c2ecf20Sopenharmony_ci * here. Before returning error, get ptl and make 44248c2ecf20Sopenharmony_ci * sure there really is no pte entry. 44258c2ecf20Sopenharmony_ci */ 44268c2ecf20Sopenharmony_ci ptl = huge_pte_lock(h, mm, ptep); 44278c2ecf20Sopenharmony_ci if (!huge_pte_none(huge_ptep_get(ptep))) { 44288c2ecf20Sopenharmony_ci ret = 0; 44298c2ecf20Sopenharmony_ci spin_unlock(ptl); 44308c2ecf20Sopenharmony_ci goto out; 44318c2ecf20Sopenharmony_ci } 44328c2ecf20Sopenharmony_ci spin_unlock(ptl); 44338c2ecf20Sopenharmony_ci ret = vmf_error(PTR_ERR(page)); 44348c2ecf20Sopenharmony_ci goto out; 44358c2ecf20Sopenharmony_ci } 44368c2ecf20Sopenharmony_ci clear_huge_page(page, address, pages_per_huge_page(h)); 44378c2ecf20Sopenharmony_ci __SetPageUptodate(page); 44388c2ecf20Sopenharmony_ci new_page = true; 44398c2ecf20Sopenharmony_ci 44408c2ecf20Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE) { 44418c2ecf20Sopenharmony_ci int err = huge_add_to_page_cache(page, mapping, idx); 44428c2ecf20Sopenharmony_ci if (err) { 44438c2ecf20Sopenharmony_ci put_page(page); 44448c2ecf20Sopenharmony_ci if (err == -EEXIST) 44458c2ecf20Sopenharmony_ci goto retry; 44468c2ecf20Sopenharmony_ci goto out; 44478c2ecf20Sopenharmony_ci } 44488c2ecf20Sopenharmony_ci } else { 44498c2ecf20Sopenharmony_ci lock_page(page); 44508c2ecf20Sopenharmony_ci if (unlikely(anon_vma_prepare(vma))) { 44518c2ecf20Sopenharmony_ci ret = VM_FAULT_OOM; 44528c2ecf20Sopenharmony_ci goto backout_unlocked; 44538c2ecf20Sopenharmony_ci } 44548c2ecf20Sopenharmony_ci anon_rmap = 1; 44558c2ecf20Sopenharmony_ci } 44568c2ecf20Sopenharmony_ci } else { 44578c2ecf20Sopenharmony_ci /* 44588c2ecf20Sopenharmony_ci * If memory error occurs between mmap() and fault, some process 44598c2ecf20Sopenharmony_ci * don't have hwpoisoned swap entry for errored virtual address. 44608c2ecf20Sopenharmony_ci * So we need to block hugepage fault by PG_hwpoison bit check. 44618c2ecf20Sopenharmony_ci */ 44628c2ecf20Sopenharmony_ci if (unlikely(PageHWPoison(page))) { 44638c2ecf20Sopenharmony_ci ret = VM_FAULT_HWPOISON_LARGE | 44648c2ecf20Sopenharmony_ci VM_FAULT_SET_HINDEX(hstate_index(h)); 44658c2ecf20Sopenharmony_ci goto backout_unlocked; 44668c2ecf20Sopenharmony_ci } 44678c2ecf20Sopenharmony_ci } 44688c2ecf20Sopenharmony_ci 44698c2ecf20Sopenharmony_ci /* 44708c2ecf20Sopenharmony_ci * If we are going to COW a private mapping later, we examine the 44718c2ecf20Sopenharmony_ci * pending reservations for this page now. This will ensure that 44728c2ecf20Sopenharmony_ci * any allocations necessary to record that reservation occur outside 44738c2ecf20Sopenharmony_ci * the spinlock. 44748c2ecf20Sopenharmony_ci */ 44758c2ecf20Sopenharmony_ci if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 44768c2ecf20Sopenharmony_ci if (vma_needs_reservation(h, vma, haddr) < 0) { 44778c2ecf20Sopenharmony_ci ret = VM_FAULT_OOM; 44788c2ecf20Sopenharmony_ci goto backout_unlocked; 44798c2ecf20Sopenharmony_ci } 44808c2ecf20Sopenharmony_ci /* Just decrements count, does not deallocate */ 44818c2ecf20Sopenharmony_ci vma_end_reservation(h, vma, haddr); 44828c2ecf20Sopenharmony_ci } 44838c2ecf20Sopenharmony_ci 44848c2ecf20Sopenharmony_ci ptl = huge_pte_lock(h, mm, ptep); 44858c2ecf20Sopenharmony_ci ret = 0; 44868c2ecf20Sopenharmony_ci if (!huge_pte_none(huge_ptep_get(ptep))) 44878c2ecf20Sopenharmony_ci goto backout; 44888c2ecf20Sopenharmony_ci 44898c2ecf20Sopenharmony_ci if (anon_rmap) { 44908c2ecf20Sopenharmony_ci ClearPagePrivate(page); 44918c2ecf20Sopenharmony_ci hugepage_add_new_anon_rmap(page, vma, haddr); 44928c2ecf20Sopenharmony_ci } else 44938c2ecf20Sopenharmony_ci page_dup_rmap(page, true); 44948c2ecf20Sopenharmony_ci new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 44958c2ecf20Sopenharmony_ci && (vma->vm_flags & VM_SHARED))); 44968c2ecf20Sopenharmony_ci set_huge_pte_at(mm, haddr, ptep, new_pte); 44978c2ecf20Sopenharmony_ci 44988c2ecf20Sopenharmony_ci hugetlb_count_add(pages_per_huge_page(h), mm); 44998c2ecf20Sopenharmony_ci if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 45008c2ecf20Sopenharmony_ci /* Optimization, do the COW without a second fault */ 45018c2ecf20Sopenharmony_ci ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); 45028c2ecf20Sopenharmony_ci } 45038c2ecf20Sopenharmony_ci 45048c2ecf20Sopenharmony_ci spin_unlock(ptl); 45058c2ecf20Sopenharmony_ci 45068c2ecf20Sopenharmony_ci /* 45078c2ecf20Sopenharmony_ci * Only make newly allocated pages active. Existing pages found 45088c2ecf20Sopenharmony_ci * in the pagecache could be !page_huge_active() if they have been 45098c2ecf20Sopenharmony_ci * isolated for migration. 45108c2ecf20Sopenharmony_ci */ 45118c2ecf20Sopenharmony_ci if (new_page) 45128c2ecf20Sopenharmony_ci set_page_huge_active(page); 45138c2ecf20Sopenharmony_ci 45148c2ecf20Sopenharmony_ci unlock_page(page); 45158c2ecf20Sopenharmony_ciout: 45168c2ecf20Sopenharmony_ci mutex_unlock(&hugetlb_fault_mutex_table[hash]); 45178c2ecf20Sopenharmony_ci i_mmap_unlock_read(mapping); 45188c2ecf20Sopenharmony_ci return ret; 45198c2ecf20Sopenharmony_ci 45208c2ecf20Sopenharmony_cibackout: 45218c2ecf20Sopenharmony_ci spin_unlock(ptl); 45228c2ecf20Sopenharmony_cibackout_unlocked: 45238c2ecf20Sopenharmony_ci unlock_page(page); 45248c2ecf20Sopenharmony_ci restore_reserve_on_error(h, vma, haddr, page); 45258c2ecf20Sopenharmony_ci put_page(page); 45268c2ecf20Sopenharmony_ci goto out; 45278c2ecf20Sopenharmony_ci} 45288c2ecf20Sopenharmony_ci 45298c2ecf20Sopenharmony_ci#ifdef CONFIG_SMP 45308c2ecf20Sopenharmony_ciu32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 45318c2ecf20Sopenharmony_ci{ 45328c2ecf20Sopenharmony_ci unsigned long key[2]; 45338c2ecf20Sopenharmony_ci u32 hash; 45348c2ecf20Sopenharmony_ci 45358c2ecf20Sopenharmony_ci key[0] = (unsigned long) mapping; 45368c2ecf20Sopenharmony_ci key[1] = idx; 45378c2ecf20Sopenharmony_ci 45388c2ecf20Sopenharmony_ci hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); 45398c2ecf20Sopenharmony_ci 45408c2ecf20Sopenharmony_ci return hash & (num_fault_mutexes - 1); 45418c2ecf20Sopenharmony_ci} 45428c2ecf20Sopenharmony_ci#else 45438c2ecf20Sopenharmony_ci/* 45448c2ecf20Sopenharmony_ci * For uniprocesor systems we always use a single mutex, so just 45458c2ecf20Sopenharmony_ci * return 0 and avoid the hashing overhead. 45468c2ecf20Sopenharmony_ci */ 45478c2ecf20Sopenharmony_ciu32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) 45488c2ecf20Sopenharmony_ci{ 45498c2ecf20Sopenharmony_ci return 0; 45508c2ecf20Sopenharmony_ci} 45518c2ecf20Sopenharmony_ci#endif 45528c2ecf20Sopenharmony_ci 45538c2ecf20Sopenharmony_civm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 45548c2ecf20Sopenharmony_ci unsigned long address, unsigned int flags) 45558c2ecf20Sopenharmony_ci{ 45568c2ecf20Sopenharmony_ci pte_t *ptep, entry; 45578c2ecf20Sopenharmony_ci spinlock_t *ptl; 45588c2ecf20Sopenharmony_ci vm_fault_t ret; 45598c2ecf20Sopenharmony_ci u32 hash; 45608c2ecf20Sopenharmony_ci pgoff_t idx; 45618c2ecf20Sopenharmony_ci struct page *page = NULL; 45628c2ecf20Sopenharmony_ci struct page *pagecache_page = NULL; 45638c2ecf20Sopenharmony_ci struct hstate *h = hstate_vma(vma); 45648c2ecf20Sopenharmony_ci struct address_space *mapping; 45658c2ecf20Sopenharmony_ci int need_wait_lock = 0; 45668c2ecf20Sopenharmony_ci unsigned long haddr = address & huge_page_mask(h); 45678c2ecf20Sopenharmony_ci 45688c2ecf20Sopenharmony_ci ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 45698c2ecf20Sopenharmony_ci if (ptep) { 45708c2ecf20Sopenharmony_ci /* 45718c2ecf20Sopenharmony_ci * Since we hold no locks, ptep could be stale. That is 45728c2ecf20Sopenharmony_ci * OK as we are only making decisions based on content and 45738c2ecf20Sopenharmony_ci * not actually modifying content here. 45748c2ecf20Sopenharmony_ci */ 45758c2ecf20Sopenharmony_ci entry = huge_ptep_get(ptep); 45768c2ecf20Sopenharmony_ci if (unlikely(is_hugetlb_entry_migration(entry))) { 45778c2ecf20Sopenharmony_ci migration_entry_wait_huge(vma, mm, ptep); 45788c2ecf20Sopenharmony_ci return 0; 45798c2ecf20Sopenharmony_ci } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 45808c2ecf20Sopenharmony_ci return VM_FAULT_HWPOISON_LARGE | 45818c2ecf20Sopenharmony_ci VM_FAULT_SET_HINDEX(hstate_index(h)); 45828c2ecf20Sopenharmony_ci } 45838c2ecf20Sopenharmony_ci 45848c2ecf20Sopenharmony_ci /* 45858c2ecf20Sopenharmony_ci * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold 45868c2ecf20Sopenharmony_ci * until finished with ptep. This serves two purposes: 45878c2ecf20Sopenharmony_ci * 1) It prevents huge_pmd_unshare from being called elsewhere 45888c2ecf20Sopenharmony_ci * and making the ptep no longer valid. 45898c2ecf20Sopenharmony_ci * 2) It synchronizes us with i_size modifications during truncation. 45908c2ecf20Sopenharmony_ci * 45918c2ecf20Sopenharmony_ci * ptep could have already be assigned via huge_pte_offset. That 45928c2ecf20Sopenharmony_ci * is OK, as huge_pte_alloc will return the same value unless 45938c2ecf20Sopenharmony_ci * something has changed. 45948c2ecf20Sopenharmony_ci */ 45958c2ecf20Sopenharmony_ci mapping = vma->vm_file->f_mapping; 45968c2ecf20Sopenharmony_ci i_mmap_lock_read(mapping); 45978c2ecf20Sopenharmony_ci ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); 45988c2ecf20Sopenharmony_ci if (!ptep) { 45998c2ecf20Sopenharmony_ci i_mmap_unlock_read(mapping); 46008c2ecf20Sopenharmony_ci return VM_FAULT_OOM; 46018c2ecf20Sopenharmony_ci } 46028c2ecf20Sopenharmony_ci 46038c2ecf20Sopenharmony_ci /* 46048c2ecf20Sopenharmony_ci * Serialize hugepage allocation and instantiation, so that we don't 46058c2ecf20Sopenharmony_ci * get spurious allocation failures if two CPUs race to instantiate 46068c2ecf20Sopenharmony_ci * the same page in the page cache. 46078c2ecf20Sopenharmony_ci */ 46088c2ecf20Sopenharmony_ci idx = vma_hugecache_offset(h, vma, haddr); 46098c2ecf20Sopenharmony_ci hash = hugetlb_fault_mutex_hash(mapping, idx); 46108c2ecf20Sopenharmony_ci mutex_lock(&hugetlb_fault_mutex_table[hash]); 46118c2ecf20Sopenharmony_ci 46128c2ecf20Sopenharmony_ci entry = huge_ptep_get(ptep); 46138c2ecf20Sopenharmony_ci if (huge_pte_none(entry)) 46148c2ecf20Sopenharmony_ci /* 46158c2ecf20Sopenharmony_ci * hugetlb_no_page will drop vma lock and hugetlb fault 46168c2ecf20Sopenharmony_ci * mutex internally, which make us return immediately. 46178c2ecf20Sopenharmony_ci */ 46188c2ecf20Sopenharmony_ci return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); 46198c2ecf20Sopenharmony_ci 46208c2ecf20Sopenharmony_ci ret = 0; 46218c2ecf20Sopenharmony_ci 46228c2ecf20Sopenharmony_ci /* 46238c2ecf20Sopenharmony_ci * entry could be a migration/hwpoison entry at this point, so this 46248c2ecf20Sopenharmony_ci * check prevents the kernel from going below assuming that we have 46258c2ecf20Sopenharmony_ci * an active hugepage in pagecache. This goto expects the 2nd page 46268c2ecf20Sopenharmony_ci * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will 46278c2ecf20Sopenharmony_ci * properly handle it. 46288c2ecf20Sopenharmony_ci */ 46298c2ecf20Sopenharmony_ci if (!pte_present(entry)) 46308c2ecf20Sopenharmony_ci goto out_mutex; 46318c2ecf20Sopenharmony_ci 46328c2ecf20Sopenharmony_ci /* 46338c2ecf20Sopenharmony_ci * If we are going to COW the mapping later, we examine the pending 46348c2ecf20Sopenharmony_ci * reservations for this page now. This will ensure that any 46358c2ecf20Sopenharmony_ci * allocations necessary to record that reservation occur outside the 46368c2ecf20Sopenharmony_ci * spinlock. For private mappings, we also lookup the pagecache 46378c2ecf20Sopenharmony_ci * page now as it is used to determine if a reservation has been 46388c2ecf20Sopenharmony_ci * consumed. 46398c2ecf20Sopenharmony_ci */ 46408c2ecf20Sopenharmony_ci if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { 46418c2ecf20Sopenharmony_ci if (vma_needs_reservation(h, vma, haddr) < 0) { 46428c2ecf20Sopenharmony_ci ret = VM_FAULT_OOM; 46438c2ecf20Sopenharmony_ci goto out_mutex; 46448c2ecf20Sopenharmony_ci } 46458c2ecf20Sopenharmony_ci /* Just decrements count, does not deallocate */ 46468c2ecf20Sopenharmony_ci vma_end_reservation(h, vma, haddr); 46478c2ecf20Sopenharmony_ci 46488c2ecf20Sopenharmony_ci if (!(vma->vm_flags & VM_MAYSHARE)) 46498c2ecf20Sopenharmony_ci pagecache_page = hugetlbfs_pagecache_page(h, 46508c2ecf20Sopenharmony_ci vma, haddr); 46518c2ecf20Sopenharmony_ci } 46528c2ecf20Sopenharmony_ci 46538c2ecf20Sopenharmony_ci ptl = huge_pte_lock(h, mm, ptep); 46548c2ecf20Sopenharmony_ci 46558c2ecf20Sopenharmony_ci /* Check for a racing update before calling hugetlb_cow */ 46568c2ecf20Sopenharmony_ci if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 46578c2ecf20Sopenharmony_ci goto out_ptl; 46588c2ecf20Sopenharmony_ci 46598c2ecf20Sopenharmony_ci /* 46608c2ecf20Sopenharmony_ci * hugetlb_cow() requires page locks of pte_page(entry) and 46618c2ecf20Sopenharmony_ci * pagecache_page, so here we need take the former one 46628c2ecf20Sopenharmony_ci * when page != pagecache_page or !pagecache_page. 46638c2ecf20Sopenharmony_ci */ 46648c2ecf20Sopenharmony_ci page = pte_page(entry); 46658c2ecf20Sopenharmony_ci if (page != pagecache_page) 46668c2ecf20Sopenharmony_ci if (!trylock_page(page)) { 46678c2ecf20Sopenharmony_ci need_wait_lock = 1; 46688c2ecf20Sopenharmony_ci goto out_ptl; 46698c2ecf20Sopenharmony_ci } 46708c2ecf20Sopenharmony_ci 46718c2ecf20Sopenharmony_ci get_page(page); 46728c2ecf20Sopenharmony_ci 46738c2ecf20Sopenharmony_ci if (flags & FAULT_FLAG_WRITE) { 46748c2ecf20Sopenharmony_ci if (!huge_pte_write(entry)) { 46758c2ecf20Sopenharmony_ci ret = hugetlb_cow(mm, vma, address, ptep, 46768c2ecf20Sopenharmony_ci pagecache_page, ptl); 46778c2ecf20Sopenharmony_ci goto out_put_page; 46788c2ecf20Sopenharmony_ci } 46798c2ecf20Sopenharmony_ci entry = huge_pte_mkdirty(entry); 46808c2ecf20Sopenharmony_ci } 46818c2ecf20Sopenharmony_ci entry = pte_mkyoung(entry); 46828c2ecf20Sopenharmony_ci if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, 46838c2ecf20Sopenharmony_ci flags & FAULT_FLAG_WRITE)) 46848c2ecf20Sopenharmony_ci update_mmu_cache(vma, haddr, ptep); 46858c2ecf20Sopenharmony_ciout_put_page: 46868c2ecf20Sopenharmony_ci if (page != pagecache_page) 46878c2ecf20Sopenharmony_ci unlock_page(page); 46888c2ecf20Sopenharmony_ci put_page(page); 46898c2ecf20Sopenharmony_ciout_ptl: 46908c2ecf20Sopenharmony_ci spin_unlock(ptl); 46918c2ecf20Sopenharmony_ci 46928c2ecf20Sopenharmony_ci if (pagecache_page) { 46938c2ecf20Sopenharmony_ci unlock_page(pagecache_page); 46948c2ecf20Sopenharmony_ci put_page(pagecache_page); 46958c2ecf20Sopenharmony_ci } 46968c2ecf20Sopenharmony_ciout_mutex: 46978c2ecf20Sopenharmony_ci mutex_unlock(&hugetlb_fault_mutex_table[hash]); 46988c2ecf20Sopenharmony_ci i_mmap_unlock_read(mapping); 46998c2ecf20Sopenharmony_ci /* 47008c2ecf20Sopenharmony_ci * Generally it's safe to hold refcount during waiting page lock. But 47018c2ecf20Sopenharmony_ci * here we just wait to defer the next page fault to avoid busy loop and 47028c2ecf20Sopenharmony_ci * the page is not used after unlocked before returning from the current 47038c2ecf20Sopenharmony_ci * page fault. So we are safe from accessing freed page, even if we wait 47048c2ecf20Sopenharmony_ci * here without taking refcount. 47058c2ecf20Sopenharmony_ci */ 47068c2ecf20Sopenharmony_ci if (need_wait_lock) 47078c2ecf20Sopenharmony_ci wait_on_page_locked(page); 47088c2ecf20Sopenharmony_ci return ret; 47098c2ecf20Sopenharmony_ci} 47108c2ecf20Sopenharmony_ci 47118c2ecf20Sopenharmony_ci/* 47128c2ecf20Sopenharmony_ci * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with 47138c2ecf20Sopenharmony_ci * modifications for huge pages. 47148c2ecf20Sopenharmony_ci */ 47158c2ecf20Sopenharmony_ciint hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, 47168c2ecf20Sopenharmony_ci pte_t *dst_pte, 47178c2ecf20Sopenharmony_ci struct vm_area_struct *dst_vma, 47188c2ecf20Sopenharmony_ci unsigned long dst_addr, 47198c2ecf20Sopenharmony_ci unsigned long src_addr, 47208c2ecf20Sopenharmony_ci struct page **pagep) 47218c2ecf20Sopenharmony_ci{ 47228c2ecf20Sopenharmony_ci struct address_space *mapping; 47238c2ecf20Sopenharmony_ci pgoff_t idx; 47248c2ecf20Sopenharmony_ci unsigned long size; 47258c2ecf20Sopenharmony_ci int vm_shared = dst_vma->vm_flags & VM_SHARED; 47268c2ecf20Sopenharmony_ci struct hstate *h = hstate_vma(dst_vma); 47278c2ecf20Sopenharmony_ci pte_t _dst_pte; 47288c2ecf20Sopenharmony_ci spinlock_t *ptl; 47298c2ecf20Sopenharmony_ci int ret; 47308c2ecf20Sopenharmony_ci struct page *page; 47318c2ecf20Sopenharmony_ci 47328c2ecf20Sopenharmony_ci if (!*pagep) { 47338c2ecf20Sopenharmony_ci /* If a page already exists, then it's UFFDIO_COPY for 47348c2ecf20Sopenharmony_ci * a non-missing case. Return -EEXIST. 47358c2ecf20Sopenharmony_ci */ 47368c2ecf20Sopenharmony_ci if (vm_shared && 47378c2ecf20Sopenharmony_ci hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { 47388c2ecf20Sopenharmony_ci ret = -EEXIST; 47398c2ecf20Sopenharmony_ci goto out; 47408c2ecf20Sopenharmony_ci } 47418c2ecf20Sopenharmony_ci 47428c2ecf20Sopenharmony_ci page = alloc_huge_page(dst_vma, dst_addr, 0); 47438c2ecf20Sopenharmony_ci if (IS_ERR(page)) { 47448c2ecf20Sopenharmony_ci ret = -ENOMEM; 47458c2ecf20Sopenharmony_ci goto out; 47468c2ecf20Sopenharmony_ci } 47478c2ecf20Sopenharmony_ci 47488c2ecf20Sopenharmony_ci ret = copy_huge_page_from_user(page, 47498c2ecf20Sopenharmony_ci (const void __user *) src_addr, 47508c2ecf20Sopenharmony_ci pages_per_huge_page(h), false); 47518c2ecf20Sopenharmony_ci 47528c2ecf20Sopenharmony_ci /* fallback to copy_from_user outside mmap_lock */ 47538c2ecf20Sopenharmony_ci if (unlikely(ret)) { 47548c2ecf20Sopenharmony_ci ret = -ENOENT; 47558c2ecf20Sopenharmony_ci *pagep = page; 47568c2ecf20Sopenharmony_ci /* don't free the page */ 47578c2ecf20Sopenharmony_ci goto out; 47588c2ecf20Sopenharmony_ci } 47598c2ecf20Sopenharmony_ci } else { 47608c2ecf20Sopenharmony_ci page = *pagep; 47618c2ecf20Sopenharmony_ci *pagep = NULL; 47628c2ecf20Sopenharmony_ci } 47638c2ecf20Sopenharmony_ci 47648c2ecf20Sopenharmony_ci /* 47658c2ecf20Sopenharmony_ci * The memory barrier inside __SetPageUptodate makes sure that 47668c2ecf20Sopenharmony_ci * preceding stores to the page contents become visible before 47678c2ecf20Sopenharmony_ci * the set_pte_at() write. 47688c2ecf20Sopenharmony_ci */ 47698c2ecf20Sopenharmony_ci __SetPageUptodate(page); 47708c2ecf20Sopenharmony_ci 47718c2ecf20Sopenharmony_ci mapping = dst_vma->vm_file->f_mapping; 47728c2ecf20Sopenharmony_ci idx = vma_hugecache_offset(h, dst_vma, dst_addr); 47738c2ecf20Sopenharmony_ci 47748c2ecf20Sopenharmony_ci /* 47758c2ecf20Sopenharmony_ci * If shared, add to page cache 47768c2ecf20Sopenharmony_ci */ 47778c2ecf20Sopenharmony_ci if (vm_shared) { 47788c2ecf20Sopenharmony_ci size = i_size_read(mapping->host) >> huge_page_shift(h); 47798c2ecf20Sopenharmony_ci ret = -EFAULT; 47808c2ecf20Sopenharmony_ci if (idx >= size) 47818c2ecf20Sopenharmony_ci goto out_release_nounlock; 47828c2ecf20Sopenharmony_ci 47838c2ecf20Sopenharmony_ci /* 47848c2ecf20Sopenharmony_ci * Serialization between remove_inode_hugepages() and 47858c2ecf20Sopenharmony_ci * huge_add_to_page_cache() below happens through the 47868c2ecf20Sopenharmony_ci * hugetlb_fault_mutex_table that here must be hold by 47878c2ecf20Sopenharmony_ci * the caller. 47888c2ecf20Sopenharmony_ci */ 47898c2ecf20Sopenharmony_ci ret = huge_add_to_page_cache(page, mapping, idx); 47908c2ecf20Sopenharmony_ci if (ret) 47918c2ecf20Sopenharmony_ci goto out_release_nounlock; 47928c2ecf20Sopenharmony_ci } 47938c2ecf20Sopenharmony_ci 47948c2ecf20Sopenharmony_ci ptl = huge_pte_lockptr(h, dst_mm, dst_pte); 47958c2ecf20Sopenharmony_ci spin_lock(ptl); 47968c2ecf20Sopenharmony_ci 47978c2ecf20Sopenharmony_ci /* 47988c2ecf20Sopenharmony_ci * Recheck the i_size after holding PT lock to make sure not 47998c2ecf20Sopenharmony_ci * to leave any page mapped (as page_mapped()) beyond the end 48008c2ecf20Sopenharmony_ci * of the i_size (remove_inode_hugepages() is strict about 48018c2ecf20Sopenharmony_ci * enforcing that). If we bail out here, we'll also leave a 48028c2ecf20Sopenharmony_ci * page in the radix tree in the vm_shared case beyond the end 48038c2ecf20Sopenharmony_ci * of the i_size, but remove_inode_hugepages() will take care 48048c2ecf20Sopenharmony_ci * of it as soon as we drop the hugetlb_fault_mutex_table. 48058c2ecf20Sopenharmony_ci */ 48068c2ecf20Sopenharmony_ci size = i_size_read(mapping->host) >> huge_page_shift(h); 48078c2ecf20Sopenharmony_ci ret = -EFAULT; 48088c2ecf20Sopenharmony_ci if (idx >= size) 48098c2ecf20Sopenharmony_ci goto out_release_unlock; 48108c2ecf20Sopenharmony_ci 48118c2ecf20Sopenharmony_ci ret = -EEXIST; 48128c2ecf20Sopenharmony_ci if (!huge_pte_none(huge_ptep_get(dst_pte))) 48138c2ecf20Sopenharmony_ci goto out_release_unlock; 48148c2ecf20Sopenharmony_ci 48158c2ecf20Sopenharmony_ci if (vm_shared) { 48168c2ecf20Sopenharmony_ci page_dup_rmap(page, true); 48178c2ecf20Sopenharmony_ci } else { 48188c2ecf20Sopenharmony_ci ClearPagePrivate(page); 48198c2ecf20Sopenharmony_ci hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); 48208c2ecf20Sopenharmony_ci } 48218c2ecf20Sopenharmony_ci 48228c2ecf20Sopenharmony_ci _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); 48238c2ecf20Sopenharmony_ci if (dst_vma->vm_flags & VM_WRITE) 48248c2ecf20Sopenharmony_ci _dst_pte = huge_pte_mkdirty(_dst_pte); 48258c2ecf20Sopenharmony_ci _dst_pte = pte_mkyoung(_dst_pte); 48268c2ecf20Sopenharmony_ci 48278c2ecf20Sopenharmony_ci set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 48288c2ecf20Sopenharmony_ci 48298c2ecf20Sopenharmony_ci (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, 48308c2ecf20Sopenharmony_ci dst_vma->vm_flags & VM_WRITE); 48318c2ecf20Sopenharmony_ci hugetlb_count_add(pages_per_huge_page(h), dst_mm); 48328c2ecf20Sopenharmony_ci 48338c2ecf20Sopenharmony_ci /* No need to invalidate - it was non-present before */ 48348c2ecf20Sopenharmony_ci update_mmu_cache(dst_vma, dst_addr, dst_pte); 48358c2ecf20Sopenharmony_ci 48368c2ecf20Sopenharmony_ci spin_unlock(ptl); 48378c2ecf20Sopenharmony_ci set_page_huge_active(page); 48388c2ecf20Sopenharmony_ci if (vm_shared) 48398c2ecf20Sopenharmony_ci unlock_page(page); 48408c2ecf20Sopenharmony_ci ret = 0; 48418c2ecf20Sopenharmony_ciout: 48428c2ecf20Sopenharmony_ci return ret; 48438c2ecf20Sopenharmony_ciout_release_unlock: 48448c2ecf20Sopenharmony_ci spin_unlock(ptl); 48458c2ecf20Sopenharmony_ci if (vm_shared) 48468c2ecf20Sopenharmony_ci unlock_page(page); 48478c2ecf20Sopenharmony_ciout_release_nounlock: 48488c2ecf20Sopenharmony_ci put_page(page); 48498c2ecf20Sopenharmony_ci goto out; 48508c2ecf20Sopenharmony_ci} 48518c2ecf20Sopenharmony_ci 48528c2ecf20Sopenharmony_cilong follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 48538c2ecf20Sopenharmony_ci struct page **pages, struct vm_area_struct **vmas, 48548c2ecf20Sopenharmony_ci unsigned long *position, unsigned long *nr_pages, 48558c2ecf20Sopenharmony_ci long i, unsigned int flags, int *locked) 48568c2ecf20Sopenharmony_ci{ 48578c2ecf20Sopenharmony_ci unsigned long pfn_offset; 48588c2ecf20Sopenharmony_ci unsigned long vaddr = *position; 48598c2ecf20Sopenharmony_ci unsigned long remainder = *nr_pages; 48608c2ecf20Sopenharmony_ci struct hstate *h = hstate_vma(vma); 48618c2ecf20Sopenharmony_ci int err = -EFAULT; 48628c2ecf20Sopenharmony_ci 48638c2ecf20Sopenharmony_ci while (vaddr < vma->vm_end && remainder) { 48648c2ecf20Sopenharmony_ci pte_t *pte; 48658c2ecf20Sopenharmony_ci spinlock_t *ptl = NULL; 48668c2ecf20Sopenharmony_ci int absent; 48678c2ecf20Sopenharmony_ci struct page *page; 48688c2ecf20Sopenharmony_ci 48698c2ecf20Sopenharmony_ci /* 48708c2ecf20Sopenharmony_ci * If we have a pending SIGKILL, don't keep faulting pages and 48718c2ecf20Sopenharmony_ci * potentially allocating memory. 48728c2ecf20Sopenharmony_ci */ 48738c2ecf20Sopenharmony_ci if (fatal_signal_pending(current)) { 48748c2ecf20Sopenharmony_ci remainder = 0; 48758c2ecf20Sopenharmony_ci break; 48768c2ecf20Sopenharmony_ci } 48778c2ecf20Sopenharmony_ci 48788c2ecf20Sopenharmony_ci /* 48798c2ecf20Sopenharmony_ci * Some archs (sparc64, sh*) have multiple pte_ts to 48808c2ecf20Sopenharmony_ci * each hugepage. We have to make sure we get the 48818c2ecf20Sopenharmony_ci * first, for the page indexing below to work. 48828c2ecf20Sopenharmony_ci * 48838c2ecf20Sopenharmony_ci * Note that page table lock is not held when pte is null. 48848c2ecf20Sopenharmony_ci */ 48858c2ecf20Sopenharmony_ci pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), 48868c2ecf20Sopenharmony_ci huge_page_size(h)); 48878c2ecf20Sopenharmony_ci if (pte) 48888c2ecf20Sopenharmony_ci ptl = huge_pte_lock(h, mm, pte); 48898c2ecf20Sopenharmony_ci absent = !pte || huge_pte_none(huge_ptep_get(pte)); 48908c2ecf20Sopenharmony_ci 48918c2ecf20Sopenharmony_ci /* 48928c2ecf20Sopenharmony_ci * When coredumping, it suits get_dump_page if we just return 48938c2ecf20Sopenharmony_ci * an error where there's an empty slot with no huge pagecache 48948c2ecf20Sopenharmony_ci * to back it. This way, we avoid allocating a hugepage, and 48958c2ecf20Sopenharmony_ci * the sparse dumpfile avoids allocating disk blocks, but its 48968c2ecf20Sopenharmony_ci * huge holes still show up with zeroes where they need to be. 48978c2ecf20Sopenharmony_ci */ 48988c2ecf20Sopenharmony_ci if (absent && (flags & FOLL_DUMP) && 48998c2ecf20Sopenharmony_ci !hugetlbfs_pagecache_present(h, vma, vaddr)) { 49008c2ecf20Sopenharmony_ci if (pte) 49018c2ecf20Sopenharmony_ci spin_unlock(ptl); 49028c2ecf20Sopenharmony_ci remainder = 0; 49038c2ecf20Sopenharmony_ci break; 49048c2ecf20Sopenharmony_ci } 49058c2ecf20Sopenharmony_ci 49068c2ecf20Sopenharmony_ci /* 49078c2ecf20Sopenharmony_ci * We need call hugetlb_fault for both hugepages under migration 49088c2ecf20Sopenharmony_ci * (in which case hugetlb_fault waits for the migration,) and 49098c2ecf20Sopenharmony_ci * hwpoisoned hugepages (in which case we need to prevent the 49108c2ecf20Sopenharmony_ci * caller from accessing to them.) In order to do this, we use 49118c2ecf20Sopenharmony_ci * here is_swap_pte instead of is_hugetlb_entry_migration and 49128c2ecf20Sopenharmony_ci * is_hugetlb_entry_hwpoisoned. This is because it simply covers 49138c2ecf20Sopenharmony_ci * both cases, and because we can't follow correct pages 49148c2ecf20Sopenharmony_ci * directly from any kind of swap entries. 49158c2ecf20Sopenharmony_ci */ 49168c2ecf20Sopenharmony_ci if (absent || is_swap_pte(huge_ptep_get(pte)) || 49178c2ecf20Sopenharmony_ci ((flags & FOLL_WRITE) && 49188c2ecf20Sopenharmony_ci !huge_pte_write(huge_ptep_get(pte)))) { 49198c2ecf20Sopenharmony_ci vm_fault_t ret; 49208c2ecf20Sopenharmony_ci unsigned int fault_flags = 0; 49218c2ecf20Sopenharmony_ci 49228c2ecf20Sopenharmony_ci if (pte) 49238c2ecf20Sopenharmony_ci spin_unlock(ptl); 49248c2ecf20Sopenharmony_ci if (flags & FOLL_WRITE) 49258c2ecf20Sopenharmony_ci fault_flags |= FAULT_FLAG_WRITE; 49268c2ecf20Sopenharmony_ci if (locked) 49278c2ecf20Sopenharmony_ci fault_flags |= FAULT_FLAG_ALLOW_RETRY | 49288c2ecf20Sopenharmony_ci FAULT_FLAG_KILLABLE; 49298c2ecf20Sopenharmony_ci if (flags & FOLL_NOWAIT) 49308c2ecf20Sopenharmony_ci fault_flags |= FAULT_FLAG_ALLOW_RETRY | 49318c2ecf20Sopenharmony_ci FAULT_FLAG_RETRY_NOWAIT; 49328c2ecf20Sopenharmony_ci if (flags & FOLL_TRIED) { 49338c2ecf20Sopenharmony_ci /* 49348c2ecf20Sopenharmony_ci * Note: FAULT_FLAG_ALLOW_RETRY and 49358c2ecf20Sopenharmony_ci * FAULT_FLAG_TRIED can co-exist 49368c2ecf20Sopenharmony_ci */ 49378c2ecf20Sopenharmony_ci fault_flags |= FAULT_FLAG_TRIED; 49388c2ecf20Sopenharmony_ci } 49398c2ecf20Sopenharmony_ci ret = hugetlb_fault(mm, vma, vaddr, fault_flags); 49408c2ecf20Sopenharmony_ci if (ret & VM_FAULT_ERROR) { 49418c2ecf20Sopenharmony_ci err = vm_fault_to_errno(ret, flags); 49428c2ecf20Sopenharmony_ci remainder = 0; 49438c2ecf20Sopenharmony_ci break; 49448c2ecf20Sopenharmony_ci } 49458c2ecf20Sopenharmony_ci if (ret & VM_FAULT_RETRY) { 49468c2ecf20Sopenharmony_ci if (locked && 49478c2ecf20Sopenharmony_ci !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) 49488c2ecf20Sopenharmony_ci *locked = 0; 49498c2ecf20Sopenharmony_ci *nr_pages = 0; 49508c2ecf20Sopenharmony_ci /* 49518c2ecf20Sopenharmony_ci * VM_FAULT_RETRY must not return an 49528c2ecf20Sopenharmony_ci * error, it will return zero 49538c2ecf20Sopenharmony_ci * instead. 49548c2ecf20Sopenharmony_ci * 49558c2ecf20Sopenharmony_ci * No need to update "position" as the 49568c2ecf20Sopenharmony_ci * caller will not check it after 49578c2ecf20Sopenharmony_ci * *nr_pages is set to 0. 49588c2ecf20Sopenharmony_ci */ 49598c2ecf20Sopenharmony_ci return i; 49608c2ecf20Sopenharmony_ci } 49618c2ecf20Sopenharmony_ci continue; 49628c2ecf20Sopenharmony_ci } 49638c2ecf20Sopenharmony_ci 49648c2ecf20Sopenharmony_ci pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; 49658c2ecf20Sopenharmony_ci page = pte_page(huge_ptep_get(pte)); 49668c2ecf20Sopenharmony_ci 49678c2ecf20Sopenharmony_ci /* 49688c2ecf20Sopenharmony_ci * If subpage information not requested, update counters 49698c2ecf20Sopenharmony_ci * and skip the same_page loop below. 49708c2ecf20Sopenharmony_ci */ 49718c2ecf20Sopenharmony_ci if (!pages && !vmas && !pfn_offset && 49728c2ecf20Sopenharmony_ci (vaddr + huge_page_size(h) < vma->vm_end) && 49738c2ecf20Sopenharmony_ci (remainder >= pages_per_huge_page(h))) { 49748c2ecf20Sopenharmony_ci vaddr += huge_page_size(h); 49758c2ecf20Sopenharmony_ci remainder -= pages_per_huge_page(h); 49768c2ecf20Sopenharmony_ci i += pages_per_huge_page(h); 49778c2ecf20Sopenharmony_ci spin_unlock(ptl); 49788c2ecf20Sopenharmony_ci continue; 49798c2ecf20Sopenharmony_ci } 49808c2ecf20Sopenharmony_ci 49818c2ecf20Sopenharmony_cisame_page: 49828c2ecf20Sopenharmony_ci if (pages) { 49838c2ecf20Sopenharmony_ci pages[i] = mem_map_offset(page, pfn_offset); 49848c2ecf20Sopenharmony_ci /* 49858c2ecf20Sopenharmony_ci * try_grab_page() should always succeed here, because: 49868c2ecf20Sopenharmony_ci * a) we hold the ptl lock, and b) we've just checked 49878c2ecf20Sopenharmony_ci * that the huge page is present in the page tables. If 49888c2ecf20Sopenharmony_ci * the huge page is present, then the tail pages must 49898c2ecf20Sopenharmony_ci * also be present. The ptl prevents the head page and 49908c2ecf20Sopenharmony_ci * tail pages from being rearranged in any way. So this 49918c2ecf20Sopenharmony_ci * page must be available at this point, unless the page 49928c2ecf20Sopenharmony_ci * refcount overflowed: 49938c2ecf20Sopenharmony_ci */ 49948c2ecf20Sopenharmony_ci if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) { 49958c2ecf20Sopenharmony_ci spin_unlock(ptl); 49968c2ecf20Sopenharmony_ci remainder = 0; 49978c2ecf20Sopenharmony_ci err = -ENOMEM; 49988c2ecf20Sopenharmony_ci break; 49998c2ecf20Sopenharmony_ci } 50008c2ecf20Sopenharmony_ci } 50018c2ecf20Sopenharmony_ci 50028c2ecf20Sopenharmony_ci if (vmas) 50038c2ecf20Sopenharmony_ci vmas[i] = vma; 50048c2ecf20Sopenharmony_ci 50058c2ecf20Sopenharmony_ci vaddr += PAGE_SIZE; 50068c2ecf20Sopenharmony_ci ++pfn_offset; 50078c2ecf20Sopenharmony_ci --remainder; 50088c2ecf20Sopenharmony_ci ++i; 50098c2ecf20Sopenharmony_ci if (vaddr < vma->vm_end && remainder && 50108c2ecf20Sopenharmony_ci pfn_offset < pages_per_huge_page(h)) { 50118c2ecf20Sopenharmony_ci /* 50128c2ecf20Sopenharmony_ci * We use pfn_offset to avoid touching the pageframes 50138c2ecf20Sopenharmony_ci * of this compound page. 50148c2ecf20Sopenharmony_ci */ 50158c2ecf20Sopenharmony_ci goto same_page; 50168c2ecf20Sopenharmony_ci } 50178c2ecf20Sopenharmony_ci spin_unlock(ptl); 50188c2ecf20Sopenharmony_ci } 50198c2ecf20Sopenharmony_ci *nr_pages = remainder; 50208c2ecf20Sopenharmony_ci /* 50218c2ecf20Sopenharmony_ci * setting position is actually required only if remainder is 50228c2ecf20Sopenharmony_ci * not zero but it's faster not to add a "if (remainder)" 50238c2ecf20Sopenharmony_ci * branch. 50248c2ecf20Sopenharmony_ci */ 50258c2ecf20Sopenharmony_ci *position = vaddr; 50268c2ecf20Sopenharmony_ci 50278c2ecf20Sopenharmony_ci return i ? i : err; 50288c2ecf20Sopenharmony_ci} 50298c2ecf20Sopenharmony_ci 50308c2ecf20Sopenharmony_ci#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE 50318c2ecf20Sopenharmony_ci/* 50328c2ecf20Sopenharmony_ci * ARCHes with special requirements for evicting HUGETLB backing TLB entries can 50338c2ecf20Sopenharmony_ci * implement this. 50348c2ecf20Sopenharmony_ci */ 50358c2ecf20Sopenharmony_ci#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) 50368c2ecf20Sopenharmony_ci#endif 50378c2ecf20Sopenharmony_ci 50388c2ecf20Sopenharmony_ciunsigned long hugetlb_change_protection(struct vm_area_struct *vma, 50398c2ecf20Sopenharmony_ci unsigned long address, unsigned long end, pgprot_t newprot) 50408c2ecf20Sopenharmony_ci{ 50418c2ecf20Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 50428c2ecf20Sopenharmony_ci unsigned long start = address; 50438c2ecf20Sopenharmony_ci pte_t *ptep; 50448c2ecf20Sopenharmony_ci pte_t pte; 50458c2ecf20Sopenharmony_ci struct hstate *h = hstate_vma(vma); 50468c2ecf20Sopenharmony_ci unsigned long pages = 0; 50478c2ecf20Sopenharmony_ci bool shared_pmd = false; 50488c2ecf20Sopenharmony_ci struct mmu_notifier_range range; 50498c2ecf20Sopenharmony_ci 50508c2ecf20Sopenharmony_ci /* 50518c2ecf20Sopenharmony_ci * In the case of shared PMDs, the area to flush could be beyond 50528c2ecf20Sopenharmony_ci * start/end. Set range.start/range.end to cover the maximum possible 50538c2ecf20Sopenharmony_ci * range if PMD sharing is possible. 50548c2ecf20Sopenharmony_ci */ 50558c2ecf20Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 50568c2ecf20Sopenharmony_ci 0, vma, mm, start, end); 50578c2ecf20Sopenharmony_ci adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 50588c2ecf20Sopenharmony_ci 50598c2ecf20Sopenharmony_ci BUG_ON(address >= end); 50608c2ecf20Sopenharmony_ci flush_cache_range(vma, range.start, range.end); 50618c2ecf20Sopenharmony_ci 50628c2ecf20Sopenharmony_ci mmu_notifier_invalidate_range_start(&range); 50638c2ecf20Sopenharmony_ci i_mmap_lock_write(vma->vm_file->f_mapping); 50648c2ecf20Sopenharmony_ci for (; address < end; address += huge_page_size(h)) { 50658c2ecf20Sopenharmony_ci spinlock_t *ptl; 50668c2ecf20Sopenharmony_ci ptep = huge_pte_offset(mm, address, huge_page_size(h)); 50678c2ecf20Sopenharmony_ci if (!ptep) 50688c2ecf20Sopenharmony_ci continue; 50698c2ecf20Sopenharmony_ci ptl = huge_pte_lock(h, mm, ptep); 50708c2ecf20Sopenharmony_ci if (huge_pmd_unshare(mm, vma, &address, ptep)) { 50718c2ecf20Sopenharmony_ci pages++; 50728c2ecf20Sopenharmony_ci spin_unlock(ptl); 50738c2ecf20Sopenharmony_ci shared_pmd = true; 50748c2ecf20Sopenharmony_ci continue; 50758c2ecf20Sopenharmony_ci } 50768c2ecf20Sopenharmony_ci pte = huge_ptep_get(ptep); 50778c2ecf20Sopenharmony_ci if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 50788c2ecf20Sopenharmony_ci spin_unlock(ptl); 50798c2ecf20Sopenharmony_ci continue; 50808c2ecf20Sopenharmony_ci } 50818c2ecf20Sopenharmony_ci if (unlikely(is_hugetlb_entry_migration(pte))) { 50828c2ecf20Sopenharmony_ci swp_entry_t entry = pte_to_swp_entry(pte); 50838c2ecf20Sopenharmony_ci 50848c2ecf20Sopenharmony_ci if (is_write_migration_entry(entry)) { 50858c2ecf20Sopenharmony_ci pte_t newpte; 50868c2ecf20Sopenharmony_ci 50878c2ecf20Sopenharmony_ci make_migration_entry_read(&entry); 50888c2ecf20Sopenharmony_ci newpte = swp_entry_to_pte(entry); 50898c2ecf20Sopenharmony_ci set_huge_swap_pte_at(mm, address, ptep, 50908c2ecf20Sopenharmony_ci newpte, huge_page_size(h)); 50918c2ecf20Sopenharmony_ci pages++; 50928c2ecf20Sopenharmony_ci } 50938c2ecf20Sopenharmony_ci spin_unlock(ptl); 50948c2ecf20Sopenharmony_ci continue; 50958c2ecf20Sopenharmony_ci } 50968c2ecf20Sopenharmony_ci if (!huge_pte_none(pte)) { 50978c2ecf20Sopenharmony_ci pte_t old_pte; 50988c2ecf20Sopenharmony_ci 50998c2ecf20Sopenharmony_ci old_pte = huge_ptep_modify_prot_start(vma, address, ptep); 51008c2ecf20Sopenharmony_ci pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); 51018c2ecf20Sopenharmony_ci pte = arch_make_huge_pte(pte, vma, NULL, 0); 51028c2ecf20Sopenharmony_ci huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); 51038c2ecf20Sopenharmony_ci pages++; 51048c2ecf20Sopenharmony_ci } 51058c2ecf20Sopenharmony_ci spin_unlock(ptl); 51068c2ecf20Sopenharmony_ci } 51078c2ecf20Sopenharmony_ci /* 51088c2ecf20Sopenharmony_ci * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare 51098c2ecf20Sopenharmony_ci * may have cleared our pud entry and done put_page on the page table: 51108c2ecf20Sopenharmony_ci * once we release i_mmap_rwsem, another task can do the final put_page 51118c2ecf20Sopenharmony_ci * and that page table be reused and filled with junk. If we actually 51128c2ecf20Sopenharmony_ci * did unshare a page of pmds, flush the range corresponding to the pud. 51138c2ecf20Sopenharmony_ci */ 51148c2ecf20Sopenharmony_ci if (shared_pmd) 51158c2ecf20Sopenharmony_ci flush_hugetlb_tlb_range(vma, range.start, range.end); 51168c2ecf20Sopenharmony_ci else 51178c2ecf20Sopenharmony_ci flush_hugetlb_tlb_range(vma, start, end); 51188c2ecf20Sopenharmony_ci /* 51198c2ecf20Sopenharmony_ci * No need to call mmu_notifier_invalidate_range() we are downgrading 51208c2ecf20Sopenharmony_ci * page table protection not changing it to point to a new page. 51218c2ecf20Sopenharmony_ci * 51228c2ecf20Sopenharmony_ci * See Documentation/vm/mmu_notifier.rst 51238c2ecf20Sopenharmony_ci */ 51248c2ecf20Sopenharmony_ci i_mmap_unlock_write(vma->vm_file->f_mapping); 51258c2ecf20Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 51268c2ecf20Sopenharmony_ci 51278c2ecf20Sopenharmony_ci return pages << h->order; 51288c2ecf20Sopenharmony_ci} 51298c2ecf20Sopenharmony_ci 51308c2ecf20Sopenharmony_ciint hugetlb_reserve_pages(struct inode *inode, 51318c2ecf20Sopenharmony_ci long from, long to, 51328c2ecf20Sopenharmony_ci struct vm_area_struct *vma, 51338c2ecf20Sopenharmony_ci vm_flags_t vm_flags) 51348c2ecf20Sopenharmony_ci{ 51358c2ecf20Sopenharmony_ci long ret, chg, add = -1; 51368c2ecf20Sopenharmony_ci struct hstate *h = hstate_inode(inode); 51378c2ecf20Sopenharmony_ci struct hugepage_subpool *spool = subpool_inode(inode); 51388c2ecf20Sopenharmony_ci struct resv_map *resv_map; 51398c2ecf20Sopenharmony_ci struct hugetlb_cgroup *h_cg = NULL; 51408c2ecf20Sopenharmony_ci long gbl_reserve, regions_needed = 0; 51418c2ecf20Sopenharmony_ci 51428c2ecf20Sopenharmony_ci /* This should never happen */ 51438c2ecf20Sopenharmony_ci if (from > to) { 51448c2ecf20Sopenharmony_ci VM_WARN(1, "%s called with a negative range\n", __func__); 51458c2ecf20Sopenharmony_ci return -EINVAL; 51468c2ecf20Sopenharmony_ci } 51478c2ecf20Sopenharmony_ci 51488c2ecf20Sopenharmony_ci /* 51498c2ecf20Sopenharmony_ci * Only apply hugepage reservation if asked. At fault time, an 51508c2ecf20Sopenharmony_ci * attempt will be made for VM_NORESERVE to allocate a page 51518c2ecf20Sopenharmony_ci * without using reserves 51528c2ecf20Sopenharmony_ci */ 51538c2ecf20Sopenharmony_ci if (vm_flags & VM_NORESERVE) 51548c2ecf20Sopenharmony_ci return 0; 51558c2ecf20Sopenharmony_ci 51568c2ecf20Sopenharmony_ci /* 51578c2ecf20Sopenharmony_ci * Shared mappings base their reservation on the number of pages that 51588c2ecf20Sopenharmony_ci * are already allocated on behalf of the file. Private mappings need 51598c2ecf20Sopenharmony_ci * to reserve the full area even if read-only as mprotect() may be 51608c2ecf20Sopenharmony_ci * called to make the mapping read-write. Assume !vma is a shm mapping 51618c2ecf20Sopenharmony_ci */ 51628c2ecf20Sopenharmony_ci if (!vma || vma->vm_flags & VM_MAYSHARE) { 51638c2ecf20Sopenharmony_ci /* 51648c2ecf20Sopenharmony_ci * resv_map can not be NULL as hugetlb_reserve_pages is only 51658c2ecf20Sopenharmony_ci * called for inodes for which resv_maps were created (see 51668c2ecf20Sopenharmony_ci * hugetlbfs_get_inode). 51678c2ecf20Sopenharmony_ci */ 51688c2ecf20Sopenharmony_ci resv_map = inode_resv_map(inode); 51698c2ecf20Sopenharmony_ci 51708c2ecf20Sopenharmony_ci chg = region_chg(resv_map, from, to, ®ions_needed); 51718c2ecf20Sopenharmony_ci 51728c2ecf20Sopenharmony_ci } else { 51738c2ecf20Sopenharmony_ci /* Private mapping. */ 51748c2ecf20Sopenharmony_ci resv_map = resv_map_alloc(); 51758c2ecf20Sopenharmony_ci if (!resv_map) 51768c2ecf20Sopenharmony_ci return -ENOMEM; 51778c2ecf20Sopenharmony_ci 51788c2ecf20Sopenharmony_ci chg = to - from; 51798c2ecf20Sopenharmony_ci 51808c2ecf20Sopenharmony_ci set_vma_resv_map(vma, resv_map); 51818c2ecf20Sopenharmony_ci set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 51828c2ecf20Sopenharmony_ci } 51838c2ecf20Sopenharmony_ci 51848c2ecf20Sopenharmony_ci if (chg < 0) { 51858c2ecf20Sopenharmony_ci ret = chg; 51868c2ecf20Sopenharmony_ci goto out_err; 51878c2ecf20Sopenharmony_ci } 51888c2ecf20Sopenharmony_ci 51898c2ecf20Sopenharmony_ci ret = hugetlb_cgroup_charge_cgroup_rsvd( 51908c2ecf20Sopenharmony_ci hstate_index(h), chg * pages_per_huge_page(h), &h_cg); 51918c2ecf20Sopenharmony_ci 51928c2ecf20Sopenharmony_ci if (ret < 0) { 51938c2ecf20Sopenharmony_ci ret = -ENOMEM; 51948c2ecf20Sopenharmony_ci goto out_err; 51958c2ecf20Sopenharmony_ci } 51968c2ecf20Sopenharmony_ci 51978c2ecf20Sopenharmony_ci if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { 51988c2ecf20Sopenharmony_ci /* For private mappings, the hugetlb_cgroup uncharge info hangs 51998c2ecf20Sopenharmony_ci * of the resv_map. 52008c2ecf20Sopenharmony_ci */ 52018c2ecf20Sopenharmony_ci resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); 52028c2ecf20Sopenharmony_ci } 52038c2ecf20Sopenharmony_ci 52048c2ecf20Sopenharmony_ci /* 52058c2ecf20Sopenharmony_ci * There must be enough pages in the subpool for the mapping. If 52068c2ecf20Sopenharmony_ci * the subpool has a minimum size, there may be some global 52078c2ecf20Sopenharmony_ci * reservations already in place (gbl_reserve). 52088c2ecf20Sopenharmony_ci */ 52098c2ecf20Sopenharmony_ci gbl_reserve = hugepage_subpool_get_pages(spool, chg); 52108c2ecf20Sopenharmony_ci if (gbl_reserve < 0) { 52118c2ecf20Sopenharmony_ci ret = -ENOSPC; 52128c2ecf20Sopenharmony_ci goto out_uncharge_cgroup; 52138c2ecf20Sopenharmony_ci } 52148c2ecf20Sopenharmony_ci 52158c2ecf20Sopenharmony_ci /* 52168c2ecf20Sopenharmony_ci * Check enough hugepages are available for the reservation. 52178c2ecf20Sopenharmony_ci * Hand the pages back to the subpool if there are not 52188c2ecf20Sopenharmony_ci */ 52198c2ecf20Sopenharmony_ci ret = hugetlb_acct_memory(h, gbl_reserve); 52208c2ecf20Sopenharmony_ci if (ret < 0) { 52218c2ecf20Sopenharmony_ci goto out_put_pages; 52228c2ecf20Sopenharmony_ci } 52238c2ecf20Sopenharmony_ci 52248c2ecf20Sopenharmony_ci /* 52258c2ecf20Sopenharmony_ci * Account for the reservations made. Shared mappings record regions 52268c2ecf20Sopenharmony_ci * that have reservations as they are shared by multiple VMAs. 52278c2ecf20Sopenharmony_ci * When the last VMA disappears, the region map says how much 52288c2ecf20Sopenharmony_ci * the reservation was and the page cache tells how much of 52298c2ecf20Sopenharmony_ci * the reservation was consumed. Private mappings are per-VMA and 52308c2ecf20Sopenharmony_ci * only the consumed reservations are tracked. When the VMA 52318c2ecf20Sopenharmony_ci * disappears, the original reservation is the VMA size and the 52328c2ecf20Sopenharmony_ci * consumed reservations are stored in the map. Hence, nothing 52338c2ecf20Sopenharmony_ci * else has to be done for private mappings here 52348c2ecf20Sopenharmony_ci */ 52358c2ecf20Sopenharmony_ci if (!vma || vma->vm_flags & VM_MAYSHARE) { 52368c2ecf20Sopenharmony_ci add = region_add(resv_map, from, to, regions_needed, h, h_cg); 52378c2ecf20Sopenharmony_ci 52388c2ecf20Sopenharmony_ci if (unlikely(add < 0)) { 52398c2ecf20Sopenharmony_ci hugetlb_acct_memory(h, -gbl_reserve); 52408c2ecf20Sopenharmony_ci ret = add; 52418c2ecf20Sopenharmony_ci goto out_put_pages; 52428c2ecf20Sopenharmony_ci } else if (unlikely(chg > add)) { 52438c2ecf20Sopenharmony_ci /* 52448c2ecf20Sopenharmony_ci * pages in this range were added to the reserve 52458c2ecf20Sopenharmony_ci * map between region_chg and region_add. This 52468c2ecf20Sopenharmony_ci * indicates a race with alloc_huge_page. Adjust 52478c2ecf20Sopenharmony_ci * the subpool and reserve counts modified above 52488c2ecf20Sopenharmony_ci * based on the difference. 52498c2ecf20Sopenharmony_ci */ 52508c2ecf20Sopenharmony_ci long rsv_adjust; 52518c2ecf20Sopenharmony_ci 52528c2ecf20Sopenharmony_ci /* 52538c2ecf20Sopenharmony_ci * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the 52548c2ecf20Sopenharmony_ci * reference to h_cg->css. See comment below for detail. 52558c2ecf20Sopenharmony_ci */ 52568c2ecf20Sopenharmony_ci hugetlb_cgroup_uncharge_cgroup_rsvd( 52578c2ecf20Sopenharmony_ci hstate_index(h), 52588c2ecf20Sopenharmony_ci (chg - add) * pages_per_huge_page(h), h_cg); 52598c2ecf20Sopenharmony_ci 52608c2ecf20Sopenharmony_ci rsv_adjust = hugepage_subpool_put_pages(spool, 52618c2ecf20Sopenharmony_ci chg - add); 52628c2ecf20Sopenharmony_ci hugetlb_acct_memory(h, -rsv_adjust); 52638c2ecf20Sopenharmony_ci } else if (h_cg) { 52648c2ecf20Sopenharmony_ci /* 52658c2ecf20Sopenharmony_ci * The file_regions will hold their own reference to 52668c2ecf20Sopenharmony_ci * h_cg->css. So we should release the reference held 52678c2ecf20Sopenharmony_ci * via hugetlb_cgroup_charge_cgroup_rsvd() when we are 52688c2ecf20Sopenharmony_ci * done. 52698c2ecf20Sopenharmony_ci */ 52708c2ecf20Sopenharmony_ci hugetlb_cgroup_put_rsvd_cgroup(h_cg); 52718c2ecf20Sopenharmony_ci } 52728c2ecf20Sopenharmony_ci } 52738c2ecf20Sopenharmony_ci return 0; 52748c2ecf20Sopenharmony_ciout_put_pages: 52758c2ecf20Sopenharmony_ci /* put back original number of pages, chg */ 52768c2ecf20Sopenharmony_ci (void)hugepage_subpool_put_pages(spool, chg); 52778c2ecf20Sopenharmony_ciout_uncharge_cgroup: 52788c2ecf20Sopenharmony_ci hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), 52798c2ecf20Sopenharmony_ci chg * pages_per_huge_page(h), h_cg); 52808c2ecf20Sopenharmony_ciout_err: 52818c2ecf20Sopenharmony_ci if (!vma || vma->vm_flags & VM_MAYSHARE) 52828c2ecf20Sopenharmony_ci /* Only call region_abort if the region_chg succeeded but the 52838c2ecf20Sopenharmony_ci * region_add failed or didn't run. 52848c2ecf20Sopenharmony_ci */ 52858c2ecf20Sopenharmony_ci if (chg >= 0 && add < 0) 52868c2ecf20Sopenharmony_ci region_abort(resv_map, from, to, regions_needed); 52878c2ecf20Sopenharmony_ci if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 52888c2ecf20Sopenharmony_ci kref_put(&resv_map->refs, resv_map_release); 52898c2ecf20Sopenharmony_ci return ret; 52908c2ecf20Sopenharmony_ci} 52918c2ecf20Sopenharmony_ci 52928c2ecf20Sopenharmony_cilong hugetlb_unreserve_pages(struct inode *inode, long start, long end, 52938c2ecf20Sopenharmony_ci long freed) 52948c2ecf20Sopenharmony_ci{ 52958c2ecf20Sopenharmony_ci struct hstate *h = hstate_inode(inode); 52968c2ecf20Sopenharmony_ci struct resv_map *resv_map = inode_resv_map(inode); 52978c2ecf20Sopenharmony_ci long chg = 0; 52988c2ecf20Sopenharmony_ci struct hugepage_subpool *spool = subpool_inode(inode); 52998c2ecf20Sopenharmony_ci long gbl_reserve; 53008c2ecf20Sopenharmony_ci 53018c2ecf20Sopenharmony_ci /* 53028c2ecf20Sopenharmony_ci * Since this routine can be called in the evict inode path for all 53038c2ecf20Sopenharmony_ci * hugetlbfs inodes, resv_map could be NULL. 53048c2ecf20Sopenharmony_ci */ 53058c2ecf20Sopenharmony_ci if (resv_map) { 53068c2ecf20Sopenharmony_ci chg = region_del(resv_map, start, end); 53078c2ecf20Sopenharmony_ci /* 53088c2ecf20Sopenharmony_ci * region_del() can fail in the rare case where a region 53098c2ecf20Sopenharmony_ci * must be split and another region descriptor can not be 53108c2ecf20Sopenharmony_ci * allocated. If end == LONG_MAX, it will not fail. 53118c2ecf20Sopenharmony_ci */ 53128c2ecf20Sopenharmony_ci if (chg < 0) 53138c2ecf20Sopenharmony_ci return chg; 53148c2ecf20Sopenharmony_ci } 53158c2ecf20Sopenharmony_ci 53168c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 53178c2ecf20Sopenharmony_ci inode->i_blocks -= (blocks_per_huge_page(h) * freed); 53188c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 53198c2ecf20Sopenharmony_ci 53208c2ecf20Sopenharmony_ci /* 53218c2ecf20Sopenharmony_ci * If the subpool has a minimum size, the number of global 53228c2ecf20Sopenharmony_ci * reservations to be released may be adjusted. 53238c2ecf20Sopenharmony_ci */ 53248c2ecf20Sopenharmony_ci gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); 53258c2ecf20Sopenharmony_ci hugetlb_acct_memory(h, -gbl_reserve); 53268c2ecf20Sopenharmony_ci 53278c2ecf20Sopenharmony_ci return 0; 53288c2ecf20Sopenharmony_ci} 53298c2ecf20Sopenharmony_ci 53308c2ecf20Sopenharmony_ci#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 53318c2ecf20Sopenharmony_cistatic unsigned long page_table_shareable(struct vm_area_struct *svma, 53328c2ecf20Sopenharmony_ci struct vm_area_struct *vma, 53338c2ecf20Sopenharmony_ci unsigned long addr, pgoff_t idx) 53348c2ecf20Sopenharmony_ci{ 53358c2ecf20Sopenharmony_ci unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + 53368c2ecf20Sopenharmony_ci svma->vm_start; 53378c2ecf20Sopenharmony_ci unsigned long sbase = saddr & PUD_MASK; 53388c2ecf20Sopenharmony_ci unsigned long s_end = sbase + PUD_SIZE; 53398c2ecf20Sopenharmony_ci 53408c2ecf20Sopenharmony_ci /* Allow segments to share if only one is marked locked */ 53418c2ecf20Sopenharmony_ci unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; 53428c2ecf20Sopenharmony_ci unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; 53438c2ecf20Sopenharmony_ci 53448c2ecf20Sopenharmony_ci /* 53458c2ecf20Sopenharmony_ci * match the virtual addresses, permission and the alignment of the 53468c2ecf20Sopenharmony_ci * page table page. 53478c2ecf20Sopenharmony_ci */ 53488c2ecf20Sopenharmony_ci if (pmd_index(addr) != pmd_index(saddr) || 53498c2ecf20Sopenharmony_ci vm_flags != svm_flags || 53508c2ecf20Sopenharmony_ci sbase < svma->vm_start || svma->vm_end < s_end) 53518c2ecf20Sopenharmony_ci return 0; 53528c2ecf20Sopenharmony_ci 53538c2ecf20Sopenharmony_ci return saddr; 53548c2ecf20Sopenharmony_ci} 53558c2ecf20Sopenharmony_ci 53568c2ecf20Sopenharmony_cistatic bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) 53578c2ecf20Sopenharmony_ci{ 53588c2ecf20Sopenharmony_ci unsigned long base = addr & PUD_MASK; 53598c2ecf20Sopenharmony_ci unsigned long end = base + PUD_SIZE; 53608c2ecf20Sopenharmony_ci 53618c2ecf20Sopenharmony_ci /* 53628c2ecf20Sopenharmony_ci * check on proper vm_flags and page table alignment 53638c2ecf20Sopenharmony_ci */ 53648c2ecf20Sopenharmony_ci if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) 53658c2ecf20Sopenharmony_ci return true; 53668c2ecf20Sopenharmony_ci return false; 53678c2ecf20Sopenharmony_ci} 53688c2ecf20Sopenharmony_ci 53698c2ecf20Sopenharmony_ci/* 53708c2ecf20Sopenharmony_ci * Determine if start,end range within vma could be mapped by shared pmd. 53718c2ecf20Sopenharmony_ci * If yes, adjust start and end to cover range associated with possible 53728c2ecf20Sopenharmony_ci * shared pmd mappings. 53738c2ecf20Sopenharmony_ci */ 53748c2ecf20Sopenharmony_civoid adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, 53758c2ecf20Sopenharmony_ci unsigned long *start, unsigned long *end) 53768c2ecf20Sopenharmony_ci{ 53778c2ecf20Sopenharmony_ci unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE), 53788c2ecf20Sopenharmony_ci v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); 53798c2ecf20Sopenharmony_ci 53808c2ecf20Sopenharmony_ci /* 53818c2ecf20Sopenharmony_ci * vma need span at least one aligned PUD size and the start,end range 53828c2ecf20Sopenharmony_ci * must at least partialy within it. 53838c2ecf20Sopenharmony_ci */ 53848c2ecf20Sopenharmony_ci if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || 53858c2ecf20Sopenharmony_ci (*end <= v_start) || (*start >= v_end)) 53868c2ecf20Sopenharmony_ci return; 53878c2ecf20Sopenharmony_ci 53888c2ecf20Sopenharmony_ci /* Extend the range to be PUD aligned for a worst case scenario */ 53898c2ecf20Sopenharmony_ci if (*start > v_start) 53908c2ecf20Sopenharmony_ci *start = ALIGN_DOWN(*start, PUD_SIZE); 53918c2ecf20Sopenharmony_ci 53928c2ecf20Sopenharmony_ci if (*end < v_end) 53938c2ecf20Sopenharmony_ci *end = ALIGN(*end, PUD_SIZE); 53948c2ecf20Sopenharmony_ci} 53958c2ecf20Sopenharmony_ci 53968c2ecf20Sopenharmony_ci/* 53978c2ecf20Sopenharmony_ci * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() 53988c2ecf20Sopenharmony_ci * and returns the corresponding pte. While this is not necessary for the 53998c2ecf20Sopenharmony_ci * !shared pmd case because we can allocate the pmd later as well, it makes the 54008c2ecf20Sopenharmony_ci * code much cleaner. 54018c2ecf20Sopenharmony_ci * 54028c2ecf20Sopenharmony_ci * This routine must be called with i_mmap_rwsem held in at least read mode if 54038c2ecf20Sopenharmony_ci * sharing is possible. For hugetlbfs, this prevents removal of any page 54048c2ecf20Sopenharmony_ci * table entries associated with the address space. This is important as we 54058c2ecf20Sopenharmony_ci * are setting up sharing based on existing page table entries (mappings). 54068c2ecf20Sopenharmony_ci * 54078c2ecf20Sopenharmony_ci * NOTE: This routine is only called from huge_pte_alloc. Some callers of 54088c2ecf20Sopenharmony_ci * huge_pte_alloc know that sharing is not possible and do not take 54098c2ecf20Sopenharmony_ci * i_mmap_rwsem as a performance optimization. This is handled by the 54108c2ecf20Sopenharmony_ci * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is 54118c2ecf20Sopenharmony_ci * only required for subsequent processing. 54128c2ecf20Sopenharmony_ci */ 54138c2ecf20Sopenharmony_cipte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 54148c2ecf20Sopenharmony_ci{ 54158c2ecf20Sopenharmony_ci struct vm_area_struct *vma = find_vma(mm, addr); 54168c2ecf20Sopenharmony_ci struct address_space *mapping = vma->vm_file->f_mapping; 54178c2ecf20Sopenharmony_ci pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + 54188c2ecf20Sopenharmony_ci vma->vm_pgoff; 54198c2ecf20Sopenharmony_ci struct vm_area_struct *svma; 54208c2ecf20Sopenharmony_ci unsigned long saddr; 54218c2ecf20Sopenharmony_ci pte_t *spte = NULL; 54228c2ecf20Sopenharmony_ci pte_t *pte; 54238c2ecf20Sopenharmony_ci spinlock_t *ptl; 54248c2ecf20Sopenharmony_ci 54258c2ecf20Sopenharmony_ci if (!vma_shareable(vma, addr)) 54268c2ecf20Sopenharmony_ci return (pte_t *)pmd_alloc(mm, pud, addr); 54278c2ecf20Sopenharmony_ci 54288c2ecf20Sopenharmony_ci i_mmap_assert_locked(mapping); 54298c2ecf20Sopenharmony_ci vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { 54308c2ecf20Sopenharmony_ci if (svma == vma) 54318c2ecf20Sopenharmony_ci continue; 54328c2ecf20Sopenharmony_ci 54338c2ecf20Sopenharmony_ci saddr = page_table_shareable(svma, vma, addr, idx); 54348c2ecf20Sopenharmony_ci if (saddr) { 54358c2ecf20Sopenharmony_ci spte = huge_pte_offset(svma->vm_mm, saddr, 54368c2ecf20Sopenharmony_ci vma_mmu_pagesize(svma)); 54378c2ecf20Sopenharmony_ci if (spte) { 54388c2ecf20Sopenharmony_ci get_page(virt_to_page(spte)); 54398c2ecf20Sopenharmony_ci break; 54408c2ecf20Sopenharmony_ci } 54418c2ecf20Sopenharmony_ci } 54428c2ecf20Sopenharmony_ci } 54438c2ecf20Sopenharmony_ci 54448c2ecf20Sopenharmony_ci if (!spte) 54458c2ecf20Sopenharmony_ci goto out; 54468c2ecf20Sopenharmony_ci 54478c2ecf20Sopenharmony_ci ptl = huge_pte_lock(hstate_vma(vma), mm, spte); 54488c2ecf20Sopenharmony_ci if (pud_none(*pud)) { 54498c2ecf20Sopenharmony_ci pud_populate(mm, pud, 54508c2ecf20Sopenharmony_ci (pmd_t *)((unsigned long)spte & PAGE_MASK)); 54518c2ecf20Sopenharmony_ci mm_inc_nr_pmds(mm); 54528c2ecf20Sopenharmony_ci } else { 54538c2ecf20Sopenharmony_ci put_page(virt_to_page(spte)); 54548c2ecf20Sopenharmony_ci } 54558c2ecf20Sopenharmony_ci spin_unlock(ptl); 54568c2ecf20Sopenharmony_ciout: 54578c2ecf20Sopenharmony_ci pte = (pte_t *)pmd_alloc(mm, pud, addr); 54588c2ecf20Sopenharmony_ci return pte; 54598c2ecf20Sopenharmony_ci} 54608c2ecf20Sopenharmony_ci 54618c2ecf20Sopenharmony_ci/* 54628c2ecf20Sopenharmony_ci * unmap huge page backed by shared pte. 54638c2ecf20Sopenharmony_ci * 54648c2ecf20Sopenharmony_ci * Hugetlb pte page is ref counted at the time of mapping. If pte is shared 54658c2ecf20Sopenharmony_ci * indicated by page_count > 1, unmap is achieved by clearing pud and 54668c2ecf20Sopenharmony_ci * decrementing the ref count. If count == 1, the pte page is not shared. 54678c2ecf20Sopenharmony_ci * 54688c2ecf20Sopenharmony_ci * Called with page table lock held and i_mmap_rwsem held in write mode. 54698c2ecf20Sopenharmony_ci * 54708c2ecf20Sopenharmony_ci * returns: 1 successfully unmapped a shared pte page 54718c2ecf20Sopenharmony_ci * 0 the underlying pte page is not shared, or it is the last user 54728c2ecf20Sopenharmony_ci */ 54738c2ecf20Sopenharmony_ciint huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, 54748c2ecf20Sopenharmony_ci unsigned long *addr, pte_t *ptep) 54758c2ecf20Sopenharmony_ci{ 54768c2ecf20Sopenharmony_ci pgd_t *pgd = pgd_offset(mm, *addr); 54778c2ecf20Sopenharmony_ci p4d_t *p4d = p4d_offset(pgd, *addr); 54788c2ecf20Sopenharmony_ci pud_t *pud = pud_offset(p4d, *addr); 54798c2ecf20Sopenharmony_ci 54808c2ecf20Sopenharmony_ci i_mmap_assert_write_locked(vma->vm_file->f_mapping); 54818c2ecf20Sopenharmony_ci BUG_ON(page_count(virt_to_page(ptep)) == 0); 54828c2ecf20Sopenharmony_ci if (page_count(virt_to_page(ptep)) == 1) 54838c2ecf20Sopenharmony_ci return 0; 54848c2ecf20Sopenharmony_ci 54858c2ecf20Sopenharmony_ci pud_clear(pud); 54868c2ecf20Sopenharmony_ci put_page(virt_to_page(ptep)); 54878c2ecf20Sopenharmony_ci mm_dec_nr_pmds(mm); 54888c2ecf20Sopenharmony_ci /* 54898c2ecf20Sopenharmony_ci * This update of passed address optimizes loops sequentially 54908c2ecf20Sopenharmony_ci * processing addresses in increments of huge page size (PMD_SIZE 54918c2ecf20Sopenharmony_ci * in this case). By clearing the pud, a PUD_SIZE area is unmapped. 54928c2ecf20Sopenharmony_ci * Update address to the 'last page' in the cleared area so that 54938c2ecf20Sopenharmony_ci * calling loop can move to first page past this area. 54948c2ecf20Sopenharmony_ci */ 54958c2ecf20Sopenharmony_ci *addr |= PUD_SIZE - PMD_SIZE; 54968c2ecf20Sopenharmony_ci return 1; 54978c2ecf20Sopenharmony_ci} 54988c2ecf20Sopenharmony_ci#define want_pmd_share() (1) 54998c2ecf20Sopenharmony_ci#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 55008c2ecf20Sopenharmony_cipte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 55018c2ecf20Sopenharmony_ci{ 55028c2ecf20Sopenharmony_ci return NULL; 55038c2ecf20Sopenharmony_ci} 55048c2ecf20Sopenharmony_ci 55058c2ecf20Sopenharmony_ciint huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, 55068c2ecf20Sopenharmony_ci unsigned long *addr, pte_t *ptep) 55078c2ecf20Sopenharmony_ci{ 55088c2ecf20Sopenharmony_ci return 0; 55098c2ecf20Sopenharmony_ci} 55108c2ecf20Sopenharmony_ci 55118c2ecf20Sopenharmony_civoid adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, 55128c2ecf20Sopenharmony_ci unsigned long *start, unsigned long *end) 55138c2ecf20Sopenharmony_ci{ 55148c2ecf20Sopenharmony_ci} 55158c2ecf20Sopenharmony_ci#define want_pmd_share() (0) 55168c2ecf20Sopenharmony_ci#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 55178c2ecf20Sopenharmony_ci 55188c2ecf20Sopenharmony_ci#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB 55198c2ecf20Sopenharmony_cipte_t *huge_pte_alloc(struct mm_struct *mm, 55208c2ecf20Sopenharmony_ci unsigned long addr, unsigned long sz) 55218c2ecf20Sopenharmony_ci{ 55228c2ecf20Sopenharmony_ci pgd_t *pgd; 55238c2ecf20Sopenharmony_ci p4d_t *p4d; 55248c2ecf20Sopenharmony_ci pud_t *pud; 55258c2ecf20Sopenharmony_ci pte_t *pte = NULL; 55268c2ecf20Sopenharmony_ci 55278c2ecf20Sopenharmony_ci pgd = pgd_offset(mm, addr); 55288c2ecf20Sopenharmony_ci p4d = p4d_alloc(mm, pgd, addr); 55298c2ecf20Sopenharmony_ci if (!p4d) 55308c2ecf20Sopenharmony_ci return NULL; 55318c2ecf20Sopenharmony_ci pud = pud_alloc(mm, p4d, addr); 55328c2ecf20Sopenharmony_ci if (pud) { 55338c2ecf20Sopenharmony_ci if (sz == PUD_SIZE) { 55348c2ecf20Sopenharmony_ci pte = (pte_t *)pud; 55358c2ecf20Sopenharmony_ci } else { 55368c2ecf20Sopenharmony_ci BUG_ON(sz != PMD_SIZE); 55378c2ecf20Sopenharmony_ci if (want_pmd_share() && pud_none(*pud)) 55388c2ecf20Sopenharmony_ci pte = huge_pmd_share(mm, addr, pud); 55398c2ecf20Sopenharmony_ci else 55408c2ecf20Sopenharmony_ci pte = (pte_t *)pmd_alloc(mm, pud, addr); 55418c2ecf20Sopenharmony_ci } 55428c2ecf20Sopenharmony_ci } 55438c2ecf20Sopenharmony_ci BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte)); 55448c2ecf20Sopenharmony_ci 55458c2ecf20Sopenharmony_ci return pte; 55468c2ecf20Sopenharmony_ci} 55478c2ecf20Sopenharmony_ci 55488c2ecf20Sopenharmony_ci/* 55498c2ecf20Sopenharmony_ci * huge_pte_offset() - Walk the page table to resolve the hugepage 55508c2ecf20Sopenharmony_ci * entry at address @addr 55518c2ecf20Sopenharmony_ci * 55528c2ecf20Sopenharmony_ci * Return: Pointer to page table entry (PUD or PMD) for 55538c2ecf20Sopenharmony_ci * address @addr, or NULL if a !p*d_present() entry is encountered and the 55548c2ecf20Sopenharmony_ci * size @sz doesn't match the hugepage size at this level of the page 55558c2ecf20Sopenharmony_ci * table. 55568c2ecf20Sopenharmony_ci */ 55578c2ecf20Sopenharmony_cipte_t *huge_pte_offset(struct mm_struct *mm, 55588c2ecf20Sopenharmony_ci unsigned long addr, unsigned long sz) 55598c2ecf20Sopenharmony_ci{ 55608c2ecf20Sopenharmony_ci pgd_t *pgd; 55618c2ecf20Sopenharmony_ci p4d_t *p4d; 55628c2ecf20Sopenharmony_ci pud_t *pud; 55638c2ecf20Sopenharmony_ci pmd_t *pmd; 55648c2ecf20Sopenharmony_ci 55658c2ecf20Sopenharmony_ci pgd = pgd_offset(mm, addr); 55668c2ecf20Sopenharmony_ci if (!pgd_present(*pgd)) 55678c2ecf20Sopenharmony_ci return NULL; 55688c2ecf20Sopenharmony_ci p4d = p4d_offset(pgd, addr); 55698c2ecf20Sopenharmony_ci if (!p4d_present(*p4d)) 55708c2ecf20Sopenharmony_ci return NULL; 55718c2ecf20Sopenharmony_ci 55728c2ecf20Sopenharmony_ci pud = pud_offset(p4d, addr); 55738c2ecf20Sopenharmony_ci if (sz == PUD_SIZE) 55748c2ecf20Sopenharmony_ci /* must be pud huge, non-present or none */ 55758c2ecf20Sopenharmony_ci return (pte_t *)pud; 55768c2ecf20Sopenharmony_ci if (!pud_present(*pud)) 55778c2ecf20Sopenharmony_ci return NULL; 55788c2ecf20Sopenharmony_ci /* must have a valid entry and size to go further */ 55798c2ecf20Sopenharmony_ci 55808c2ecf20Sopenharmony_ci pmd = pmd_offset(pud, addr); 55818c2ecf20Sopenharmony_ci /* must be pmd huge, non-present or none */ 55828c2ecf20Sopenharmony_ci return (pte_t *)pmd; 55838c2ecf20Sopenharmony_ci} 55848c2ecf20Sopenharmony_ci 55858c2ecf20Sopenharmony_ci#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 55868c2ecf20Sopenharmony_ci 55878c2ecf20Sopenharmony_ci/* 55888c2ecf20Sopenharmony_ci * These functions are overwritable if your architecture needs its own 55898c2ecf20Sopenharmony_ci * behavior. 55908c2ecf20Sopenharmony_ci */ 55918c2ecf20Sopenharmony_cistruct page * __weak 55928c2ecf20Sopenharmony_cifollow_huge_addr(struct mm_struct *mm, unsigned long address, 55938c2ecf20Sopenharmony_ci int write) 55948c2ecf20Sopenharmony_ci{ 55958c2ecf20Sopenharmony_ci return ERR_PTR(-EINVAL); 55968c2ecf20Sopenharmony_ci} 55978c2ecf20Sopenharmony_ci 55988c2ecf20Sopenharmony_cistruct page * __weak 55998c2ecf20Sopenharmony_cifollow_huge_pd(struct vm_area_struct *vma, 56008c2ecf20Sopenharmony_ci unsigned long address, hugepd_t hpd, int flags, int pdshift) 56018c2ecf20Sopenharmony_ci{ 56028c2ecf20Sopenharmony_ci WARN(1, "hugepd follow called with no support for hugepage directory format\n"); 56038c2ecf20Sopenharmony_ci return NULL; 56048c2ecf20Sopenharmony_ci} 56058c2ecf20Sopenharmony_ci 56068c2ecf20Sopenharmony_cistruct page * __weak 56078c2ecf20Sopenharmony_cifollow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags) 56088c2ecf20Sopenharmony_ci{ 56098c2ecf20Sopenharmony_ci struct hstate *h = hstate_vma(vma); 56108c2ecf20Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 56118c2ecf20Sopenharmony_ci struct page *page = NULL; 56128c2ecf20Sopenharmony_ci spinlock_t *ptl; 56138c2ecf20Sopenharmony_ci pte_t *ptep, pte; 56148c2ecf20Sopenharmony_ci 56158c2ecf20Sopenharmony_ci /* FOLL_GET and FOLL_PIN are mutually exclusive. */ 56168c2ecf20Sopenharmony_ci if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == 56178c2ecf20Sopenharmony_ci (FOLL_PIN | FOLL_GET))) 56188c2ecf20Sopenharmony_ci return NULL; 56198c2ecf20Sopenharmony_ci 56208c2ecf20Sopenharmony_ciretry: 56218c2ecf20Sopenharmony_ci ptep = huge_pte_offset(mm, address, huge_page_size(h)); 56228c2ecf20Sopenharmony_ci if (!ptep) 56238c2ecf20Sopenharmony_ci return NULL; 56248c2ecf20Sopenharmony_ci 56258c2ecf20Sopenharmony_ci ptl = huge_pte_lock(h, mm, ptep); 56268c2ecf20Sopenharmony_ci pte = huge_ptep_get(ptep); 56278c2ecf20Sopenharmony_ci if (pte_present(pte)) { 56288c2ecf20Sopenharmony_ci page = pte_page(pte) + 56298c2ecf20Sopenharmony_ci ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); 56308c2ecf20Sopenharmony_ci /* 56318c2ecf20Sopenharmony_ci * try_grab_page() should always succeed here, because: a) we 56328c2ecf20Sopenharmony_ci * hold the pmd (ptl) lock, and b) we've just checked that the 56338c2ecf20Sopenharmony_ci * huge pmd (head) page is present in the page tables. The ptl 56348c2ecf20Sopenharmony_ci * prevents the head page and tail pages from being rearranged 56358c2ecf20Sopenharmony_ci * in any way. So this page must be available at this point, 56368c2ecf20Sopenharmony_ci * unless the page refcount overflowed: 56378c2ecf20Sopenharmony_ci */ 56388c2ecf20Sopenharmony_ci if (WARN_ON_ONCE(!try_grab_page(page, flags))) { 56398c2ecf20Sopenharmony_ci page = NULL; 56408c2ecf20Sopenharmony_ci goto out; 56418c2ecf20Sopenharmony_ci } 56428c2ecf20Sopenharmony_ci } else { 56438c2ecf20Sopenharmony_ci if (is_hugetlb_entry_migration(pte)) { 56448c2ecf20Sopenharmony_ci spin_unlock(ptl); 56458c2ecf20Sopenharmony_ci __migration_entry_wait(mm, ptep, ptl); 56468c2ecf20Sopenharmony_ci goto retry; 56478c2ecf20Sopenharmony_ci } 56488c2ecf20Sopenharmony_ci /* 56498c2ecf20Sopenharmony_ci * hwpoisoned entry is treated as no_page_table in 56508c2ecf20Sopenharmony_ci * follow_page_mask(). 56518c2ecf20Sopenharmony_ci */ 56528c2ecf20Sopenharmony_ci } 56538c2ecf20Sopenharmony_ciout: 56548c2ecf20Sopenharmony_ci spin_unlock(ptl); 56558c2ecf20Sopenharmony_ci return page; 56568c2ecf20Sopenharmony_ci} 56578c2ecf20Sopenharmony_ci 56588c2ecf20Sopenharmony_cistruct page * __weak 56598c2ecf20Sopenharmony_cifollow_huge_pud(struct mm_struct *mm, unsigned long address, 56608c2ecf20Sopenharmony_ci pud_t *pud, int flags) 56618c2ecf20Sopenharmony_ci{ 56628c2ecf20Sopenharmony_ci if (flags & (FOLL_GET | FOLL_PIN)) 56638c2ecf20Sopenharmony_ci return NULL; 56648c2ecf20Sopenharmony_ci 56658c2ecf20Sopenharmony_ci return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); 56668c2ecf20Sopenharmony_ci} 56678c2ecf20Sopenharmony_ci 56688c2ecf20Sopenharmony_cistruct page * __weak 56698c2ecf20Sopenharmony_cifollow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) 56708c2ecf20Sopenharmony_ci{ 56718c2ecf20Sopenharmony_ci if (flags & (FOLL_GET | FOLL_PIN)) 56728c2ecf20Sopenharmony_ci return NULL; 56738c2ecf20Sopenharmony_ci 56748c2ecf20Sopenharmony_ci return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); 56758c2ecf20Sopenharmony_ci} 56768c2ecf20Sopenharmony_ci 56778c2ecf20Sopenharmony_ciint isolate_hugetlb(struct page *page, struct list_head *list) 56788c2ecf20Sopenharmony_ci{ 56798c2ecf20Sopenharmony_ci int ret = 0; 56808c2ecf20Sopenharmony_ci 56818c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 56828c2ecf20Sopenharmony_ci if (!PageHeadHuge(page) || !page_huge_active(page) || 56838c2ecf20Sopenharmony_ci !get_page_unless_zero(page)) { 56848c2ecf20Sopenharmony_ci ret = -EBUSY; 56858c2ecf20Sopenharmony_ci goto unlock; 56868c2ecf20Sopenharmony_ci } 56878c2ecf20Sopenharmony_ci clear_page_huge_active(page); 56888c2ecf20Sopenharmony_ci list_move_tail(&page->lru, list); 56898c2ecf20Sopenharmony_ciunlock: 56908c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 56918c2ecf20Sopenharmony_ci return ret; 56928c2ecf20Sopenharmony_ci} 56938c2ecf20Sopenharmony_ci 56948c2ecf20Sopenharmony_civoid putback_active_hugepage(struct page *page) 56958c2ecf20Sopenharmony_ci{ 56968c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(!PageHead(page), page); 56978c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 56988c2ecf20Sopenharmony_ci set_page_huge_active(page); 56998c2ecf20Sopenharmony_ci list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); 57008c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 57018c2ecf20Sopenharmony_ci put_page(page); 57028c2ecf20Sopenharmony_ci} 57038c2ecf20Sopenharmony_ci 57048c2ecf20Sopenharmony_civoid move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) 57058c2ecf20Sopenharmony_ci{ 57068c2ecf20Sopenharmony_ci struct hstate *h = page_hstate(oldpage); 57078c2ecf20Sopenharmony_ci 57088c2ecf20Sopenharmony_ci hugetlb_cgroup_migrate(oldpage, newpage); 57098c2ecf20Sopenharmony_ci set_page_owner_migrate_reason(newpage, reason); 57108c2ecf20Sopenharmony_ci 57118c2ecf20Sopenharmony_ci /* 57128c2ecf20Sopenharmony_ci * transfer temporary state of the new huge page. This is 57138c2ecf20Sopenharmony_ci * reverse to other transitions because the newpage is going to 57148c2ecf20Sopenharmony_ci * be final while the old one will be freed so it takes over 57158c2ecf20Sopenharmony_ci * the temporary status. 57168c2ecf20Sopenharmony_ci * 57178c2ecf20Sopenharmony_ci * Also note that we have to transfer the per-node surplus state 57188c2ecf20Sopenharmony_ci * here as well otherwise the global surplus count will not match 57198c2ecf20Sopenharmony_ci * the per-node's. 57208c2ecf20Sopenharmony_ci */ 57218c2ecf20Sopenharmony_ci if (PageHugeTemporary(newpage)) { 57228c2ecf20Sopenharmony_ci int old_nid = page_to_nid(oldpage); 57238c2ecf20Sopenharmony_ci int new_nid = page_to_nid(newpage); 57248c2ecf20Sopenharmony_ci 57258c2ecf20Sopenharmony_ci SetPageHugeTemporary(oldpage); 57268c2ecf20Sopenharmony_ci ClearPageHugeTemporary(newpage); 57278c2ecf20Sopenharmony_ci 57288c2ecf20Sopenharmony_ci spin_lock(&hugetlb_lock); 57298c2ecf20Sopenharmony_ci if (h->surplus_huge_pages_node[old_nid]) { 57308c2ecf20Sopenharmony_ci h->surplus_huge_pages_node[old_nid]--; 57318c2ecf20Sopenharmony_ci h->surplus_huge_pages_node[new_nid]++; 57328c2ecf20Sopenharmony_ci } 57338c2ecf20Sopenharmony_ci spin_unlock(&hugetlb_lock); 57348c2ecf20Sopenharmony_ci } 57358c2ecf20Sopenharmony_ci} 57368c2ecf20Sopenharmony_ci 57378c2ecf20Sopenharmony_ci#ifdef CONFIG_CMA 57388c2ecf20Sopenharmony_cistatic bool cma_reserve_called __initdata; 57398c2ecf20Sopenharmony_ci 57408c2ecf20Sopenharmony_cistatic int __init cmdline_parse_hugetlb_cma(char *p) 57418c2ecf20Sopenharmony_ci{ 57428c2ecf20Sopenharmony_ci hugetlb_cma_size = memparse(p, &p); 57438c2ecf20Sopenharmony_ci return 0; 57448c2ecf20Sopenharmony_ci} 57458c2ecf20Sopenharmony_ci 57468c2ecf20Sopenharmony_ciearly_param("hugetlb_cma", cmdline_parse_hugetlb_cma); 57478c2ecf20Sopenharmony_ci 57488c2ecf20Sopenharmony_civoid __init hugetlb_cma_reserve(int order) 57498c2ecf20Sopenharmony_ci{ 57508c2ecf20Sopenharmony_ci unsigned long size, reserved, per_node; 57518c2ecf20Sopenharmony_ci int nid; 57528c2ecf20Sopenharmony_ci 57538c2ecf20Sopenharmony_ci cma_reserve_called = true; 57548c2ecf20Sopenharmony_ci 57558c2ecf20Sopenharmony_ci if (!hugetlb_cma_size) 57568c2ecf20Sopenharmony_ci return; 57578c2ecf20Sopenharmony_ci 57588c2ecf20Sopenharmony_ci if (hugetlb_cma_size < (PAGE_SIZE << order)) { 57598c2ecf20Sopenharmony_ci pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", 57608c2ecf20Sopenharmony_ci (PAGE_SIZE << order) / SZ_1M); 57618c2ecf20Sopenharmony_ci return; 57628c2ecf20Sopenharmony_ci } 57638c2ecf20Sopenharmony_ci 57648c2ecf20Sopenharmony_ci /* 57658c2ecf20Sopenharmony_ci * If 3 GB area is requested on a machine with 4 numa nodes, 57668c2ecf20Sopenharmony_ci * let's allocate 1 GB on first three nodes and ignore the last one. 57678c2ecf20Sopenharmony_ci */ 57688c2ecf20Sopenharmony_ci per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); 57698c2ecf20Sopenharmony_ci pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", 57708c2ecf20Sopenharmony_ci hugetlb_cma_size / SZ_1M, per_node / SZ_1M); 57718c2ecf20Sopenharmony_ci 57728c2ecf20Sopenharmony_ci reserved = 0; 57738c2ecf20Sopenharmony_ci for_each_node_state(nid, N_ONLINE) { 57748c2ecf20Sopenharmony_ci int res; 57758c2ecf20Sopenharmony_ci char name[CMA_MAX_NAME]; 57768c2ecf20Sopenharmony_ci 57778c2ecf20Sopenharmony_ci size = min(per_node, hugetlb_cma_size - reserved); 57788c2ecf20Sopenharmony_ci size = round_up(size, PAGE_SIZE << order); 57798c2ecf20Sopenharmony_ci 57808c2ecf20Sopenharmony_ci snprintf(name, sizeof(name), "hugetlb%d", nid); 57818c2ecf20Sopenharmony_ci res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order, 57828c2ecf20Sopenharmony_ci 0, false, name, 57838c2ecf20Sopenharmony_ci &hugetlb_cma[nid], nid); 57848c2ecf20Sopenharmony_ci if (res) { 57858c2ecf20Sopenharmony_ci pr_warn("hugetlb_cma: reservation failed: err %d, node %d", 57868c2ecf20Sopenharmony_ci res, nid); 57878c2ecf20Sopenharmony_ci continue; 57888c2ecf20Sopenharmony_ci } 57898c2ecf20Sopenharmony_ci 57908c2ecf20Sopenharmony_ci reserved += size; 57918c2ecf20Sopenharmony_ci pr_info("hugetlb_cma: reserved %lu MiB on node %d\n", 57928c2ecf20Sopenharmony_ci size / SZ_1M, nid); 57938c2ecf20Sopenharmony_ci 57948c2ecf20Sopenharmony_ci if (reserved >= hugetlb_cma_size) 57958c2ecf20Sopenharmony_ci break; 57968c2ecf20Sopenharmony_ci } 57978c2ecf20Sopenharmony_ci} 57988c2ecf20Sopenharmony_ci 57998c2ecf20Sopenharmony_civoid __init hugetlb_cma_check(void) 58008c2ecf20Sopenharmony_ci{ 58018c2ecf20Sopenharmony_ci if (!hugetlb_cma_size || cma_reserve_called) 58028c2ecf20Sopenharmony_ci return; 58038c2ecf20Sopenharmony_ci 58048c2ecf20Sopenharmony_ci pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); 58058c2ecf20Sopenharmony_ci} 58068c2ecf20Sopenharmony_ci 58078c2ecf20Sopenharmony_ci#endif /* CONFIG_CMA */ 5808