18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * linux/mm/page_alloc.c 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Manages the free list, the system allocates free pages here. 68c2ecf20Sopenharmony_ci * Note that kmalloc() lives in slab.c 78c2ecf20Sopenharmony_ci * 88c2ecf20Sopenharmony_ci * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 98c2ecf20Sopenharmony_ci * Swap reorganised 29.12.95, Stephen Tweedie 108c2ecf20Sopenharmony_ci * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 118c2ecf20Sopenharmony_ci * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 128c2ecf20Sopenharmony_ci * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 138c2ecf20Sopenharmony_ci * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 148c2ecf20Sopenharmony_ci * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 158c2ecf20Sopenharmony_ci * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 168c2ecf20Sopenharmony_ci */ 178c2ecf20Sopenharmony_ci 188c2ecf20Sopenharmony_ci#include <linux/stddef.h> 198c2ecf20Sopenharmony_ci#include <linux/mm.h> 208c2ecf20Sopenharmony_ci#include <linux/highmem.h> 218c2ecf20Sopenharmony_ci#include <linux/swap.h> 228c2ecf20Sopenharmony_ci#include <linux/interrupt.h> 238c2ecf20Sopenharmony_ci#include <linux/pagemap.h> 248c2ecf20Sopenharmony_ci#include <linux/jiffies.h> 258c2ecf20Sopenharmony_ci#include <linux/memblock.h> 268c2ecf20Sopenharmony_ci#include <linux/compiler.h> 278c2ecf20Sopenharmony_ci#include <linux/kernel.h> 288c2ecf20Sopenharmony_ci#include <linux/kasan.h> 298c2ecf20Sopenharmony_ci#include <linux/module.h> 308c2ecf20Sopenharmony_ci#include <linux/suspend.h> 318c2ecf20Sopenharmony_ci#include <linux/pagevec.h> 328c2ecf20Sopenharmony_ci#include <linux/blkdev.h> 338c2ecf20Sopenharmony_ci#include <linux/slab.h> 348c2ecf20Sopenharmony_ci#include <linux/ratelimit.h> 358c2ecf20Sopenharmony_ci#include 
<linux/oom.h> 368c2ecf20Sopenharmony_ci#include <linux/topology.h> 378c2ecf20Sopenharmony_ci#include <linux/sysctl.h> 388c2ecf20Sopenharmony_ci#include <linux/cpu.h> 398c2ecf20Sopenharmony_ci#include <linux/cpuset.h> 408c2ecf20Sopenharmony_ci#include <linux/memory_hotplug.h> 418c2ecf20Sopenharmony_ci#include <linux/nodemask.h> 428c2ecf20Sopenharmony_ci#include <linux/vmalloc.h> 438c2ecf20Sopenharmony_ci#include <linux/vmstat.h> 448c2ecf20Sopenharmony_ci#include <linux/mempolicy.h> 458c2ecf20Sopenharmony_ci#include <linux/memremap.h> 468c2ecf20Sopenharmony_ci#include <linux/stop_machine.h> 478c2ecf20Sopenharmony_ci#include <linux/random.h> 488c2ecf20Sopenharmony_ci#include <linux/sort.h> 498c2ecf20Sopenharmony_ci#include <linux/pfn.h> 508c2ecf20Sopenharmony_ci#include <linux/backing-dev.h> 518c2ecf20Sopenharmony_ci#include <linux/fault-inject.h> 528c2ecf20Sopenharmony_ci#include <linux/page-isolation.h> 538c2ecf20Sopenharmony_ci#include <linux/debugobjects.h> 548c2ecf20Sopenharmony_ci#include <linux/kmemleak.h> 558c2ecf20Sopenharmony_ci#include <linux/compaction.h> 568c2ecf20Sopenharmony_ci#include <trace/events/kmem.h> 578c2ecf20Sopenharmony_ci#include <trace/events/oom.h> 588c2ecf20Sopenharmony_ci#include <linux/prefetch.h> 598c2ecf20Sopenharmony_ci#include <linux/mm_inline.h> 608c2ecf20Sopenharmony_ci#include <linux/migrate.h> 618c2ecf20Sopenharmony_ci#include <linux/hugetlb.h> 628c2ecf20Sopenharmony_ci#include <linux/sched/rt.h> 638c2ecf20Sopenharmony_ci#include <linux/sched/mm.h> 648c2ecf20Sopenharmony_ci#include <linux/page_owner.h> 658c2ecf20Sopenharmony_ci#include <linux/kthread.h> 668c2ecf20Sopenharmony_ci#include <linux/memcontrol.h> 678c2ecf20Sopenharmony_ci#include <linux/ftrace.h> 688c2ecf20Sopenharmony_ci#include <linux/lockdep.h> 698c2ecf20Sopenharmony_ci#include <linux/nmi.h> 708c2ecf20Sopenharmony_ci#include <linux/psi.h> 718c2ecf20Sopenharmony_ci#include <linux/padata.h> 728c2ecf20Sopenharmony_ci#include <linux/khugepaged.h> 
738c2ecf20Sopenharmony_ci#include <linux/zswapd.h> 748c2ecf20Sopenharmony_ci#ifdef CONFIG_RECLAIM_ACCT 758c2ecf20Sopenharmony_ci#include <linux/reclaim_acct.h> 768c2ecf20Sopenharmony_ci#endif 778c2ecf20Sopenharmony_ci 788c2ecf20Sopenharmony_ci#include <asm/sections.h> 798c2ecf20Sopenharmony_ci#include <asm/tlbflush.h> 808c2ecf20Sopenharmony_ci#include <asm/div64.h> 818c2ecf20Sopenharmony_ci#include "internal.h" 828c2ecf20Sopenharmony_ci#include "shuffle.h" 838c2ecf20Sopenharmony_ci#include "page_reporting.h" 848c2ecf20Sopenharmony_ci 858c2ecf20Sopenharmony_ci/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ 868c2ecf20Sopenharmony_citypedef int __bitwise fpi_t; 878c2ecf20Sopenharmony_ci 888c2ecf20Sopenharmony_ci/* No special request */ 898c2ecf20Sopenharmony_ci#define FPI_NONE ((__force fpi_t)0) 908c2ecf20Sopenharmony_ci 918c2ecf20Sopenharmony_ci/* 928c2ecf20Sopenharmony_ci * Skip free page reporting notification for the (possibly merged) page. 938c2ecf20Sopenharmony_ci * This does not hinder free page reporting from grabbing the page, 948c2ecf20Sopenharmony_ci * reporting it and marking it "reported" - it only skips notifying 958c2ecf20Sopenharmony_ci * the free page reporting infrastructure about a newly freed page. For 968c2ecf20Sopenharmony_ci * example, used when temporarily pulling a page from a freelist and 978c2ecf20Sopenharmony_ci * putting it back unmodified. 988c2ecf20Sopenharmony_ci */ 998c2ecf20Sopenharmony_ci#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0)) 1008c2ecf20Sopenharmony_ci 1018c2ecf20Sopenharmony_ci/* 1028c2ecf20Sopenharmony_ci * Place the (possibly merged) page to the tail of the freelist. Will ignore 1038c2ecf20Sopenharmony_ci * page shuffling (relevant code - e.g., memory onlining - is expected to 1048c2ecf20Sopenharmony_ci * shuffle the whole zone). 
1058c2ecf20Sopenharmony_ci * 1068c2ecf20Sopenharmony_ci * Note: No code should rely on this flag for correctness - it's purely 1078c2ecf20Sopenharmony_ci * to allow for optimizations when handing back either fresh pages 1088c2ecf20Sopenharmony_ci * (memory onlining) or untouched pages (page isolation, free page 1098c2ecf20Sopenharmony_ci * reporting). 1108c2ecf20Sopenharmony_ci */ 1118c2ecf20Sopenharmony_ci#define FPI_TO_TAIL ((__force fpi_t)BIT(1)) 1128c2ecf20Sopenharmony_ci 1138c2ecf20Sopenharmony_ci/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 1148c2ecf20Sopenharmony_cistatic DEFINE_MUTEX(pcp_batch_high_lock); 1158c2ecf20Sopenharmony_ci#define MIN_PERCPU_PAGELIST_FRACTION (8) 1168c2ecf20Sopenharmony_ci 1178c2ecf20Sopenharmony_ci#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 1188c2ecf20Sopenharmony_ciDEFINE_PER_CPU(int, numa_node); 1198c2ecf20Sopenharmony_ciEXPORT_PER_CPU_SYMBOL(numa_node); 1208c2ecf20Sopenharmony_ci#endif 1218c2ecf20Sopenharmony_ci 1228c2ecf20Sopenharmony_ciDEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); 1238c2ecf20Sopenharmony_ci 1248c2ecf20Sopenharmony_ci#ifdef CONFIG_HAVE_MEMORYLESS_NODES 1258c2ecf20Sopenharmony_ci/* 1268c2ecf20Sopenharmony_ci * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 1278c2ecf20Sopenharmony_ci * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 1288c2ecf20Sopenharmony_ci * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 1298c2ecf20Sopenharmony_ci * defined in <linux/topology.h>. 
1308c2ecf20Sopenharmony_ci */ 1318c2ecf20Sopenharmony_ciDEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 1328c2ecf20Sopenharmony_ciEXPORT_PER_CPU_SYMBOL(_numa_mem_); 1338c2ecf20Sopenharmony_ci#endif 1348c2ecf20Sopenharmony_ci 1358c2ecf20Sopenharmony_ci/* work_structs for global per-cpu drains */ 1368c2ecf20Sopenharmony_cistruct pcpu_drain { 1378c2ecf20Sopenharmony_ci struct zone *zone; 1388c2ecf20Sopenharmony_ci struct work_struct work; 1398c2ecf20Sopenharmony_ci}; 1408c2ecf20Sopenharmony_cistatic DEFINE_MUTEX(pcpu_drain_mutex); 1418c2ecf20Sopenharmony_cistatic DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); 1428c2ecf20Sopenharmony_ci 1438c2ecf20Sopenharmony_ci#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY 1448c2ecf20Sopenharmony_civolatile unsigned long latent_entropy __latent_entropy; 1458c2ecf20Sopenharmony_ciEXPORT_SYMBOL(latent_entropy); 1468c2ecf20Sopenharmony_ci#endif 1478c2ecf20Sopenharmony_ci 1488c2ecf20Sopenharmony_ci/* 1498c2ecf20Sopenharmony_ci * Array of node states. 
1508c2ecf20Sopenharmony_ci */ 1518c2ecf20Sopenharmony_cinodemask_t node_states[NR_NODE_STATES] __read_mostly = { 1528c2ecf20Sopenharmony_ci [N_POSSIBLE] = NODE_MASK_ALL, 1538c2ecf20Sopenharmony_ci [N_ONLINE] = { { [0] = 1UL } }, 1548c2ecf20Sopenharmony_ci#ifndef CONFIG_NUMA 1558c2ecf20Sopenharmony_ci [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 1568c2ecf20Sopenharmony_ci#ifdef CONFIG_HIGHMEM 1578c2ecf20Sopenharmony_ci [N_HIGH_MEMORY] = { { [0] = 1UL } }, 1588c2ecf20Sopenharmony_ci#endif 1598c2ecf20Sopenharmony_ci [N_MEMORY] = { { [0] = 1UL } }, 1608c2ecf20Sopenharmony_ci [N_CPU] = { { [0] = 1UL } }, 1618c2ecf20Sopenharmony_ci#endif /* NUMA */ 1628c2ecf20Sopenharmony_ci}; 1638c2ecf20Sopenharmony_ciEXPORT_SYMBOL(node_states); 1648c2ecf20Sopenharmony_ci 1658c2ecf20Sopenharmony_ciatomic_long_t _totalram_pages __read_mostly; 1668c2ecf20Sopenharmony_ciEXPORT_SYMBOL(_totalram_pages); 1678c2ecf20Sopenharmony_ciunsigned long totalreserve_pages __read_mostly; 1688c2ecf20Sopenharmony_ciunsigned long totalcma_pages __read_mostly; 1698c2ecf20Sopenharmony_ci 1708c2ecf20Sopenharmony_ciint percpu_pagelist_fraction; 1718c2ecf20Sopenharmony_cigfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 1728c2ecf20Sopenharmony_ci#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON 1738c2ecf20Sopenharmony_ciDEFINE_STATIC_KEY_TRUE(init_on_alloc); 1748c2ecf20Sopenharmony_ci#else 1758c2ecf20Sopenharmony_ciDEFINE_STATIC_KEY_FALSE(init_on_alloc); 1768c2ecf20Sopenharmony_ci#endif 1778c2ecf20Sopenharmony_ciEXPORT_SYMBOL(init_on_alloc); 1788c2ecf20Sopenharmony_ci 1798c2ecf20Sopenharmony_ci#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON 1808c2ecf20Sopenharmony_ciDEFINE_STATIC_KEY_TRUE(init_on_free); 1818c2ecf20Sopenharmony_ci#else 1828c2ecf20Sopenharmony_ciDEFINE_STATIC_KEY_FALSE(init_on_free); 1838c2ecf20Sopenharmony_ci#endif 1848c2ecf20Sopenharmony_ciEXPORT_SYMBOL(init_on_free); 1858c2ecf20Sopenharmony_ci 1868c2ecf20Sopenharmony_cistatic int __init early_init_on_alloc(char *buf) 1878c2ecf20Sopenharmony_ci{ 
1888c2ecf20Sopenharmony_ci int ret; 1898c2ecf20Sopenharmony_ci bool bool_result; 1908c2ecf20Sopenharmony_ci 1918c2ecf20Sopenharmony_ci ret = kstrtobool(buf, &bool_result); 1928c2ecf20Sopenharmony_ci if (ret) 1938c2ecf20Sopenharmony_ci return ret; 1948c2ecf20Sopenharmony_ci if (bool_result && page_poisoning_enabled()) 1958c2ecf20Sopenharmony_ci pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n"); 1968c2ecf20Sopenharmony_ci if (bool_result) 1978c2ecf20Sopenharmony_ci static_branch_enable(&init_on_alloc); 1988c2ecf20Sopenharmony_ci else 1998c2ecf20Sopenharmony_ci static_branch_disable(&init_on_alloc); 2008c2ecf20Sopenharmony_ci return 0; 2018c2ecf20Sopenharmony_ci} 2028c2ecf20Sopenharmony_ciearly_param("init_on_alloc", early_init_on_alloc); 2038c2ecf20Sopenharmony_ci 2048c2ecf20Sopenharmony_cistatic int __init early_init_on_free(char *buf) 2058c2ecf20Sopenharmony_ci{ 2068c2ecf20Sopenharmony_ci int ret; 2078c2ecf20Sopenharmony_ci bool bool_result; 2088c2ecf20Sopenharmony_ci 2098c2ecf20Sopenharmony_ci ret = kstrtobool(buf, &bool_result); 2108c2ecf20Sopenharmony_ci if (ret) 2118c2ecf20Sopenharmony_ci return ret; 2128c2ecf20Sopenharmony_ci if (bool_result && page_poisoning_enabled()) 2138c2ecf20Sopenharmony_ci pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n"); 2148c2ecf20Sopenharmony_ci if (bool_result) 2158c2ecf20Sopenharmony_ci static_branch_enable(&init_on_free); 2168c2ecf20Sopenharmony_ci else 2178c2ecf20Sopenharmony_ci static_branch_disable(&init_on_free); 2188c2ecf20Sopenharmony_ci return 0; 2198c2ecf20Sopenharmony_ci} 2208c2ecf20Sopenharmony_ciearly_param("init_on_free", early_init_on_free); 2218c2ecf20Sopenharmony_ci 2228c2ecf20Sopenharmony_ci/* 2238c2ecf20Sopenharmony_ci * A cached value of the page's pageblock's migratetype, used when the page is 2248c2ecf20Sopenharmony_ci * put on a pcplist. 
Used to avoid the pageblock migratetype lookup when 2258c2ecf20Sopenharmony_ci * freeing from pcplists in most cases, at the cost of possibly becoming stale. 2268c2ecf20Sopenharmony_ci * Also the migratetype set in the page does not necessarily match the pcplist 2278c2ecf20Sopenharmony_ci * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any 2288c2ecf20Sopenharmony_ci * other index - this ensures that it will be put on the correct CMA freelist. 2298c2ecf20Sopenharmony_ci */ 2308c2ecf20Sopenharmony_cistatic inline int get_pcppage_migratetype(struct page *page) 2318c2ecf20Sopenharmony_ci{ 2328c2ecf20Sopenharmony_ci return page->index; 2338c2ecf20Sopenharmony_ci} 2348c2ecf20Sopenharmony_ci 2358c2ecf20Sopenharmony_cistatic inline void set_pcppage_migratetype(struct page *page, int migratetype) 2368c2ecf20Sopenharmony_ci{ 2378c2ecf20Sopenharmony_ci page->index = migratetype; 2388c2ecf20Sopenharmony_ci} 2398c2ecf20Sopenharmony_ci 2408c2ecf20Sopenharmony_ci#ifdef CONFIG_PM_SLEEP 2418c2ecf20Sopenharmony_ci/* 2428c2ecf20Sopenharmony_ci * The following functions are used by the suspend/hibernate code to temporarily 2438c2ecf20Sopenharmony_ci * change gfp_allowed_mask in order to avoid using I/O during memory allocations 2448c2ecf20Sopenharmony_ci * while devices are suspended. To avoid races with the suspend/hibernate code, 2458c2ecf20Sopenharmony_ci * they should always be called with system_transition_mutex held 2468c2ecf20Sopenharmony_ci * (gfp_allowed_mask also should only be modified with system_transition_mutex 2478c2ecf20Sopenharmony_ci * held, unless the suspend/hibernate code is guaranteed not to run in parallel 2488c2ecf20Sopenharmony_ci * with that modification). 
2498c2ecf20Sopenharmony_ci */ 2508c2ecf20Sopenharmony_ci 2518c2ecf20Sopenharmony_cistatic gfp_t saved_gfp_mask; 2528c2ecf20Sopenharmony_ci 2538c2ecf20Sopenharmony_civoid pm_restore_gfp_mask(void) 2548c2ecf20Sopenharmony_ci{ 2558c2ecf20Sopenharmony_ci WARN_ON(!mutex_is_locked(&system_transition_mutex)); 2568c2ecf20Sopenharmony_ci if (saved_gfp_mask) { 2578c2ecf20Sopenharmony_ci gfp_allowed_mask = saved_gfp_mask; 2588c2ecf20Sopenharmony_ci saved_gfp_mask = 0; 2598c2ecf20Sopenharmony_ci } 2608c2ecf20Sopenharmony_ci} 2618c2ecf20Sopenharmony_ci 2628c2ecf20Sopenharmony_civoid pm_restrict_gfp_mask(void) 2638c2ecf20Sopenharmony_ci{ 2648c2ecf20Sopenharmony_ci WARN_ON(!mutex_is_locked(&system_transition_mutex)); 2658c2ecf20Sopenharmony_ci WARN_ON(saved_gfp_mask); 2668c2ecf20Sopenharmony_ci saved_gfp_mask = gfp_allowed_mask; 2678c2ecf20Sopenharmony_ci gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); 2688c2ecf20Sopenharmony_ci} 2698c2ecf20Sopenharmony_ci 2708c2ecf20Sopenharmony_cibool pm_suspended_storage(void) 2718c2ecf20Sopenharmony_ci{ 2728c2ecf20Sopenharmony_ci if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) 2738c2ecf20Sopenharmony_ci return false; 2748c2ecf20Sopenharmony_ci return true; 2758c2ecf20Sopenharmony_ci} 2768c2ecf20Sopenharmony_ci#endif /* CONFIG_PM_SLEEP */ 2778c2ecf20Sopenharmony_ci 2788c2ecf20Sopenharmony_ci#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 2798c2ecf20Sopenharmony_ciunsigned int pageblock_order __read_mostly; 2808c2ecf20Sopenharmony_ci#endif 2818c2ecf20Sopenharmony_ci 2828c2ecf20Sopenharmony_cistatic void __free_pages_ok(struct page *page, unsigned int order, 2838c2ecf20Sopenharmony_ci fpi_t fpi_flags); 2848c2ecf20Sopenharmony_ci 2858c2ecf20Sopenharmony_ci/* 2868c2ecf20Sopenharmony_ci * results with 256, 32 in the lowmem_reserve sysctl: 2878c2ecf20Sopenharmony_ci * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 2888c2ecf20Sopenharmony_ci * 1G machine -> (16M dma, 784M normal, 224M high) 2898c2ecf20Sopenharmony_ci * NORMAL 
allocation will leave 784M/256 of ram reserved in the ZONE_DMA 2908c2ecf20Sopenharmony_ci * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 2918c2ecf20Sopenharmony_ci * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 2928c2ecf20Sopenharmony_ci * 2938c2ecf20Sopenharmony_ci * TBD: should special case ZONE_DMA32 machines here - in those we normally 2948c2ecf20Sopenharmony_ci * don't need any ZONE_NORMAL reservation 2958c2ecf20Sopenharmony_ci */ 2968c2ecf20Sopenharmony_ciint sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { 2978c2ecf20Sopenharmony_ci#ifdef CONFIG_ZONE_DMA 2988c2ecf20Sopenharmony_ci [ZONE_DMA] = 256, 2998c2ecf20Sopenharmony_ci#endif 3008c2ecf20Sopenharmony_ci#ifdef CONFIG_ZONE_DMA32 3018c2ecf20Sopenharmony_ci [ZONE_DMA32] = 256, 3028c2ecf20Sopenharmony_ci#endif 3038c2ecf20Sopenharmony_ci [ZONE_NORMAL] = 32, 3048c2ecf20Sopenharmony_ci#ifdef CONFIG_HIGHMEM 3058c2ecf20Sopenharmony_ci [ZONE_HIGHMEM] = 0, 3068c2ecf20Sopenharmony_ci#endif 3078c2ecf20Sopenharmony_ci [ZONE_MOVABLE] = 0, 3088c2ecf20Sopenharmony_ci}; 3098c2ecf20Sopenharmony_ci 3108c2ecf20Sopenharmony_cistatic char * const zone_names[MAX_NR_ZONES] = { 3118c2ecf20Sopenharmony_ci#ifdef CONFIG_ZONE_DMA 3128c2ecf20Sopenharmony_ci "DMA", 3138c2ecf20Sopenharmony_ci#endif 3148c2ecf20Sopenharmony_ci#ifdef CONFIG_ZONE_DMA32 3158c2ecf20Sopenharmony_ci "DMA32", 3168c2ecf20Sopenharmony_ci#endif 3178c2ecf20Sopenharmony_ci "Normal", 3188c2ecf20Sopenharmony_ci#ifdef CONFIG_HIGHMEM 3198c2ecf20Sopenharmony_ci "HighMem", 3208c2ecf20Sopenharmony_ci#endif 3218c2ecf20Sopenharmony_ci "Movable", 3228c2ecf20Sopenharmony_ci#ifdef CONFIG_ZONE_DEVICE 3238c2ecf20Sopenharmony_ci "Device", 3248c2ecf20Sopenharmony_ci#endif 3258c2ecf20Sopenharmony_ci}; 3268c2ecf20Sopenharmony_ci 3278c2ecf20Sopenharmony_ciconst char * const migratetype_names[MIGRATE_TYPES] = { 3288c2ecf20Sopenharmony_ci "Unmovable", 3298c2ecf20Sopenharmony_ci "Movable", 3308c2ecf20Sopenharmony_ci "Reclaimable", 
3318c2ecf20Sopenharmony_ci#ifdef CONFIG_CMA_REUSE 3328c2ecf20Sopenharmony_ci "CMA", 3338c2ecf20Sopenharmony_ci#endif 3348c2ecf20Sopenharmony_ci "HighAtomic", 3358c2ecf20Sopenharmony_ci#if defined(CONFIG_CMA) && !defined(CONFIG_CMA_REUSE) 3368c2ecf20Sopenharmony_ci "CMA", 3378c2ecf20Sopenharmony_ci#endif 3388c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMORY_ISOLATION 3398c2ecf20Sopenharmony_ci "Isolate", 3408c2ecf20Sopenharmony_ci#endif 3418c2ecf20Sopenharmony_ci}; 3428c2ecf20Sopenharmony_ci 3438c2ecf20Sopenharmony_cicompound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { 3448c2ecf20Sopenharmony_ci [NULL_COMPOUND_DTOR] = NULL, 3458c2ecf20Sopenharmony_ci [COMPOUND_PAGE_DTOR] = free_compound_page, 3468c2ecf20Sopenharmony_ci#ifdef CONFIG_HUGETLB_PAGE 3478c2ecf20Sopenharmony_ci [HUGETLB_PAGE_DTOR] = free_huge_page, 3488c2ecf20Sopenharmony_ci#endif 3498c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 3508c2ecf20Sopenharmony_ci [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, 3518c2ecf20Sopenharmony_ci#endif 3528c2ecf20Sopenharmony_ci}; 3538c2ecf20Sopenharmony_ci 3548c2ecf20Sopenharmony_ciint min_free_kbytes = 1024; 3558c2ecf20Sopenharmony_ciint user_min_free_kbytes = -1; 3568c2ecf20Sopenharmony_ci#ifdef CONFIG_DISCONTIGMEM 3578c2ecf20Sopenharmony_ci/* 3588c2ecf20Sopenharmony_ci * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges 3598c2ecf20Sopenharmony_ci * are not on separate NUMA nodes. Functionally this works but with 3608c2ecf20Sopenharmony_ci * watermark_boost_factor, it can reclaim prematurely as the ranges can be 3618c2ecf20Sopenharmony_ci * quite small. By default, do not boost watermarks on discontigmem as in 3628c2ecf20Sopenharmony_ci * many cases very high-order allocations like THP are likely to be 3638c2ecf20Sopenharmony_ci * unsupported and the premature reclaim offsets the advantage of long-term 3648c2ecf20Sopenharmony_ci * fragmentation avoidance. 
3658c2ecf20Sopenharmony_ci */ 3668c2ecf20Sopenharmony_ciint watermark_boost_factor __read_mostly; 3678c2ecf20Sopenharmony_ci#else 3688c2ecf20Sopenharmony_ciint watermark_boost_factor __read_mostly = 15000; 3698c2ecf20Sopenharmony_ci#endif 3708c2ecf20Sopenharmony_ciint watermark_scale_factor = 10; 3718c2ecf20Sopenharmony_ci 3728c2ecf20Sopenharmony_cistatic unsigned long nr_kernel_pages __initdata; 3738c2ecf20Sopenharmony_cistatic unsigned long nr_all_pages __initdata; 3748c2ecf20Sopenharmony_cistatic unsigned long dma_reserve __initdata; 3758c2ecf20Sopenharmony_ci 3768c2ecf20Sopenharmony_cistatic unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; 3778c2ecf20Sopenharmony_cistatic unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; 3788c2ecf20Sopenharmony_cistatic unsigned long required_kernelcore __initdata; 3798c2ecf20Sopenharmony_cistatic unsigned long required_kernelcore_percent __initdata; 3808c2ecf20Sopenharmony_cistatic unsigned long required_movablecore __initdata; 3818c2ecf20Sopenharmony_cistatic unsigned long required_movablecore_percent __initdata; 3828c2ecf20Sopenharmony_cistatic unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata; 3838c2ecf20Sopenharmony_cistatic bool mirrored_kernelcore __meminitdata; 3848c2ecf20Sopenharmony_ci 3858c2ecf20Sopenharmony_ci/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 3868c2ecf20Sopenharmony_ciint movable_zone; 3878c2ecf20Sopenharmony_ciEXPORT_SYMBOL(movable_zone); 3888c2ecf20Sopenharmony_ci 3898c2ecf20Sopenharmony_ci#if MAX_NUMNODES > 1 3908c2ecf20Sopenharmony_ciunsigned int nr_node_ids __read_mostly = MAX_NUMNODES; 3918c2ecf20Sopenharmony_ciunsigned int nr_online_nodes __read_mostly = 1; 3928c2ecf20Sopenharmony_ciEXPORT_SYMBOL(nr_node_ids); 3938c2ecf20Sopenharmony_ciEXPORT_SYMBOL(nr_online_nodes); 3948c2ecf20Sopenharmony_ci#endif 3958c2ecf20Sopenharmony_ci 3968c2ecf20Sopenharmony_ciint page_group_by_mobility_disabled __read_mostly; 
3978c2ecf20Sopenharmony_ci 3988c2ecf20Sopenharmony_ci#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 3998c2ecf20Sopenharmony_ci/* 4008c2ecf20Sopenharmony_ci * During boot we initialize deferred pages on-demand, as needed, but once 4018c2ecf20Sopenharmony_ci * page_alloc_init_late() has finished, the deferred pages are all initialized, 4028c2ecf20Sopenharmony_ci * and we can permanently disable that path. 4038c2ecf20Sopenharmony_ci */ 4048c2ecf20Sopenharmony_cistatic DEFINE_STATIC_KEY_TRUE(deferred_pages); 4058c2ecf20Sopenharmony_ci 4068c2ecf20Sopenharmony_ci/* 4078c2ecf20Sopenharmony_ci * Calling kasan_free_pages() only after deferred memory initialization 4088c2ecf20Sopenharmony_ci * has completed. Poisoning pages during deferred memory init will greatly 4098c2ecf20Sopenharmony_ci * lengthen the process and cause problem in large memory systems as the 4108c2ecf20Sopenharmony_ci * deferred pages initialization is done with interrupt disabled. 4118c2ecf20Sopenharmony_ci * 4128c2ecf20Sopenharmony_ci * Assuming that there will be no reference to those newly initialized 4138c2ecf20Sopenharmony_ci * pages before they are ever allocated, this should have no effect on 4148c2ecf20Sopenharmony_ci * KASAN memory tracking as the poison will be properly inserted at page 4158c2ecf20Sopenharmony_ci * allocation time. The only corner case is when pages are allocated by 4168c2ecf20Sopenharmony_ci * on-demand allocation and then freed again before the deferred pages 4178c2ecf20Sopenharmony_ci * initialization is done, but this is not likely to happen. 
4188c2ecf20Sopenharmony_ci */ 4198c2ecf20Sopenharmony_cistatic inline void kasan_free_nondeferred_pages(struct page *page, int order) 4208c2ecf20Sopenharmony_ci{ 4218c2ecf20Sopenharmony_ci if (!static_branch_unlikely(&deferred_pages)) 4228c2ecf20Sopenharmony_ci kasan_free_pages(page, order); 4238c2ecf20Sopenharmony_ci} 4248c2ecf20Sopenharmony_ci 4258c2ecf20Sopenharmony_ci/* Returns true if the struct page for the pfn is uninitialised */ 4268c2ecf20Sopenharmony_cistatic inline bool __meminit early_page_uninitialised(unsigned long pfn) 4278c2ecf20Sopenharmony_ci{ 4288c2ecf20Sopenharmony_ci int nid = early_pfn_to_nid(pfn); 4298c2ecf20Sopenharmony_ci 4308c2ecf20Sopenharmony_ci if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) 4318c2ecf20Sopenharmony_ci return true; 4328c2ecf20Sopenharmony_ci 4338c2ecf20Sopenharmony_ci return false; 4348c2ecf20Sopenharmony_ci} 4358c2ecf20Sopenharmony_ci 4368c2ecf20Sopenharmony_ci/* 4378c2ecf20Sopenharmony_ci * Returns true when the remaining initialisation should be deferred until 4388c2ecf20Sopenharmony_ci * later in the boot cycle when it can be parallelised. 4398c2ecf20Sopenharmony_ci */ 4408c2ecf20Sopenharmony_cistatic bool __meminit 4418c2ecf20Sopenharmony_cidefer_init(int nid, unsigned long pfn, unsigned long end_pfn) 4428c2ecf20Sopenharmony_ci{ 4438c2ecf20Sopenharmony_ci static unsigned long prev_end_pfn, nr_initialised; 4448c2ecf20Sopenharmony_ci 4458c2ecf20Sopenharmony_ci /* 4468c2ecf20Sopenharmony_ci * prev_end_pfn static that contains the end of previous zone 4478c2ecf20Sopenharmony_ci * No need to protect because called very early in boot before smp_init. 
4488c2ecf20Sopenharmony_ci */ 4498c2ecf20Sopenharmony_ci if (prev_end_pfn != end_pfn) { 4508c2ecf20Sopenharmony_ci prev_end_pfn = end_pfn; 4518c2ecf20Sopenharmony_ci nr_initialised = 0; 4528c2ecf20Sopenharmony_ci } 4538c2ecf20Sopenharmony_ci 4548c2ecf20Sopenharmony_ci /* Always populate low zones for address-constrained allocations */ 4558c2ecf20Sopenharmony_ci if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) 4568c2ecf20Sopenharmony_ci return false; 4578c2ecf20Sopenharmony_ci 4588c2ecf20Sopenharmony_ci if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) 4598c2ecf20Sopenharmony_ci return true; 4608c2ecf20Sopenharmony_ci /* 4618c2ecf20Sopenharmony_ci * We start only with one section of pages, more pages are added as 4628c2ecf20Sopenharmony_ci * needed until the rest of deferred pages are initialized. 4638c2ecf20Sopenharmony_ci */ 4648c2ecf20Sopenharmony_ci nr_initialised++; 4658c2ecf20Sopenharmony_ci if ((nr_initialised > PAGES_PER_SECTION) && 4668c2ecf20Sopenharmony_ci (pfn & (PAGES_PER_SECTION - 1)) == 0) { 4678c2ecf20Sopenharmony_ci NODE_DATA(nid)->first_deferred_pfn = pfn; 4688c2ecf20Sopenharmony_ci return true; 4698c2ecf20Sopenharmony_ci } 4708c2ecf20Sopenharmony_ci return false; 4718c2ecf20Sopenharmony_ci} 4728c2ecf20Sopenharmony_ci#else 4738c2ecf20Sopenharmony_ci#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o) 4748c2ecf20Sopenharmony_ci 4758c2ecf20Sopenharmony_cistatic inline bool early_page_uninitialised(unsigned long pfn) 4768c2ecf20Sopenharmony_ci{ 4778c2ecf20Sopenharmony_ci return false; 4788c2ecf20Sopenharmony_ci} 4798c2ecf20Sopenharmony_ci 4808c2ecf20Sopenharmony_cistatic inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) 4818c2ecf20Sopenharmony_ci{ 4828c2ecf20Sopenharmony_ci return false; 4838c2ecf20Sopenharmony_ci} 4848c2ecf20Sopenharmony_ci#endif 4858c2ecf20Sopenharmony_ci 4868c2ecf20Sopenharmony_ci/* Return a pointer to the bitmap storing bits affecting a block of pages */ 4878c2ecf20Sopenharmony_cistatic inline 
unsigned long *get_pageblock_bitmap(struct page *page, 4888c2ecf20Sopenharmony_ci unsigned long pfn) 4898c2ecf20Sopenharmony_ci{ 4908c2ecf20Sopenharmony_ci#ifdef CONFIG_SPARSEMEM 4918c2ecf20Sopenharmony_ci return section_to_usemap(__pfn_to_section(pfn)); 4928c2ecf20Sopenharmony_ci#else 4938c2ecf20Sopenharmony_ci return page_zone(page)->pageblock_flags; 4948c2ecf20Sopenharmony_ci#endif /* CONFIG_SPARSEMEM */ 4958c2ecf20Sopenharmony_ci} 4968c2ecf20Sopenharmony_ci 4978c2ecf20Sopenharmony_cistatic inline int pfn_to_bitidx(struct page *page, unsigned long pfn) 4988c2ecf20Sopenharmony_ci{ 4998c2ecf20Sopenharmony_ci#ifdef CONFIG_SPARSEMEM 5008c2ecf20Sopenharmony_ci pfn &= (PAGES_PER_SECTION-1); 5018c2ecf20Sopenharmony_ci#else 5028c2ecf20Sopenharmony_ci pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); 5038c2ecf20Sopenharmony_ci#endif /* CONFIG_SPARSEMEM */ 5048c2ecf20Sopenharmony_ci return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5058c2ecf20Sopenharmony_ci} 5068c2ecf20Sopenharmony_ci 5078c2ecf20Sopenharmony_ci/** 5088c2ecf20Sopenharmony_ci * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages 5098c2ecf20Sopenharmony_ci * @page: The page within the block of interest 5108c2ecf20Sopenharmony_ci * @pfn: The target page frame number 5118c2ecf20Sopenharmony_ci * @mask: mask of bits that the caller is interested in 5128c2ecf20Sopenharmony_ci * 5138c2ecf20Sopenharmony_ci * Return: pageblock_bits flags 5148c2ecf20Sopenharmony_ci */ 5158c2ecf20Sopenharmony_cistatic __always_inline 5168c2ecf20Sopenharmony_ciunsigned long __get_pfnblock_flags_mask(struct page *page, 5178c2ecf20Sopenharmony_ci unsigned long pfn, 5188c2ecf20Sopenharmony_ci unsigned long mask) 5198c2ecf20Sopenharmony_ci{ 5208c2ecf20Sopenharmony_ci unsigned long *bitmap; 5218c2ecf20Sopenharmony_ci unsigned long bitidx, word_bitidx; 5228c2ecf20Sopenharmony_ci unsigned long word; 5238c2ecf20Sopenharmony_ci 5248c2ecf20Sopenharmony_ci bitmap 
= get_pageblock_bitmap(page, pfn); 5258c2ecf20Sopenharmony_ci bitidx = pfn_to_bitidx(page, pfn); 5268c2ecf20Sopenharmony_ci word_bitidx = bitidx / BITS_PER_LONG; 5278c2ecf20Sopenharmony_ci bitidx &= (BITS_PER_LONG-1); 5288c2ecf20Sopenharmony_ci 5298c2ecf20Sopenharmony_ci word = bitmap[word_bitidx]; 5308c2ecf20Sopenharmony_ci return (word >> bitidx) & mask; 5318c2ecf20Sopenharmony_ci} 5328c2ecf20Sopenharmony_ci 5338c2ecf20Sopenharmony_ciunsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, 5348c2ecf20Sopenharmony_ci unsigned long mask) 5358c2ecf20Sopenharmony_ci{ 5368c2ecf20Sopenharmony_ci return __get_pfnblock_flags_mask(page, pfn, mask); 5378c2ecf20Sopenharmony_ci} 5388c2ecf20Sopenharmony_ci 5398c2ecf20Sopenharmony_cistatic __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) 5408c2ecf20Sopenharmony_ci{ 5418c2ecf20Sopenharmony_ci return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); 5428c2ecf20Sopenharmony_ci} 5438c2ecf20Sopenharmony_ci 5448c2ecf20Sopenharmony_ci/** 5458c2ecf20Sopenharmony_ci * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 5468c2ecf20Sopenharmony_ci * @page: The page within the block of interest 5478c2ecf20Sopenharmony_ci * @flags: The flags to set 5488c2ecf20Sopenharmony_ci * @pfn: The target page frame number 5498c2ecf20Sopenharmony_ci * @mask: mask of bits that the caller is interested in 5508c2ecf20Sopenharmony_ci */ 5518c2ecf20Sopenharmony_civoid set_pfnblock_flags_mask(struct page *page, unsigned long flags, 5528c2ecf20Sopenharmony_ci unsigned long pfn, 5538c2ecf20Sopenharmony_ci unsigned long mask) 5548c2ecf20Sopenharmony_ci{ 5558c2ecf20Sopenharmony_ci unsigned long *bitmap; 5568c2ecf20Sopenharmony_ci unsigned long bitidx, word_bitidx; 5578c2ecf20Sopenharmony_ci unsigned long old_word, word; 5588c2ecf20Sopenharmony_ci 5598c2ecf20Sopenharmony_ci BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 5608c2ecf20Sopenharmony_ci 
BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits)); 5618c2ecf20Sopenharmony_ci 5628c2ecf20Sopenharmony_ci bitmap = get_pageblock_bitmap(page, pfn); 5638c2ecf20Sopenharmony_ci bitidx = pfn_to_bitidx(page, pfn); 5648c2ecf20Sopenharmony_ci word_bitidx = bitidx / BITS_PER_LONG; 5658c2ecf20Sopenharmony_ci bitidx &= (BITS_PER_LONG-1); 5668c2ecf20Sopenharmony_ci 5678c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); 5688c2ecf20Sopenharmony_ci 5698c2ecf20Sopenharmony_ci mask <<= bitidx; 5708c2ecf20Sopenharmony_ci flags <<= bitidx; 5718c2ecf20Sopenharmony_ci 5728c2ecf20Sopenharmony_ci word = READ_ONCE(bitmap[word_bitidx]); 5738c2ecf20Sopenharmony_ci for (;;) { 5748c2ecf20Sopenharmony_ci old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 5758c2ecf20Sopenharmony_ci if (word == old_word) 5768c2ecf20Sopenharmony_ci break; 5778c2ecf20Sopenharmony_ci word = old_word; 5788c2ecf20Sopenharmony_ci } 5798c2ecf20Sopenharmony_ci} 5808c2ecf20Sopenharmony_ci 5818c2ecf20Sopenharmony_civoid set_pageblock_migratetype(struct page *page, int migratetype) 5828c2ecf20Sopenharmony_ci{ 5838c2ecf20Sopenharmony_ci if (unlikely(page_group_by_mobility_disabled && 5848c2ecf20Sopenharmony_ci migratetype < MIGRATE_PCPTYPES)) 5858c2ecf20Sopenharmony_ci migratetype = MIGRATE_UNMOVABLE; 5868c2ecf20Sopenharmony_ci 5878c2ecf20Sopenharmony_ci set_pfnblock_flags_mask(page, (unsigned long)migratetype, 5888c2ecf20Sopenharmony_ci page_to_pfn(page), MIGRATETYPE_MASK); 5898c2ecf20Sopenharmony_ci} 5908c2ecf20Sopenharmony_ci 5918c2ecf20Sopenharmony_ci#ifdef CONFIG_DEBUG_VM 5928c2ecf20Sopenharmony_cistatic int page_outside_zone_boundaries(struct zone *zone, struct page *page) 5938c2ecf20Sopenharmony_ci{ 5948c2ecf20Sopenharmony_ci int ret = 0; 5958c2ecf20Sopenharmony_ci unsigned seq; 5968c2ecf20Sopenharmony_ci unsigned long pfn = page_to_pfn(page); 5978c2ecf20Sopenharmony_ci unsigned long sp, start_pfn; 5988c2ecf20Sopenharmony_ci 5998c2ecf20Sopenharmony_ci do { 
6008c2ecf20Sopenharmony_ci seq = zone_span_seqbegin(zone); 6018c2ecf20Sopenharmony_ci start_pfn = zone->zone_start_pfn; 6028c2ecf20Sopenharmony_ci sp = zone->spanned_pages; 6038c2ecf20Sopenharmony_ci if (!zone_spans_pfn(zone, pfn)) 6048c2ecf20Sopenharmony_ci ret = 1; 6058c2ecf20Sopenharmony_ci } while (zone_span_seqretry(zone, seq)); 6068c2ecf20Sopenharmony_ci 6078c2ecf20Sopenharmony_ci if (ret) 6088c2ecf20Sopenharmony_ci pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", 6098c2ecf20Sopenharmony_ci pfn, zone_to_nid(zone), zone->name, 6108c2ecf20Sopenharmony_ci start_pfn, start_pfn + sp); 6118c2ecf20Sopenharmony_ci 6128c2ecf20Sopenharmony_ci return ret; 6138c2ecf20Sopenharmony_ci} 6148c2ecf20Sopenharmony_ci 6158c2ecf20Sopenharmony_cistatic int page_is_consistent(struct zone *zone, struct page *page) 6168c2ecf20Sopenharmony_ci{ 6178c2ecf20Sopenharmony_ci if (!pfn_valid_within(page_to_pfn(page))) 6188c2ecf20Sopenharmony_ci return 0; 6198c2ecf20Sopenharmony_ci if (zone != page_zone(page)) 6208c2ecf20Sopenharmony_ci return 0; 6218c2ecf20Sopenharmony_ci 6228c2ecf20Sopenharmony_ci return 1; 6238c2ecf20Sopenharmony_ci} 6248c2ecf20Sopenharmony_ci/* 6258c2ecf20Sopenharmony_ci * Temporary debugging check for pages not lying within a given zone. 
6268c2ecf20Sopenharmony_ci */ 6278c2ecf20Sopenharmony_cistatic int __maybe_unused bad_range(struct zone *zone, struct page *page) 6288c2ecf20Sopenharmony_ci{ 6298c2ecf20Sopenharmony_ci if (page_outside_zone_boundaries(zone, page)) 6308c2ecf20Sopenharmony_ci return 1; 6318c2ecf20Sopenharmony_ci if (!page_is_consistent(zone, page)) 6328c2ecf20Sopenharmony_ci return 1; 6338c2ecf20Sopenharmony_ci 6348c2ecf20Sopenharmony_ci return 0; 6358c2ecf20Sopenharmony_ci} 6368c2ecf20Sopenharmony_ci#else 6378c2ecf20Sopenharmony_cistatic inline int __maybe_unused bad_range(struct zone *zone, struct page *page) 6388c2ecf20Sopenharmony_ci{ 6398c2ecf20Sopenharmony_ci return 0; 6408c2ecf20Sopenharmony_ci} 6418c2ecf20Sopenharmony_ci#endif 6428c2ecf20Sopenharmony_ci 6438c2ecf20Sopenharmony_cistatic void bad_page(struct page *page, const char *reason) 6448c2ecf20Sopenharmony_ci{ 6458c2ecf20Sopenharmony_ci static unsigned long resume; 6468c2ecf20Sopenharmony_ci static unsigned long nr_shown; 6478c2ecf20Sopenharmony_ci static unsigned long nr_unshown; 6488c2ecf20Sopenharmony_ci 6498c2ecf20Sopenharmony_ci /* 6508c2ecf20Sopenharmony_ci * Allow a burst of 60 reports, then keep quiet for that minute; 6518c2ecf20Sopenharmony_ci * or allow a steady drip of one report per second. 
6528c2ecf20Sopenharmony_ci */ 6538c2ecf20Sopenharmony_ci if (nr_shown == 60) { 6548c2ecf20Sopenharmony_ci if (time_before(jiffies, resume)) { 6558c2ecf20Sopenharmony_ci nr_unshown++; 6568c2ecf20Sopenharmony_ci goto out; 6578c2ecf20Sopenharmony_ci } 6588c2ecf20Sopenharmony_ci if (nr_unshown) { 6598c2ecf20Sopenharmony_ci pr_alert( 6608c2ecf20Sopenharmony_ci "BUG: Bad page state: %lu messages suppressed\n", 6618c2ecf20Sopenharmony_ci nr_unshown); 6628c2ecf20Sopenharmony_ci nr_unshown = 0; 6638c2ecf20Sopenharmony_ci } 6648c2ecf20Sopenharmony_ci nr_shown = 0; 6658c2ecf20Sopenharmony_ci } 6668c2ecf20Sopenharmony_ci if (nr_shown++ == 0) 6678c2ecf20Sopenharmony_ci resume = jiffies + 60 * HZ; 6688c2ecf20Sopenharmony_ci 6698c2ecf20Sopenharmony_ci pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", 6708c2ecf20Sopenharmony_ci current->comm, page_to_pfn(page)); 6718c2ecf20Sopenharmony_ci __dump_page(page, reason); 6728c2ecf20Sopenharmony_ci dump_page_owner(page); 6738c2ecf20Sopenharmony_ci 6748c2ecf20Sopenharmony_ci print_modules(); 6758c2ecf20Sopenharmony_ci dump_stack(); 6768c2ecf20Sopenharmony_ciout: 6778c2ecf20Sopenharmony_ci /* Leave bad fields for debug, except PageBuddy could make trouble */ 6788c2ecf20Sopenharmony_ci page_mapcount_reset(page); /* remove PageBuddy */ 6798c2ecf20Sopenharmony_ci add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 6808c2ecf20Sopenharmony_ci} 6818c2ecf20Sopenharmony_ci 6828c2ecf20Sopenharmony_ci/* 6838c2ecf20Sopenharmony_ci * Higher-order pages are called "compound pages". They are structured thusly: 6848c2ecf20Sopenharmony_ci * 6858c2ecf20Sopenharmony_ci * The first PAGE_SIZE page is called the "head page" and have PG_head set. 6868c2ecf20Sopenharmony_ci * 6878c2ecf20Sopenharmony_ci * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded 6888c2ecf20Sopenharmony_ci * in bit 0 of page->compound_head. The rest of bits is pointer to head page. 
6898c2ecf20Sopenharmony_ci * 6908c2ecf20Sopenharmony_ci * The first tail page's ->compound_dtor holds the offset in array of compound 6918c2ecf20Sopenharmony_ci * page destructors. See compound_page_dtors. 6928c2ecf20Sopenharmony_ci * 6938c2ecf20Sopenharmony_ci * The first tail page's ->compound_order holds the order of allocation. 6948c2ecf20Sopenharmony_ci * This usage means that zero-order pages may not be compound. 6958c2ecf20Sopenharmony_ci */ 6968c2ecf20Sopenharmony_ci 6978c2ecf20Sopenharmony_civoid free_compound_page(struct page *page) 6988c2ecf20Sopenharmony_ci{ 6998c2ecf20Sopenharmony_ci mem_cgroup_uncharge(page); 7008c2ecf20Sopenharmony_ci __free_pages_ok(page, compound_order(page), FPI_NONE); 7018c2ecf20Sopenharmony_ci} 7028c2ecf20Sopenharmony_ci 7038c2ecf20Sopenharmony_civoid prep_compound_page(struct page *page, unsigned int order) 7048c2ecf20Sopenharmony_ci{ 7058c2ecf20Sopenharmony_ci int i; 7068c2ecf20Sopenharmony_ci int nr_pages = 1 << order; 7078c2ecf20Sopenharmony_ci 7088c2ecf20Sopenharmony_ci __SetPageHead(page); 7098c2ecf20Sopenharmony_ci for (i = 1; i < nr_pages; i++) { 7108c2ecf20Sopenharmony_ci struct page *p = page + i; 7118c2ecf20Sopenharmony_ci set_page_count(p, 0); 7128c2ecf20Sopenharmony_ci p->mapping = TAIL_MAPPING; 7138c2ecf20Sopenharmony_ci set_compound_head(p, page); 7148c2ecf20Sopenharmony_ci } 7158c2ecf20Sopenharmony_ci 7168c2ecf20Sopenharmony_ci set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); 7178c2ecf20Sopenharmony_ci set_compound_order(page, order); 7188c2ecf20Sopenharmony_ci atomic_set(compound_mapcount_ptr(page), -1); 7198c2ecf20Sopenharmony_ci if (hpage_pincount_available(page)) 7208c2ecf20Sopenharmony_ci atomic_set(compound_pincount_ptr(page), 0); 7218c2ecf20Sopenharmony_ci} 7228c2ecf20Sopenharmony_ci 7238c2ecf20Sopenharmony_ci#ifdef CONFIG_DEBUG_PAGEALLOC 7248c2ecf20Sopenharmony_ciunsigned int _debug_guardpage_minorder; 7258c2ecf20Sopenharmony_ci 7268c2ecf20Sopenharmony_cibool _debug_pagealloc_enabled_early __read_mostly 
7278c2ecf20Sopenharmony_ci = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); 7288c2ecf20Sopenharmony_ciEXPORT_SYMBOL(_debug_pagealloc_enabled_early); 7298c2ecf20Sopenharmony_ciDEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); 7308c2ecf20Sopenharmony_ciEXPORT_SYMBOL(_debug_pagealloc_enabled); 7318c2ecf20Sopenharmony_ci 7328c2ecf20Sopenharmony_ciDEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); 7338c2ecf20Sopenharmony_ci 7348c2ecf20Sopenharmony_cistatic int __init early_debug_pagealloc(char *buf) 7358c2ecf20Sopenharmony_ci{ 7368c2ecf20Sopenharmony_ci return kstrtobool(buf, &_debug_pagealloc_enabled_early); 7378c2ecf20Sopenharmony_ci} 7388c2ecf20Sopenharmony_ciearly_param("debug_pagealloc", early_debug_pagealloc); 7398c2ecf20Sopenharmony_ci 7408c2ecf20Sopenharmony_civoid init_debug_pagealloc(void) 7418c2ecf20Sopenharmony_ci{ 7428c2ecf20Sopenharmony_ci if (!debug_pagealloc_enabled()) 7438c2ecf20Sopenharmony_ci return; 7448c2ecf20Sopenharmony_ci 7458c2ecf20Sopenharmony_ci static_branch_enable(&_debug_pagealloc_enabled); 7468c2ecf20Sopenharmony_ci 7478c2ecf20Sopenharmony_ci if (!debug_guardpage_minorder()) 7488c2ecf20Sopenharmony_ci return; 7498c2ecf20Sopenharmony_ci 7508c2ecf20Sopenharmony_ci static_branch_enable(&_debug_guardpage_enabled); 7518c2ecf20Sopenharmony_ci} 7528c2ecf20Sopenharmony_ci 7538c2ecf20Sopenharmony_cistatic int __init debug_guardpage_minorder_setup(char *buf) 7548c2ecf20Sopenharmony_ci{ 7558c2ecf20Sopenharmony_ci unsigned long res; 7568c2ecf20Sopenharmony_ci 7578c2ecf20Sopenharmony_ci if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 7588c2ecf20Sopenharmony_ci pr_err("Bad debug_guardpage_minorder value\n"); 7598c2ecf20Sopenharmony_ci return 0; 7608c2ecf20Sopenharmony_ci } 7618c2ecf20Sopenharmony_ci _debug_guardpage_minorder = res; 7628c2ecf20Sopenharmony_ci pr_info("Setting debug_guardpage_minorder to %lu\n", res); 7638c2ecf20Sopenharmony_ci return 0; 7648c2ecf20Sopenharmony_ci} 
7658c2ecf20Sopenharmony_ciearly_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); 7668c2ecf20Sopenharmony_ci 7678c2ecf20Sopenharmony_cistatic inline bool set_page_guard(struct zone *zone, struct page *page, 7688c2ecf20Sopenharmony_ci unsigned int order, int migratetype) 7698c2ecf20Sopenharmony_ci{ 7708c2ecf20Sopenharmony_ci if (!debug_guardpage_enabled()) 7718c2ecf20Sopenharmony_ci return false; 7728c2ecf20Sopenharmony_ci 7738c2ecf20Sopenharmony_ci if (order >= debug_guardpage_minorder()) 7748c2ecf20Sopenharmony_ci return false; 7758c2ecf20Sopenharmony_ci 7768c2ecf20Sopenharmony_ci __SetPageGuard(page); 7778c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&page->lru); 7788c2ecf20Sopenharmony_ci set_page_private(page, order); 7798c2ecf20Sopenharmony_ci /* Guard pages are not available for any usage */ 7808c2ecf20Sopenharmony_ci __mod_zone_freepage_state(zone, -(1 << order), migratetype); 7818c2ecf20Sopenharmony_ci 7828c2ecf20Sopenharmony_ci return true; 7838c2ecf20Sopenharmony_ci} 7848c2ecf20Sopenharmony_ci 7858c2ecf20Sopenharmony_cistatic inline void clear_page_guard(struct zone *zone, struct page *page, 7868c2ecf20Sopenharmony_ci unsigned int order, int migratetype) 7878c2ecf20Sopenharmony_ci{ 7888c2ecf20Sopenharmony_ci if (!debug_guardpage_enabled()) 7898c2ecf20Sopenharmony_ci return; 7908c2ecf20Sopenharmony_ci 7918c2ecf20Sopenharmony_ci __ClearPageGuard(page); 7928c2ecf20Sopenharmony_ci 7938c2ecf20Sopenharmony_ci set_page_private(page, 0); 7948c2ecf20Sopenharmony_ci if (!is_migrate_isolate(migratetype)) 7958c2ecf20Sopenharmony_ci __mod_zone_freepage_state(zone, (1 << order), migratetype); 7968c2ecf20Sopenharmony_ci} 7978c2ecf20Sopenharmony_ci#else 7988c2ecf20Sopenharmony_cistatic inline bool set_page_guard(struct zone *zone, struct page *page, 7998c2ecf20Sopenharmony_ci unsigned int order, int migratetype) { return false; } 8008c2ecf20Sopenharmony_cistatic inline void clear_page_guard(struct zone *zone, struct page *page, 8018c2ecf20Sopenharmony_ci 
unsigned int order, int migratetype) {} 8028c2ecf20Sopenharmony_ci#endif 8038c2ecf20Sopenharmony_ci 8048c2ecf20Sopenharmony_cistatic inline void set_buddy_order(struct page *page, unsigned int order) 8058c2ecf20Sopenharmony_ci{ 8068c2ecf20Sopenharmony_ci set_page_private(page, order); 8078c2ecf20Sopenharmony_ci __SetPageBuddy(page); 8088c2ecf20Sopenharmony_ci} 8098c2ecf20Sopenharmony_ci 8108c2ecf20Sopenharmony_ci/* 8118c2ecf20Sopenharmony_ci * This function checks whether a page is free && is the buddy 8128c2ecf20Sopenharmony_ci * we can coalesce a page and its buddy if 8138c2ecf20Sopenharmony_ci * (a) the buddy is not in a hole (check before calling!) && 8148c2ecf20Sopenharmony_ci * (b) the buddy is in the buddy system && 8158c2ecf20Sopenharmony_ci * (c) a page and its buddy have the same order && 8168c2ecf20Sopenharmony_ci * (d) a page and its buddy are in the same zone. 8178c2ecf20Sopenharmony_ci * 8188c2ecf20Sopenharmony_ci * For recording whether a page is in the buddy system, we set PageBuddy. 8198c2ecf20Sopenharmony_ci * Setting, clearing, and testing PageBuddy is serialized by zone->lock. 8208c2ecf20Sopenharmony_ci * 8218c2ecf20Sopenharmony_ci * For recording page's order, we use page_private(page). 8228c2ecf20Sopenharmony_ci */ 8238c2ecf20Sopenharmony_cistatic inline bool page_is_buddy(struct page *page, struct page *buddy, 8248c2ecf20Sopenharmony_ci unsigned int order) 8258c2ecf20Sopenharmony_ci{ 8268c2ecf20Sopenharmony_ci if (!page_is_guard(buddy) && !PageBuddy(buddy)) 8278c2ecf20Sopenharmony_ci return false; 8288c2ecf20Sopenharmony_ci 8298c2ecf20Sopenharmony_ci if (buddy_order(buddy) != order) 8308c2ecf20Sopenharmony_ci return false; 8318c2ecf20Sopenharmony_ci 8328c2ecf20Sopenharmony_ci /* 8338c2ecf20Sopenharmony_ci * zone check is done late to avoid uselessly calculating 8348c2ecf20Sopenharmony_ci * zone/node ids for pages that could never merge. 
8358c2ecf20Sopenharmony_ci */ 8368c2ecf20Sopenharmony_ci if (page_zone_id(page) != page_zone_id(buddy)) 8378c2ecf20Sopenharmony_ci return false; 8388c2ecf20Sopenharmony_ci 8398c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); 8408c2ecf20Sopenharmony_ci 8418c2ecf20Sopenharmony_ci return true; 8428c2ecf20Sopenharmony_ci} 8438c2ecf20Sopenharmony_ci 8448c2ecf20Sopenharmony_ci#ifdef CONFIG_COMPACTION 8458c2ecf20Sopenharmony_cistatic inline struct capture_control *task_capc(struct zone *zone) 8468c2ecf20Sopenharmony_ci{ 8478c2ecf20Sopenharmony_ci struct capture_control *capc = current->capture_control; 8488c2ecf20Sopenharmony_ci 8498c2ecf20Sopenharmony_ci return unlikely(capc) && 8508c2ecf20Sopenharmony_ci !(current->flags & PF_KTHREAD) && 8518c2ecf20Sopenharmony_ci !capc->page && 8528c2ecf20Sopenharmony_ci capc->cc->zone == zone ? capc : NULL; 8538c2ecf20Sopenharmony_ci} 8548c2ecf20Sopenharmony_ci 8558c2ecf20Sopenharmony_cistatic inline bool 8568c2ecf20Sopenharmony_cicompaction_capture(struct capture_control *capc, struct page *page, 8578c2ecf20Sopenharmony_ci int order, int migratetype) 8588c2ecf20Sopenharmony_ci{ 8598c2ecf20Sopenharmony_ci if (!capc || order != capc->cc->order) 8608c2ecf20Sopenharmony_ci return false; 8618c2ecf20Sopenharmony_ci 8628c2ecf20Sopenharmony_ci /* Do not accidentally pollute CMA or isolated regions*/ 8638c2ecf20Sopenharmony_ci if (is_migrate_cma(migratetype) || 8648c2ecf20Sopenharmony_ci is_migrate_isolate(migratetype)) 8658c2ecf20Sopenharmony_ci return false; 8668c2ecf20Sopenharmony_ci 8678c2ecf20Sopenharmony_ci /* 8688c2ecf20Sopenharmony_ci * Do not let lower order allocations polluate a movable pageblock. 8698c2ecf20Sopenharmony_ci * This might let an unmovable request use a reclaimable pageblock 8708c2ecf20Sopenharmony_ci * and vice-versa but no more than normal fallback logic which can 8718c2ecf20Sopenharmony_ci * have trouble finding a high-order free page. 
8728c2ecf20Sopenharmony_ci */ 8738c2ecf20Sopenharmony_ci if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) 8748c2ecf20Sopenharmony_ci return false; 8758c2ecf20Sopenharmony_ci 8768c2ecf20Sopenharmony_ci capc->page = page; 8778c2ecf20Sopenharmony_ci return true; 8788c2ecf20Sopenharmony_ci} 8798c2ecf20Sopenharmony_ci 8808c2ecf20Sopenharmony_ci#else 8818c2ecf20Sopenharmony_cistatic inline struct capture_control *task_capc(struct zone *zone) 8828c2ecf20Sopenharmony_ci{ 8838c2ecf20Sopenharmony_ci return NULL; 8848c2ecf20Sopenharmony_ci} 8858c2ecf20Sopenharmony_ci 8868c2ecf20Sopenharmony_cistatic inline bool 8878c2ecf20Sopenharmony_cicompaction_capture(struct capture_control *capc, struct page *page, 8888c2ecf20Sopenharmony_ci int order, int migratetype) 8898c2ecf20Sopenharmony_ci{ 8908c2ecf20Sopenharmony_ci return false; 8918c2ecf20Sopenharmony_ci} 8928c2ecf20Sopenharmony_ci#endif /* CONFIG_COMPACTION */ 8938c2ecf20Sopenharmony_ci 8948c2ecf20Sopenharmony_ci/* Used for pages not on another list */ 8958c2ecf20Sopenharmony_cistatic inline void add_to_free_list(struct page *page, struct zone *zone, 8968c2ecf20Sopenharmony_ci unsigned int order, int migratetype) 8978c2ecf20Sopenharmony_ci{ 8988c2ecf20Sopenharmony_ci struct free_area *area = &zone->free_area[order]; 8998c2ecf20Sopenharmony_ci 9008c2ecf20Sopenharmony_ci list_add(&page->lru, &area->free_list[migratetype]); 9018c2ecf20Sopenharmony_ci area->nr_free++; 9028c2ecf20Sopenharmony_ci} 9038c2ecf20Sopenharmony_ci 9048c2ecf20Sopenharmony_ci/* Used for pages not on another list */ 9058c2ecf20Sopenharmony_cistatic inline void add_to_free_list_tail(struct page *page, struct zone *zone, 9068c2ecf20Sopenharmony_ci unsigned int order, int migratetype) 9078c2ecf20Sopenharmony_ci{ 9088c2ecf20Sopenharmony_ci struct free_area *area = &zone->free_area[order]; 9098c2ecf20Sopenharmony_ci 9108c2ecf20Sopenharmony_ci list_add_tail(&page->lru, &area->free_list[migratetype]); 9118c2ecf20Sopenharmony_ci area->nr_free++; 
9128c2ecf20Sopenharmony_ci} 9138c2ecf20Sopenharmony_ci 9148c2ecf20Sopenharmony_ci/* 9158c2ecf20Sopenharmony_ci * Used for pages which are on another list. Move the pages to the tail 9168c2ecf20Sopenharmony_ci * of the list - so the moved pages won't immediately be considered for 9178c2ecf20Sopenharmony_ci * allocation again (e.g., optimization for memory onlining). 9188c2ecf20Sopenharmony_ci */ 9198c2ecf20Sopenharmony_cistatic inline void move_to_free_list(struct page *page, struct zone *zone, 9208c2ecf20Sopenharmony_ci unsigned int order, int migratetype) 9218c2ecf20Sopenharmony_ci{ 9228c2ecf20Sopenharmony_ci struct free_area *area = &zone->free_area[order]; 9238c2ecf20Sopenharmony_ci 9248c2ecf20Sopenharmony_ci list_move_tail(&page->lru, &area->free_list[migratetype]); 9258c2ecf20Sopenharmony_ci} 9268c2ecf20Sopenharmony_ci 9278c2ecf20Sopenharmony_cistatic inline void del_page_from_free_list(struct page *page, struct zone *zone, 9288c2ecf20Sopenharmony_ci unsigned int order) 9298c2ecf20Sopenharmony_ci{ 9308c2ecf20Sopenharmony_ci /* clear reported state and update reported page count */ 9318c2ecf20Sopenharmony_ci if (page_reported(page)) 9328c2ecf20Sopenharmony_ci __ClearPageReported(page); 9338c2ecf20Sopenharmony_ci 9348c2ecf20Sopenharmony_ci list_del(&page->lru); 9358c2ecf20Sopenharmony_ci __ClearPageBuddy(page); 9368c2ecf20Sopenharmony_ci set_page_private(page, 0); 9378c2ecf20Sopenharmony_ci zone->free_area[order].nr_free--; 9388c2ecf20Sopenharmony_ci} 9398c2ecf20Sopenharmony_ci 9408c2ecf20Sopenharmony_ci/* 9418c2ecf20Sopenharmony_ci * If this is not the largest possible page, check if the buddy 9428c2ecf20Sopenharmony_ci * of the next-highest order is free. If it is, it's possible 9438c2ecf20Sopenharmony_ci * that pages are being freed that will coalesce soon. 
In case, 9448c2ecf20Sopenharmony_ci * that is happening, add the free page to the tail of the list 9458c2ecf20Sopenharmony_ci * so it's less likely to be used soon and more likely to be merged 9468c2ecf20Sopenharmony_ci * as a higher order page 9478c2ecf20Sopenharmony_ci */ 9488c2ecf20Sopenharmony_cistatic inline bool 9498c2ecf20Sopenharmony_cibuddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, 9508c2ecf20Sopenharmony_ci struct page *page, unsigned int order) 9518c2ecf20Sopenharmony_ci{ 9528c2ecf20Sopenharmony_ci struct page *higher_page, *higher_buddy; 9538c2ecf20Sopenharmony_ci unsigned long combined_pfn; 9548c2ecf20Sopenharmony_ci 9558c2ecf20Sopenharmony_ci if (order >= MAX_ORDER - 2) 9568c2ecf20Sopenharmony_ci return false; 9578c2ecf20Sopenharmony_ci 9588c2ecf20Sopenharmony_ci if (!pfn_valid_within(buddy_pfn)) 9598c2ecf20Sopenharmony_ci return false; 9608c2ecf20Sopenharmony_ci 9618c2ecf20Sopenharmony_ci combined_pfn = buddy_pfn & pfn; 9628c2ecf20Sopenharmony_ci higher_page = page + (combined_pfn - pfn); 9638c2ecf20Sopenharmony_ci buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); 9648c2ecf20Sopenharmony_ci higher_buddy = higher_page + (buddy_pfn - combined_pfn); 9658c2ecf20Sopenharmony_ci 9668c2ecf20Sopenharmony_ci return pfn_valid_within(buddy_pfn) && 9678c2ecf20Sopenharmony_ci page_is_buddy(higher_page, higher_buddy, order + 1); 9688c2ecf20Sopenharmony_ci} 9698c2ecf20Sopenharmony_ci 9708c2ecf20Sopenharmony_ci/* 9718c2ecf20Sopenharmony_ci * Freeing function for a buddy system allocator. 9728c2ecf20Sopenharmony_ci * 9738c2ecf20Sopenharmony_ci * The concept of a buddy system is to maintain direct-mapped table 9748c2ecf20Sopenharmony_ci * (containing bit values) for memory blocks of various "orders". 
9758c2ecf20Sopenharmony_ci * The bottom level table contains the map for the smallest allocatable 9768c2ecf20Sopenharmony_ci * units of memory (here, pages), and each level above it describes 9778c2ecf20Sopenharmony_ci * pairs of units from the levels below, hence, "buddies". 9788c2ecf20Sopenharmony_ci * At a high level, all that happens here is marking the table entry 9798c2ecf20Sopenharmony_ci * at the bottom level available, and propagating the changes upward 9808c2ecf20Sopenharmony_ci * as necessary, plus some accounting needed to play nicely with other 9818c2ecf20Sopenharmony_ci * parts of the VM system. 9828c2ecf20Sopenharmony_ci * At each level, we keep a list of pages, which are heads of continuous 9838c2ecf20Sopenharmony_ci * free pages of length of (1 << order) and marked with PageBuddy. 9848c2ecf20Sopenharmony_ci * Page's order is recorded in page_private(page) field. 9858c2ecf20Sopenharmony_ci * So when we are allocating or freeing one, we can derive the state of the 9868c2ecf20Sopenharmony_ci * other. That is, if we allocate a small block, and both were 9878c2ecf20Sopenharmony_ci * free, the remainder of the region must be split into blocks. 9888c2ecf20Sopenharmony_ci * If a block is freed, and its buddy is also free, then this 9898c2ecf20Sopenharmony_ci * triggers coalescing into a block of larger size. 
9908c2ecf20Sopenharmony_ci * 9918c2ecf20Sopenharmony_ci * -- nyc 9928c2ecf20Sopenharmony_ci */ 9938c2ecf20Sopenharmony_ci 9948c2ecf20Sopenharmony_cistatic inline void __free_one_page(struct page *page, 9958c2ecf20Sopenharmony_ci unsigned long pfn, 9968c2ecf20Sopenharmony_ci struct zone *zone, unsigned int order, 9978c2ecf20Sopenharmony_ci int migratetype, fpi_t fpi_flags) 9988c2ecf20Sopenharmony_ci{ 9998c2ecf20Sopenharmony_ci struct capture_control *capc = task_capc(zone); 10008c2ecf20Sopenharmony_ci unsigned long buddy_pfn; 10018c2ecf20Sopenharmony_ci unsigned long combined_pfn; 10028c2ecf20Sopenharmony_ci unsigned int max_order; 10038c2ecf20Sopenharmony_ci struct page *buddy; 10048c2ecf20Sopenharmony_ci bool to_tail; 10058c2ecf20Sopenharmony_ci 10068c2ecf20Sopenharmony_ci max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order); 10078c2ecf20Sopenharmony_ci 10088c2ecf20Sopenharmony_ci VM_BUG_ON(!zone_is_initialized(zone)); 10098c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); 10108c2ecf20Sopenharmony_ci 10118c2ecf20Sopenharmony_ci VM_BUG_ON(migratetype == -1); 10128c2ecf20Sopenharmony_ci if (likely(!is_migrate_isolate(migratetype))) 10138c2ecf20Sopenharmony_ci __mod_zone_freepage_state(zone, 1 << order, migratetype); 10148c2ecf20Sopenharmony_ci 10158c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); 10168c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(bad_range(zone, page), page); 10178c2ecf20Sopenharmony_ci 10188c2ecf20Sopenharmony_cicontinue_merging: 10198c2ecf20Sopenharmony_ci while (order < max_order) { 10208c2ecf20Sopenharmony_ci if (compaction_capture(capc, page, order, migratetype)) { 10218c2ecf20Sopenharmony_ci __mod_zone_freepage_state(zone, -(1 << order), 10228c2ecf20Sopenharmony_ci migratetype); 10238c2ecf20Sopenharmony_ci return; 10248c2ecf20Sopenharmony_ci } 10258c2ecf20Sopenharmony_ci buddy_pfn = __find_buddy_pfn(pfn, order); 10268c2ecf20Sopenharmony_ci buddy = page + (buddy_pfn - pfn); 
10278c2ecf20Sopenharmony_ci 10288c2ecf20Sopenharmony_ci if (!pfn_valid_within(buddy_pfn)) 10298c2ecf20Sopenharmony_ci goto done_merging; 10308c2ecf20Sopenharmony_ci if (!page_is_buddy(page, buddy, order)) 10318c2ecf20Sopenharmony_ci goto done_merging; 10328c2ecf20Sopenharmony_ci /* 10338c2ecf20Sopenharmony_ci * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 10348c2ecf20Sopenharmony_ci * merge with it and move up one order. 10358c2ecf20Sopenharmony_ci */ 10368c2ecf20Sopenharmony_ci if (page_is_guard(buddy)) 10378c2ecf20Sopenharmony_ci clear_page_guard(zone, buddy, order, migratetype); 10388c2ecf20Sopenharmony_ci else 10398c2ecf20Sopenharmony_ci del_page_from_free_list(buddy, zone, order); 10408c2ecf20Sopenharmony_ci combined_pfn = buddy_pfn & pfn; 10418c2ecf20Sopenharmony_ci page = page + (combined_pfn - pfn); 10428c2ecf20Sopenharmony_ci pfn = combined_pfn; 10438c2ecf20Sopenharmony_ci order++; 10448c2ecf20Sopenharmony_ci } 10458c2ecf20Sopenharmony_ci if (order < MAX_ORDER - 1) { 10468c2ecf20Sopenharmony_ci /* If we are here, it means order is >= pageblock_order. 10478c2ecf20Sopenharmony_ci * We want to prevent merge between freepages on isolate 10488c2ecf20Sopenharmony_ci * pageblock and normal pageblock. Without this, pageblock 10498c2ecf20Sopenharmony_ci * isolation could cause incorrect freepage or CMA accounting. 10508c2ecf20Sopenharmony_ci * 10518c2ecf20Sopenharmony_ci * We don't want to hit this code for the more frequent 10528c2ecf20Sopenharmony_ci * low-order merging. 
10538c2ecf20Sopenharmony_ci */ 10548c2ecf20Sopenharmony_ci if (unlikely(has_isolate_pageblock(zone))) { 10558c2ecf20Sopenharmony_ci int buddy_mt; 10568c2ecf20Sopenharmony_ci 10578c2ecf20Sopenharmony_ci buddy_pfn = __find_buddy_pfn(pfn, order); 10588c2ecf20Sopenharmony_ci buddy = page + (buddy_pfn - pfn); 10598c2ecf20Sopenharmony_ci buddy_mt = get_pageblock_migratetype(buddy); 10608c2ecf20Sopenharmony_ci 10618c2ecf20Sopenharmony_ci if (migratetype != buddy_mt 10628c2ecf20Sopenharmony_ci && (is_migrate_isolate(migratetype) || 10638c2ecf20Sopenharmony_ci is_migrate_isolate(buddy_mt))) 10648c2ecf20Sopenharmony_ci goto done_merging; 10658c2ecf20Sopenharmony_ci } 10668c2ecf20Sopenharmony_ci max_order = order + 1; 10678c2ecf20Sopenharmony_ci goto continue_merging; 10688c2ecf20Sopenharmony_ci } 10698c2ecf20Sopenharmony_ci 10708c2ecf20Sopenharmony_cidone_merging: 10718c2ecf20Sopenharmony_ci set_buddy_order(page, order); 10728c2ecf20Sopenharmony_ci 10738c2ecf20Sopenharmony_ci if (fpi_flags & FPI_TO_TAIL) 10748c2ecf20Sopenharmony_ci to_tail = true; 10758c2ecf20Sopenharmony_ci else if (is_shuffle_order(order)) 10768c2ecf20Sopenharmony_ci to_tail = shuffle_pick_tail(); 10778c2ecf20Sopenharmony_ci else 10788c2ecf20Sopenharmony_ci to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); 10798c2ecf20Sopenharmony_ci 10808c2ecf20Sopenharmony_ci if (to_tail) 10818c2ecf20Sopenharmony_ci add_to_free_list_tail(page, zone, order, migratetype); 10828c2ecf20Sopenharmony_ci else 10838c2ecf20Sopenharmony_ci add_to_free_list(page, zone, order, migratetype); 10848c2ecf20Sopenharmony_ci 10858c2ecf20Sopenharmony_ci /* Notify page reporting subsystem of freed page */ 10868c2ecf20Sopenharmony_ci if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) 10878c2ecf20Sopenharmony_ci page_reporting_notify_free(order); 10888c2ecf20Sopenharmony_ci} 10898c2ecf20Sopenharmony_ci 10908c2ecf20Sopenharmony_ci/* 10918c2ecf20Sopenharmony_ci * A bad page could be due to a number of fields. 
 * Instead of multiple branches, try and check multiple fields with one check.
 * The caller must do a detailed check if necessary.
 */
static inline bool page_expected_state(struct page *page,
					unsigned long check_flags)
{
	/* A page entering/leaving the allocator must be unmapped. */
	if (unlikely(atomic_read(&page->_mapcount) != -1))
		return false;

	/*
	 * Fold mapping, refcount, (optionally) the memcg pointer and the
	 * offending flag bits into one word so a single branch catches
	 * any unexpected state.
	 */
	if (unlikely((unsigned long)page->mapping |
			page_ref_count(page) |
#ifdef CONFIG_MEMCG
			(unsigned long)page->mem_cgroup |
#endif
			(page->flags & check_flags)))
		return false;

	return true;
}

/*
 * Translate an unexpected page state into a human-readable reason for
 * bad_page(). Later checks overwrite earlier ones, so the reason reported
 * is the last matching condition below.
 */
static const char *page_bad_reason(struct page *page, unsigned long flags)
{
	const char *bad_reason = NULL;

	if (unlikely(atomic_read(&page->_mapcount) != -1))
		bad_reason = "nonzero mapcount";
	if (unlikely(page->mapping != NULL))
		bad_reason = "non-NULL mapping";
	if (unlikely(page_ref_count(page) != 0))
		bad_reason = "nonzero _refcount";
	if (unlikely(page->flags & flags)) {
		if (flags == PAGE_FLAGS_CHECK_AT_PREP)
			bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
		else
			bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
	}
#ifdef CONFIG_MEMCG
	if (unlikely(page->mem_cgroup))
		bad_reason = "page still charged to cgroup";
#endif
	return bad_reason;
}

/* Report a page that failed the free-time sanity checks. */
static void check_free_page_bad(struct page *page)
{
	bad_page(page,
		 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
}

/*
 * Returns 0 if the page is in the expected state for freeing, 1 otherwise
 * (after reporting the problem via bad_page()).
 */
static inline int check_free_page(struct page *page)
{
	if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
		return 0;

	/* Something has gone sideways, find it */
	check_free_page_bad(page);
	return 1;
}

/*
 * Validate one tail page of a compound page being freed. Returns 0 when
 * the tail page looks sane, 1 after reporting a problem via bad_page().
 * In both cases ->mapping is cleared and the compound-head marker removed
 * so the page can go back to the buddy allocator.
 */
static int free_tail_pages_check(struct page *head_page, struct page *page)
{
	int ret = 1;

	/*
	 * We rely page->lru.next never has bit 0 set, unless the page
	 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
	 */
	BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);

	/* Without DEBUG_VM only the build-time invariant above is enforced. */
	if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
		ret = 0;
		goto out;
	}
	switch (page - head_page) {
	case 1:
		/* the first tail page: ->mapping may be compound_mapcount() */
		if (unlikely(compound_mapcount(page))) {
			bad_page(page, "nonzero compound_mapcount");
			goto out;
		}
		break;
	case 2:
		/*
		 * the second tail page: ->mapping is
		 * deferred_list.next -- ignore value.
		 */
		break;
	default:
		if (page->mapping != TAIL_MAPPING) {
			bad_page(page, "corrupted mapping in tail page");
			goto out;
		}
		break;
	}
	if (unlikely(!PageTail(page))) {
		bad_page(page, "PageTail not set");
		goto out;
	}
	if (unlikely(compound_head(page) != head_page)) {
		bad_page(page, "compound_head not consistent");
		goto out;
	}
	ret = 0;
out:
	page->mapping = NULL;
	clear_compound_head(page);
	return ret;
}

/* Zero @numpages pages starting at @page (used for init_on_free). */
static void kernel_init_free_pages(struct page *page, int numpages)
{
	int i;

	/* s390's use of memset() could override KASAN redzones. */
	kasan_disable_current();
	for (i = 0; i < numpages; i++)
		clear_highpage(page + i);
	kasan_enable_current();
}

/*
 * Common preparation before pages go back to the allocator, shared by the
 * pcplist and buddy free paths: (optional) sanity checks, disconnecting
 * memcg/mapping state, clearing flags, and zeroing/poisoning the contents
 * as configured.
 *
 * Returns false when the page must not be freed (it failed the checks or
 * is an order-0 hwpoison page), true when it may proceed to the free lists.
 */
static __always_inline bool free_pages_prepare(struct page *page,
					unsigned int order, bool check_free)
{
	int bad = 0;

	VM_BUG_ON_PAGE(PageTail(page), page);

	trace_mm_page_free(page, order);

	if (unlikely(PageHWPoison(page)) && !order) {
		/*
		 * Do not let hwpoison pages hit pcplists/buddy
		 * Untie memcg state and reset page's owner
		 */
		if (memcg_kmem_enabled() && PageKmemcg(page))
			__memcg_kmem_uncharge_page(page, order);
		reset_page_owner(page, order);
		return false;
	}

	/*
	 * Check tail pages before head page information is cleared to
	 * avoid checking PageCompound for order-0 pages.
	 */
	if (unlikely(order)) {
		bool compound = PageCompound(page);
		int i;

		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);

		if (compound)
			ClearPageDoubleMap(page);
		for (i = 1; i < (1 << order); i++) {
			if (compound)
				bad += free_tail_pages_check(page, page + i);
			if (unlikely(check_free_page(page + i))) {
				bad++;
				continue;
			}
			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
		}
	}
	if (PageMappingFlags(page))
		page->mapping = NULL;
	if (memcg_kmem_enabled() && PageKmemcg(page))
		__memcg_kmem_uncharge_page(page, order);
	if (check_free)
		bad += check_free_page(page);
	if (bad)
		return false;

	page_cpupid_reset_last(page);
	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
	reset_page_owner(page, order);

	if (!PageHighMem(page)) {
		debug_check_no_locks_freed(page_address(page),
					   PAGE_SIZE << order);
		debug_check_no_obj_freed(page_address(page),
					   PAGE_SIZE << order);
	}
	if (want_init_on_free())
		kernel_init_free_pages(page, 1 << order);

	kernel_poison_pages(page, 1 << order, 0);
	/*
	 * arch_free_page() can make the page's contents inaccessible. s390
	 * does this. So nothing which can access the page's contents should
	 * happen after this.
	 */
	arch_free_page(page, order);

	if (debug_pagealloc_enabled_static())
		kernel_map_pages(page, 1 << order, 0);

	kasan_free_nondeferred_pages(page, order);

	return true;
}

#ifdef CONFIG_DEBUG_VM
/*
 * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
 * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
 * moved from pcp lists to free lists.
 */
static bool free_pcp_prepare(struct page *page)
{
	return free_pages_prepare(page, 0, true);
}

/* Re-check on the pcp->buddy transfer only when debug_pagealloc is on. */
static bool bulkfree_pcp_prepare(struct page *page)
{
	if (debug_pagealloc_enabled_static())
		return check_free_page(page);
	else
		return false;
}
#else
/*
 * With DEBUG_VM disabled, order-0 pages being freed are checked only when
 * moving from pcp lists to free list in order to reduce overhead. With
 * debug_pagealloc enabled, they are checked also immediately when being freed
 * to the pcp lists.
 */
static bool free_pcp_prepare(struct page *page)
{
	if (debug_pagealloc_enabled_static())
		return free_pages_prepare(page, 0, true);
	else
		return free_pages_prepare(page, 0, false);
}

static bool bulkfree_pcp_prepare(struct page *page)
{
	return check_free_page(page);
}
#endif /* CONFIG_DEBUG_VM */

/* Prefetch the order-0 buddy of @page ahead of freeing under zone->lock. */
static inline void prefetch_buddy(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
	struct page *buddy = page + (buddy_pfn - pfn);

	prefetch(buddy);
}

/*
 * Frees a number of pages from the PCP lists
 * Assumes all pages on list are in same zone, and of same order.
 * count is the number of pages to free.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
					struct per_cpu_pages *pcp)
{
	int migratetype = 0;
	int batch_free = 0;
	int prefetch_nr = 0;
	bool isolated_pageblocks;
	struct page *page, *tmp;
	LIST_HEAD(head);

	/*
	 * Ensure proper count is passed which otherwise would stuck in the
	 * below while (list_empty(list)) loop.
	 */
	count = min(pcp->count, count);
	while (count) {
		struct list_head *list;

		/*
		 * Remove pages from lists in a round-robin fashion. A
		 * batch_free count is maintained that is incremented when an
		 * empty list is encountered. This is so more pages are freed
		 * off fuller lists instead of spinning excessively around empty
		 * lists
		 */
		do {
			batch_free++;
			if (++migratetype == MIGRATE_PCPTYPES)
				migratetype = 0;
			list = &pcp->lists[migratetype];
		} while (list_empty(list));

		/* This is the only non-empty list. Free them all. */
		if (batch_free == MIGRATE_PCPTYPES)
			batch_free = count;

		do {
			page = list_last_entry(list, struct page, lru);
			/* must delete to avoid corrupting pcp list */
			list_del(&page->lru);
			pcp->count--;

			if (bulkfree_pcp_prepare(page))
				continue;

			list_add_tail(&page->lru, &head);

			/*
			 * We are going to put the page back to the global
			 * pool, prefetch its buddy to speed up later access
			 * under zone->lock. It is believed the overhead of
			 * an additional test and calculating buddy_pfn here
			 * can be offset by reduced memory latency later. To
			 * avoid excessive prefetching due to large count, only
			 * prefetch buddy for the first pcp->batch nr of pages.
			 */
			if (prefetch_nr++ < pcp->batch)
				prefetch_buddy(page);
		} while (--count && --batch_free && !list_empty(list));
	}

	/* Pages collected off the pcp lists; hand them to the buddy lists. */
	spin_lock(&zone->lock);
	isolated_pageblocks = has_isolate_pageblock(zone);

	/*
	 * Use safe version since after __free_one_page(),
	 * page->lru.next will not point to original list.
	 */
	list_for_each_entry_safe(page, tmp, &head, lru) {
		int mt = get_pcppage_migratetype(page);
		/* MIGRATE_ISOLATE page should not go to pcplists */
		VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
		/* Pageblock could have been isolated meanwhile */
		if (unlikely(isolated_pageblocks))
			mt = get_pageblock_migratetype(page);

		__free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
		trace_mm_page_pcpu_drain(page, 0, mt);
	}
	spin_unlock(&zone->lock);
}

/*
 * Free one page directly to the buddy allocator under zone->lock.
 * The pageblock migratetype is re-read under the lock when isolation
 * may have changed it.
 */
static void free_one_page(struct zone *zone,
				struct page *page, unsigned long pfn,
				unsigned int order,
				int migratetype, fpi_t fpi_flags)
{
	spin_lock(&zone->lock);
	if (unlikely(has_isolate_pageblock(zone) ||
		is_migrate_isolate(migratetype))) {
		migratetype = get_pfnblock_migratetype(page, pfn);
	}
	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
	spin_unlock(&zone->lock);
}

/* Initialise a single struct page during memmap initialisation. */
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
				unsigned long zone, int nid)
{
	mm_zero_struct_page(page);
	set_page_links(page, zone, nid, pfn);

	init_page_count(page);
	page_mapcount_reset(page);
	page_cpupid_reset_last(page);
	page_kasan_tag_reset(page);

	INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
	if (!is_highmem_idx(zone))
		set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
 * Early-initialise the struct page backing a reserved pfn, looking up the
 * node and zone it falls into. No-op if the pfn was already initialised.
 */
static void __meminit init_reserved_page(unsigned long pfn)
{
	pg_data_t *pgdat;
	int nid, zid;

	if (!early_page_uninitialised(pfn))
		return;

	nid = early_pfn_to_nid(pfn);
	pgdat = NODE_DATA(nid);

	/* Find the zone whose pfn range contains this pfn. */
	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
			break;
	}
	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
}
#else
static inline void init_reserved_page(unsigned long pfn)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

/*
 * Initialised pages do not have PageReserved set. This function is
 * called for each range allocated by the bootmem allocator and
 * marks the pages PageReserved. The remaining valid pages are later
 * sent to the buddy page allocator.
 */
void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
{
	unsigned long start_pfn = PFN_DOWN(start);
	unsigned long end_pfn = PFN_UP(end);

	for (; start_pfn < end_pfn; start_pfn++) {
		if (pfn_valid(start_pfn)) {
			struct page *page = pfn_to_page(start_pfn);

			init_reserved_page(start_pfn);

			/* Avoid false-positive PageTail() */
			INIT_LIST_HEAD(&page->lru);

			/*
			 * no need for atomic set_bit because the struct
			 * page is not visible yet so nobody should
			 * access it yet.
			 */
			__SetPageReserved(page);
		}
	}
}

/*
 * Free a block of pages to the buddy allocator, bypassing the pcplists.
 * Runs the free-time checks and accounts the PGFREE event with IRQs off.
 */
static void __free_pages_ok(struct page *page, unsigned int order,
			    fpi_t fpi_flags)
{
	unsigned long flags;
	int migratetype;
	unsigned long pfn = page_to_pfn(page);

	if (!free_pages_prepare(page, order, true))
		return;

	migratetype = get_pfnblock_migratetype(page, pfn);
	local_irq_save(flags);
	__count_vm_events(PGFREE, 1 << order);
	free_one_page(page_zone(page), page, pfn, order, migratetype,
		      fpi_flags);
	local_irq_restore(flags);
}

/*
 * Release a freshly initialised block of pages to the buddy allocator:
 * drop the memmap-init refcount and PageReserved on every page, account
 * them as managed, then free the whole block to the tail of the free lists.
 */
void __free_pages_core(struct page *page, unsigned int order)
{
	unsigned int nr_pages = 1 << order;
	struct page *p = page;
	unsigned int loop;

	/*
	 * When initializing the memmap, __init_single_page() sets the refcount
	 * of all pages to 1 ("allocated"/"not free"). We have to set the
	 * refcount of all involved pages to 0.
	 */
	prefetchw(p);
	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
		prefetchw(p + 1);
		__ClearPageReserved(p);
		set_page_count(p, 0);
	}
	/* The loop stops one short; handle the last page here. */
	__ClearPageReserved(p);
	set_page_count(p, 0);

	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);

	/*
	 * Bypass PCP and place fresh pages right to the tail, primarily
	 * relevant for memory onlining.
	 */
	__free_pages_ok(page, order, FPI_TO_TAIL);
}

#ifdef CONFIG_NEED_MULTIPLE_NODES

/* Cache of the last pfn->nid lookup, used by __early_pfn_to_nid(). */
static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;

#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID

/*
 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
 */
int __meminit __early_pfn_to_nid(unsigned long pfn,
					struct mminit_pfnnid_cache *state)
{
	unsigned long start_pfn, end_pfn;
	int nid;

	/* Fast path: the last memblock range looked up is cached in @state. */
	if (state->last_start <= pfn && pfn < state->last_end)
		return state->last_nid;

	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
	if (nid != NUMA_NO_NODE) {
		/* Remember the containing range for the next lookup. */
		state->last_start = start_pfn;
		state->last_end = end_pfn;
		state->last_nid = nid;
	}

	return nid;
}
#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */

/*
 * Serialised wrapper around __early_pfn_to_nid(); falls back to the first
 * online node when the pfn is not covered by any memblock region.
 */
int __meminit early_pfn_to_nid(unsigned long pfn)
{
	static DEFINE_SPINLOCK(early_pfn_lock);
	int nid;

	spin_lock(&early_pfn_lock);
	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
	if (nid < 0)
		nid = first_online_node;
	spin_unlock(&early_pfn_lock);

	return nid;
}
#endif /* CONFIG_NEED_MULTIPLE_NODES */

/* Free memblock-managed pages, unless their init is deferred. */
void __init memblock_free_pages(struct page *page, unsigned long pfn,
							unsigned int order)
{
	if (early_page_uninitialised(pfn))
		return;
	__free_pages_core(page, order);
}

/*
 * Check that the whole (or subset of) a pageblock given by the interval of
 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
 * with the migration of free compaction scanner. The scanners then need to
 * use only pfn_valid_within() check for arches that allow holes within
 * pageblocks.
 *
 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
 *
 * It's possible on some configurations to have a setup like node0 node1 node0
 * i.e. it's possible that all pages within a zones range of pages do not
 * belong to a single zone. We assume that a border between node0 and node1
 * can occur within a single pageblock, but not a node0 node1 node0
 * interleaving within a single pageblock. It is therefore sufficient to check
 * the first and last page of a pageblock and avoid checking each individual
 * page in a pageblock.
 */
struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
				     unsigned long end_pfn, struct zone *zone)
{
	struct page *start_page;
	struct page *end_page;

	/* end_pfn is one past the range we are checking */
	end_pfn--;

	if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
		return NULL;

	start_page = pfn_to_online_page(start_pfn);
	if (!start_page)
		return NULL;

	if (page_zone(start_page) != zone)
		return NULL;

	end_page = pfn_to_page(end_pfn);

	/* This gives a shorter code than deriving page_zone(end_page) */
	if (page_zone_id(start_page) != page_zone_id(end_page))
		return NULL;

	return start_page;
}

/*
 * Walk the zone pageblock by pageblock and mark it contiguous only if
 * every pageblock passes the __pageblock_pfn_to_page() checks.
 */
void set_zone_contiguous(struct zone *zone)
{
	unsigned long block_start_pfn = zone->zone_start_pfn;
	unsigned long block_end_pfn;

	block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
	for (; block_start_pfn < zone_end_pfn(zone);
			block_start_pfn = block_end_pfn,
			 block_end_pfn += pageblock_nr_pages) {

		block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));

		if (!__pageblock_pfn_to_page(block_start_pfn,
					     block_end_pfn, zone))
			return;
		cond_resched();
	}

	/* We confirm that there is no hole */
	zone->contiguous = true;
}

void clear_zone_contiguous(struct zone *zone)
{
	zone->contiguous = false;
}

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
 * Free [pfn, pfn + nr_pages) to the buddy allocator: as one
 * pageblock-order block when naturally aligned and exactly a pageblock,
 * otherwise page by page (setting the migratetype at block boundaries).
 */
static void __init deferred_free_range(unsigned long pfn,
				       unsigned long nr_pages)
{
	struct page *page;
	unsigned long i;

	if (!nr_pages)
		return;

	page = pfn_to_page(pfn);

	/* Free a large naturally-aligned chunk if possible */
	if (nr_pages == pageblock_nr_pages &&
	    (pfn & (pageblock_nr_pages - 1)) == 0) {
		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
		__free_pages_core(page, pageblock_order);
		return;
	}

	for (i = 0; i < nr_pages; i++, page++, pfn++) {
		if ((pfn & (pageblock_nr_pages - 1)) == 0)
			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
17158c2ecf20Sopenharmony_ci __free_pages_core(page, 0); 17168c2ecf20Sopenharmony_ci } 17178c2ecf20Sopenharmony_ci} 17188c2ecf20Sopenharmony_ci 17198c2ecf20Sopenharmony_ci/* Completion tracking for deferred_init_memmap() threads */ 17208c2ecf20Sopenharmony_cistatic atomic_t pgdat_init_n_undone __initdata; 17218c2ecf20Sopenharmony_cistatic __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); 17228c2ecf20Sopenharmony_ci 17238c2ecf20Sopenharmony_cistatic inline void __init pgdat_init_report_one_done(void) 17248c2ecf20Sopenharmony_ci{ 17258c2ecf20Sopenharmony_ci if (atomic_dec_and_test(&pgdat_init_n_undone)) 17268c2ecf20Sopenharmony_ci complete(&pgdat_init_all_done_comp); 17278c2ecf20Sopenharmony_ci} 17288c2ecf20Sopenharmony_ci 17298c2ecf20Sopenharmony_ci/* 17308c2ecf20Sopenharmony_ci * Returns true if page needs to be initialized or freed to buddy allocator. 17318c2ecf20Sopenharmony_ci * 17328c2ecf20Sopenharmony_ci * First we check if pfn is valid on architectures where it is possible to have 17338c2ecf20Sopenharmony_ci * holes within pageblock_nr_pages. On systems where it is not possible, this 17348c2ecf20Sopenharmony_ci * function is optimized out. 17358c2ecf20Sopenharmony_ci * 17368c2ecf20Sopenharmony_ci * Then, we check if a current large page is valid by only checking the validity 17378c2ecf20Sopenharmony_ci * of the head pfn. 17388c2ecf20Sopenharmony_ci */ 17398c2ecf20Sopenharmony_cistatic inline bool __init deferred_pfn_valid(unsigned long pfn) 17408c2ecf20Sopenharmony_ci{ 17418c2ecf20Sopenharmony_ci if (!pfn_valid_within(pfn)) 17428c2ecf20Sopenharmony_ci return false; 17438c2ecf20Sopenharmony_ci if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) 17448c2ecf20Sopenharmony_ci return false; 17458c2ecf20Sopenharmony_ci return true; 17468c2ecf20Sopenharmony_ci} 17478c2ecf20Sopenharmony_ci 17488c2ecf20Sopenharmony_ci/* 17498c2ecf20Sopenharmony_ci * Free pages to buddy allocator. 
Try to free aligned pages in 17508c2ecf20Sopenharmony_ci * pageblock_nr_pages sizes. 17518c2ecf20Sopenharmony_ci */ 17528c2ecf20Sopenharmony_cistatic void __init deferred_free_pages(unsigned long pfn, 17538c2ecf20Sopenharmony_ci unsigned long end_pfn) 17548c2ecf20Sopenharmony_ci{ 17558c2ecf20Sopenharmony_ci unsigned long nr_pgmask = pageblock_nr_pages - 1; 17568c2ecf20Sopenharmony_ci unsigned long nr_free = 0; 17578c2ecf20Sopenharmony_ci 17588c2ecf20Sopenharmony_ci for (; pfn < end_pfn; pfn++) { 17598c2ecf20Sopenharmony_ci if (!deferred_pfn_valid(pfn)) { 17608c2ecf20Sopenharmony_ci deferred_free_range(pfn - nr_free, nr_free); 17618c2ecf20Sopenharmony_ci nr_free = 0; 17628c2ecf20Sopenharmony_ci } else if (!(pfn & nr_pgmask)) { 17638c2ecf20Sopenharmony_ci deferred_free_range(pfn - nr_free, nr_free); 17648c2ecf20Sopenharmony_ci nr_free = 1; 17658c2ecf20Sopenharmony_ci } else { 17668c2ecf20Sopenharmony_ci nr_free++; 17678c2ecf20Sopenharmony_ci } 17688c2ecf20Sopenharmony_ci } 17698c2ecf20Sopenharmony_ci /* Free the last block of pages to allocator */ 17708c2ecf20Sopenharmony_ci deferred_free_range(pfn - nr_free, nr_free); 17718c2ecf20Sopenharmony_ci} 17728c2ecf20Sopenharmony_ci 17738c2ecf20Sopenharmony_ci/* 17748c2ecf20Sopenharmony_ci * Initialize struct pages. We minimize pfn page lookups and scheduler checks 17758c2ecf20Sopenharmony_ci * by performing it only once every pageblock_nr_pages. 17768c2ecf20Sopenharmony_ci * Return number of pages initialized. 
 */
static unsigned long __init deferred_init_pages(struct zone *zone,
						unsigned long pfn,
						unsigned long end_pfn)
{
	unsigned long nr_pgmask = pageblock_nr_pages - 1;
	int nid = zone_to_nid(zone);
	unsigned long nr_pages = 0;
	int zid = zone_idx(zone);
	struct page *page = NULL;

	for (; pfn < end_pfn; pfn++) {
		if (!deferred_pfn_valid(pfn)) {
			/* Hole: force a fresh pfn_to_page() at the next valid pfn */
			page = NULL;
			continue;
		} else if (!page || !(pfn & nr_pgmask)) {
			/* First valid pfn, or a pageblock head: do the lookup */
			page = pfn_to_page(pfn);
		} else {
			/* Within a pageblock the struct pages are consecutive */
			page++;
		}
		__init_single_page(page, pfn, zid, nid);
		nr_pages++;
	}
	return (nr_pages);
}

/*
 * This function is meant to pre-load the iterator for the zone init.
 * Specifically it walks through the ranges until we are caught up to the
 * first_init_pfn value and exits there. If we never encounter the value we
 * return false indicating there are no valid ranges left.
 */
static bool __init
deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
				    unsigned long *spfn, unsigned long *epfn,
				    unsigned long first_init_pfn)
{
	u64 j;

	/*
	 * Start out by walking through the ranges in this zone that have
	 * already been initialized. We don't need to do anything with them
	 * so we just need to flush them out of the system.
	 */
	for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
		if (*epfn <= first_init_pfn)
			continue;
		/* Clamp the range start so already-initialized pfns are skipped */
		if (*spfn < first_init_pfn)
			*spfn = first_init_pfn;
		/* Hand the iterator position back to the caller via *i */
		*i = j;
		return true;
	}

	return false;
}

/*
 * Initialize and free pages. We do it in two loops: first we initialize
 * struct page, then free to buddy allocator, because while we are
 * freeing pages we can access pages that are ahead (computing buddy
 * page in __free_one_page()).
 *
 * In order to try and keep some memory in the cache we have the loop
 * broken along max page order boundaries. This way we will not cause
 * any issues with the buddy page computation.
 */
static unsigned long __init
deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
		       unsigned long *end_pfn)
{
	/* First MAX_ORDER-aligned boundary strictly past *start_pfn */
	unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
	unsigned long spfn = *start_pfn, epfn = *end_pfn;
	unsigned long nr_pages = 0;
	u64 j = *i;

	/* First we loop through and initialize the page values */
	for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
		unsigned long t;

		if (mo_pfn <= *start_pfn)
			break;

		t = min(mo_pfn, *end_pfn);
		nr_pages += deferred_init_pages(zone, *start_pfn, t);

		if (mo_pfn < *end_pfn) {
			/* Resume inside this range next call */
			*start_pfn = mo_pfn;
			break;
		}
	}

	/* Reset values and now loop through freeing pages as needed */
	swap(j, *i);

	/* Second pass: free the pages just initialized, same clamping */
	for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
		unsigned long t;

		if (mo_pfn <= spfn)
			break;

		t = min(mo_pfn, epfn);
		deferred_free_pages(spfn, t);

		if (mo_pfn <= epfn)
			break;
	}

	return nr_pages;
}

/*
 * padata thread function: initialize and free every pfn of @zone (passed
 * via @arg) that falls within [start_pfn, end_pfn).
 */
static void __init
deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
			   void *arg)
{
	unsigned long spfn, epfn;
	struct zone *zone = arg;
	u64 i;

	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);

	/*
	 * Initialize and free pages in MAX_ORDER sized increments so that we
	 * can avoid introducing any issues with the buddy allocator.
	 */
	while (spfn < end_pfn) {
		deferred_init_maxorder(&i, zone, &spfn, &epfn);
		cond_resched();
	}
}

/* An arch may override for more concurrency.
 */
__weak int __init
deferred_page_init_max_threads(const struct cpumask *node_cpumask)
{
	/* Default: a single init thread per node unless the arch overrides */
	return 1;
}

/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
	pg_data_t *pgdat = data;
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
	unsigned long spfn = 0, epfn = 0;
	unsigned long first_init_pfn, flags;
	unsigned long start = jiffies;
	struct zone *zone;
	int zid, max_threads;
	u64 i;

	/* Bind memory initialisation thread to a local node if possible */
	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(current, cpumask);

	pgdat_resize_lock(pgdat, &flags);
	first_init_pfn = pgdat->first_deferred_pfn;
	if (first_init_pfn == ULONG_MAX) {
		/* Nothing deferred on this node; still report completion */
		pgdat_resize_unlock(pgdat, &flags);
		pgdat_init_report_one_done();
		return 0;
	}

	/* Sanity check boundaries */
	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
	BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
	pgdat->first_deferred_pfn = ULONG_MAX;

	/*
	 * Once we unlock here, the zone cannot be grown anymore, thus if an
	 * interrupt thread must allocate this early in boot, zone must be
	 * pre-grown prior to start of deferred page initialization.
	 */
	pgdat_resize_unlock(pgdat, &flags);

	/* Only the highest zone is deferred so find it */
	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		zone = pgdat->node_zones + zid;
		if (first_init_pfn < zone_end_pfn(zone))
			break;
	}

	/* If the zone is empty somebody else may have cleared out the zone */
	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
						 first_init_pfn))
		goto zone_empty;

	max_threads = deferred_page_init_max_threads(cpumask);

	/* Farm out one section-aligned stretch at a time to padata workers */
	while (spfn < epfn) {
		unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
		struct padata_mt_job job = {
			.thread_fn	= deferred_init_memmap_chunk,
			.fn_arg		= zone,
			.start		= spfn,
			.size		= epfn_align - spfn,
			.align		= PAGES_PER_SECTION,
			.min_chunk	= PAGES_PER_SECTION,
			.max_threads	= max_threads,
		};

		padata_do_multithreaded(&job);
		deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
						    epfn_align);
	}
zone_empty:
	/* Sanity check that the next zone really is unpopulated */
	WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));

	pr_info("node %d deferred pages initialised in %ums\n",
		pgdat->node_id, jiffies_to_msecs(jiffies - start));

	pgdat_init_report_one_done();
	return 0;
}

/*
 * If this zone has deferred pages, try to grow it by initializing enough
 * deferred pages to satisfy the allocation specified by order, rounded up to
 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
 * of SECTION_SIZE bytes by initializing struct pages in increments of
 * PAGES_PER_SECTION * sizeof(struct page) bytes.
 *
 * Return true when zone was grown, otherwise return false. We return true even
 * when we grow less than requested, to let the caller decide if there are
 * enough pages to satisfy the allocation.
 *
 * Note: We use noinline because this function is needed only during boot, and
 * it is called from a __ref function _deferred_grow_zone. This way we are
 * making sure that it is not inlined into permanent text section.
 */
static noinline bool __init
deferred_grow_zone(struct zone *zone, unsigned int order)
{
	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
	pg_data_t *pgdat = zone->zone_pgdat;
	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
	unsigned long spfn, epfn, flags;
	unsigned long nr_pages = 0;
	u64 i;

	/* Only the last zone may have deferred pages */
	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
		return false;

	pgdat_resize_lock(pgdat, &flags);

	/*
	 * If someone grew this zone while we were waiting for spinlock, return
	 * true, as there might be enough pages already.
	 */
	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
		pgdat_resize_unlock(pgdat, &flags);
		return true;
	}

	/* If the zone is empty somebody else may have cleared out the zone */
	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
						 first_deferred_pfn)) {
		pgdat->first_deferred_pfn = ULONG_MAX;
		pgdat_resize_unlock(pgdat, &flags);
		/* Retry only once. */
		return first_deferred_pfn != ULONG_MAX;
	}

	/*
	 * Initialize and free pages in MAX_ORDER sized increments so
	 * that we can avoid introducing any issues with the buddy
	 * allocator.
	 */
	while (spfn < epfn) {
		/* update our first deferred PFN for this section */
		first_deferred_pfn = spfn;

		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
		touch_nmi_watchdog();

		/* We should only stop along section boundaries */
		if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
			continue;

		/* If our quota has been met we can stop here */
		if (nr_pages >= nr_pages_needed)
			break;
	}

	/* Record where the next on-demand grow should resume */
	pgdat->first_deferred_pfn = spfn;
	pgdat_resize_unlock(pgdat, &flags);

	return nr_pages > 0;
}

/*
 * deferred_grow_zone() is __init, but it is called from
 * get_page_from_freelist() during early boot until deferred_pages permanently
 * disables this call. This is why we have refdata wrapper to avoid warning,
 * and to ensure that the function body gets unloaded.
 */
static bool __ref
_deferred_grow_zone(struct zone *zone, unsigned int order)
{
	return deferred_grow_zone(zone, order);
}

#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

/*
 * Late boot: finish deferred memmap initialization (one kthread per memory
 * node), refresh pcp limits, discard memblock metadata, shuffle free lists
 * and recompute per-zone contiguity.
 */
void __init page_alloc_init_late(void)
{
	struct zone *zone;
	int nid;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT

	/* There will be num_node_state(N_MEMORY) threads */
	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
	for_each_node_state(nid, N_MEMORY) {
		kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
	}

	/* Block until all are initialised */
	wait_for_completion(&pgdat_init_all_done_comp);

	/*
	 * The number of managed pages has changed due to the initialisation
	 * so the pcpu batch and high limits needs to be updated or the limits
	 * will be artificially small.
	 */
	for_each_populated_zone(zone)
		zone_pcp_update(zone);

	/*
	 * We initialized the rest of the deferred pages.  Permanently disable
	 * on-demand struct page initialization.
	 */
	static_branch_disable(&deferred_pages);

	/* Reinit limits that are based on free pages after the kernel is up */
	files_maxfiles_init();
#endif

	/* Discard memblock private memory */
	memblock_discard();

	for_each_node_state(nid, N_MEMORY)
		shuffle_free_memory(NODE_DATA(nid));

	for_each_populated_zone(zone)
		set_zone_contiguous(zone);
}

#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
	unsigned i = pageblock_nr_pages;
	struct page *p = page;

	/* Drop the memblock-era Reserved flag and refcount on every page */
	do {
		__ClearPageReserved(p);
		set_page_count(p, 0);
	} while (++p, --i);

	set_pageblock_migratetype(page, MIGRATE_CMA);

	if (pageblock_order >= MAX_ORDER) {
		/* Pageblock is too big for one buddy page: free in
		 * MAX_ORDER - 1 sized pieces instead. */
		i = pageblock_nr_pages;
		p = page;
		do {
			set_page_refcounted(p);
			__free_pages(p, MAX_ORDER - 1);
			p += MAX_ORDER_NR_PAGES;
		} while (i -= MAX_ORDER_NR_PAGES);
	} else {
		set_page_refcounted(page);
		__free_pages(page, pageblock_order);
	}

	adjust_managed_page_count(page, pageblock_nr_pages);
}
#endif

/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- nyc
 */
/*
 * Split a page of order @high down to order @low: each halving step puts
 * the upper buddy back on the free list of the reduced order (or lets
 * set_page_guard() claim it), keeping the lower half for the caller.
 */
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		high--;
		size >>= 1;
		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

		/*
		 * Mark as guard pages (or page), that will allow to
		 * merge back to allocator when buddy will be freed.
		 * Corresponding page table entries will not be touched,
		 * pages will stay not present in virtual address space
		 */
		if (set_page_guard(zone, &page[size], high, migratetype))
			continue;

		add_to_free_list(&page[size], zone, high, migratetype);
		set_buddy_order(&page[size], high);
	}
}

/* Report a page that failed its allocation-time sanity check */
static void check_new_page_bad(struct page *page)
{
	if (unlikely(page->flags & __PG_HWPOISON)) {
		/* Don't complain about hwpoisoned pages */
		page_mapcount_reset(page);	/* remove PageBuddy */
		return;
	}

	bad_page(page,
		 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
}

22068c2ecf20Sopenharmony_ci/* 22078c2ecf20Sopenharmony_ci * This page is about to be returned from the page allocator 22088c2ecf20Sopenharmony_ci */ 22098c2ecf20Sopenharmony_cistatic inline int check_new_page(struct page *page) 22108c2ecf20Sopenharmony_ci{ 22118c2ecf20Sopenharmony_ci if (likely(page_expected_state(page, 22128c2ecf20Sopenharmony_ci PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) 22138c2ecf20Sopenharmony_ci return 0; 22148c2ecf20Sopenharmony_ci 22158c2ecf20Sopenharmony_ci check_new_page_bad(page); 22168c2ecf20Sopenharmony_ci return 1; 22178c2ecf20Sopenharmony_ci} 22188c2ecf20Sopenharmony_ci 22198c2ecf20Sopenharmony_cistatic inline bool free_pages_prezeroed(void) 22208c2ecf20Sopenharmony_ci{ 22218c2ecf20Sopenharmony_ci return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && 22228c2ecf20Sopenharmony_ci page_poisoning_enabled()) || want_init_on_free(); 22238c2ecf20Sopenharmony_ci} 22248c2ecf20Sopenharmony_ci 22258c2ecf20Sopenharmony_ci#ifdef CONFIG_DEBUG_VM 22268c2ecf20Sopenharmony_ci/* 22278c2ecf20Sopenharmony_ci * With DEBUG_VM enabled, order-0 pages are checked for expected state when 22288c2ecf20Sopenharmony_ci * being allocated from pcp lists. With debug_pagealloc also enabled, they are 22298c2ecf20Sopenharmony_ci * also checked when pcp lists are refilled from the free lists. 
 */
static inline bool check_pcp_refill(struct page *page)
{
	if (debug_pagealloc_enabled_static())
		return check_new_page(page);
	else
		return false;
}

static inline bool check_new_pcp(struct page *page)
{
	return check_new_page(page);
}
#else
/*
 * With DEBUG_VM disabled, free order-0 pages are checked for expected state
 * when pcp lists are being refilled from the free lists. With debug_pagealloc
 * enabled, they are also checked when being allocated from the pcp lists.
 */
static inline bool check_pcp_refill(struct page *page)
{
	return check_new_page(page);
}
static inline bool check_new_pcp(struct page *page)
{
	if (debug_pagealloc_enabled_static())
		return check_new_page(page);
	else
		return false;
}
#endif /* CONFIG_DEBUG_VM */

/* Check every page of an order-@order block; true means a bad page was found */
static bool check_new_pages(struct page *page, unsigned int order)
{
	int i;
	for (i = 0; i < (1 << order); i++) {
		struct page *p = page + i;

		if (unlikely(check_new_page(p)))
			return true;
	}

	return false;
}

/*
 * Common post-allocation setup: reset private data, take the first
 * reference, and run the arch/kasan/poison/page_owner hooks in order.
 */
inline void post_alloc_hook(struct page *page, unsigned int order,
				gfp_t gfp_flags)
{
	set_page_private(page, 0);
	set_page_refcounted(page);

	arch_alloc_page(page, order);
	if (debug_pagealloc_enabled_static())
		kernel_map_pages(page, 1 << order, 1);
	kasan_alloc_pages(page, order);
	kernel_poison_pages(page, 1 << order, 1);
	set_page_owner(page, order, gfp_flags);
}

/* Final page preparation: zero-init if needed, compound setup, pfmemalloc */
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
							unsigned int alloc_flags)
{
	post_alloc_hook(page, order, gfp_flags);

	/* Skip explicit zeroing when freeing already left the pages zeroed */
	if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
		kernel_init_free_pages(page, 1 << order);

	if (order && (gfp_flags & __GFP_COMP))
		prep_compound_page(page, order);

	/*
	 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
	 * allocate the page. The expectation is that the caller is taking
	 * steps that will free more memory. The caller should avoid the page
	 * being used for !PFMEMALLOC purposes.
	 */
	if (alloc_flags & ALLOC_NO_WATERMARKS)
		set_page_pfmemalloc(page);
	else
		clear_page_pfmemalloc(page);
}

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		page = get_page_from_free_area(area, migratetype);
		if (!page)
			continue;
		del_page_from_free_list(page, zone, current_order);
		/* Give back the unused upper half(s) of a too-large block */
		expand(zone, page, order, current_order, migratetype);
		set_pcppage_migratetype(page, migratetype);
		return page;
	}

	return NULL;
}


/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable
 migrate type are depleted.
 * Each row is tried left to right; MIGRATE_TYPES terminates the row.
 */
static int fallbacks[MIGRATE_TYPES][3] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
#ifdef CONFIG_CMA
	[MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	[MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
#endif
};

/*
 * Grab the smallest suitable block from the MIGRATE_CMA freelists, or
 * nothing when CMA is compiled out.
 */
#ifdef CONFIG_CMA
static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
					unsigned int order)
{
	return __rmqueue_smallest(zone, order, MIGRATE_CMA);
}
#else
static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
					unsigned int order) { return NULL; }
#endif

/*
 * Move the free pages in a range to the freelist tail of the requested type.
 * Note that start_pfn and end_pfn are not required to be aligned on a
 * pageblock boundary.
 If alignment is required, use move_freepages_block()
 */
/*
 * Returns the number of free pages moved. If @num_movable is non-NULL, it
 * additionally counts allocated pages in the range that look movable
 * (LRU or __PageMovable).
 */
static int move_freepages(struct zone *zone,
			  unsigned long start_pfn, unsigned long end_pfn,
			  int migratetype, int *num_movable)
{
	struct page *page;
	unsigned long pfn;
	unsigned int order;
	int pages_moved = 0;

	for (pfn = start_pfn; pfn <= end_pfn;) {
		if (!pfn_valid_within(pfn)) {
			pfn++;
			continue;
		}

		page = pfn_to_page(pfn);
		if (!PageBuddy(page)) {
			/*
			 * We assume that pages that could be isolated for
			 * migration are movable. But we don't actually try
			 * isolating, as that would be expensive.
			 */
			if (num_movable &&
					(PageLRU(page) || __PageMovable(page)))
				(*num_movable)++;
			pfn++;
			continue;
		}

		/* Make sure we are not inadvertently changing nodes */
		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
		VM_BUG_ON_PAGE(page_zone(page) != zone, page);

		order = buddy_order(page);
		move_to_free_list(page, zone, order, migratetype);
		/* Skip the whole buddy block we just moved */
		pfn += 1 << order;
		pages_moved += 1 << order;
	}

	return pages_moved;
}

/*
 * Move the free pages of the pageblock containing @page to @migratetype,
 * clamped so the walk never crosses a zone boundary. Returns the number
 * of free pages moved (0 if the block's end lies outside the zone).
 */
int move_freepages_block(struct zone *zone, struct page *page,
				int migratetype, int *num_movable)
{
	unsigned long start_pfn, end_pfn, pfn;

	if (num_movable)
		*num_movable = 0;

	pfn = page_to_pfn(page);
	start_pfn = pfn & ~(pageblock_nr_pages - 1);
	end_pfn = start_pfn + pageblock_nr_pages - 1;

	/* Do not cross zone boundaries */
	if (!zone_spans_pfn(zone, start_pfn))
		start_pfn = pfn;
	if (!zone_spans_pfn(zone, end_pfn))
		return 0;

	return move_freepages(zone, start_pfn, end_pfn, migratetype,
								num_movable);
}

/*
 * Retag every pageblock covered by an order >= pageblock_order block
 * with the given migratetype.
 */
static void change_pageblock_range(struct page *pageblock_page,
					int start_order, int migratetype)
{
	int nr_pageblocks = 1 << (start_order - pageblock_order);

	while (nr_pageblocks--) {
		set_pageblock_migratetype(pageblock_page, migratetype);
		pageblock_page += pageblock_nr_pages;
	}
}

/*
 * When we are falling back to another migratetype during allocation, try to
 * steal extra free pages from the same pageblocks to satisfy further
 * allocations, instead of polluting multiple pageblocks.
 *
 * If we are stealing a relatively large buddy page, it is likely there will
 * be more free pages in the pageblock, so try to steal them all. For
 * reclaimable and unmovable allocations, we steal regardless of page size,
 * as fragmentation caused by those allocations polluting movable pageblocks
 * is worse than movable allocations stealing from unmovable and reclaimable
 * pageblocks.
 */
static bool can_steal_fallback(unsigned int order, int start_mt)
{
	/*
	 * Leaving this order check is intended, although there is
	 * relaxed order check in next check. The reason is that
	 * we can actually steal whole pageblock if this condition met,
	 * but, below check doesn't guarantee it and that is just heuristic
	 * so could be changed anytime.
	 */
	if (order >= pageblock_order)
		return true;

	if (order >= pageblock_order / 2 ||
		start_mt == MIGRATE_RECLAIMABLE ||
		start_mt == MIGRATE_UNMOVABLE ||
		page_group_by_mobility_disabled)
		return true;

	return false;
}

/*
 * Temporarily raise the zone's watermark (up to a cap derived from
 * watermark_boost_factor) so kswapd reclaims ahead of further fallbacks.
 * Returns true if the boost was applied.
 */
static inline bool boost_watermark(struct zone *zone)
{
	unsigned long max_boost;

	if (!watermark_boost_factor)
		return false;
	/*
	 * Don't bother in zones that are unlikely to produce results.
	 * On small machines, including kdump capture kernels running
	 * in a small area, boosting the watermark can cause an out of
	 * memory situation immediately.
	 */
	if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
		return false;

	max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
			watermark_boost_factor, 10000);

	/*
	 * high watermark may be uninitialised if fragmentation occurs
	 * very early in boot so do not boost. We do not fall
	 * through and boost by pageblock_nr_pages as failing
	 * allocations that early means that reclaim is not going
	 * to help and it may even be impossible to reclaim the
	 * boosted watermark resulting in a hang.
	 */
	if (!max_boost)
		return false;

	max_boost = max(pageblock_nr_pages, max_boost);

	zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
		max_boost);

	return true;
}

/*
 * This function implements actual steal behaviour. If order is large enough,
 * we can steal whole pageblock. If not, we first move freepages in this
 * pageblock to our migratetype and determine how many already-allocated pages
 * are there in the pageblock with a compatible migratetype.
 If at least half
 * of pages are free or compatible, we can change migratetype of the pageblock
 * itself, so pages freed in the future will be put on the correct free list.
 *
 * Called with zone->lock held (this manipulates freelists directly).
 */
static void steal_suitable_fallback(struct zone *zone, struct page *page,
		unsigned int alloc_flags, int start_type, bool whole_block)
{
	unsigned int current_order = buddy_order(page);
	int free_pages, movable_pages, alike_pages;
	int old_block_type;

	old_block_type = get_pageblock_migratetype(page);

	/*
	 * This can happen due to races and we want to prevent broken
	 * highatomic accounting.
	 */
	if (is_migrate_highatomic(old_block_type))
		goto single_page;

	/* Take ownership for orders >= pageblock_order */
	if (current_order >= pageblock_order) {
		change_pageblock_range(page, current_order, start_type);
		goto single_page;
	}

	/*
	 * Boost watermarks to increase reclaim pressure to reduce the
	 * likelihood of future fallbacks. Wake kswapd now as the node
	 * may be balanced overall and kswapd will not wake naturally.
	 */
	if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
		set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);

	/* We are not allowed to try stealing from the whole block */
	if (!whole_block)
		goto single_page;

	free_pages = move_freepages_block(zone, page, start_type,
						&movable_pages);
	/*
	 * Determine how many pages are compatible with our allocation.
	 * For movable allocation, it's the number of movable pages which
	 * we just obtained. For other types it's a bit more tricky.
	 */
	if (start_type == MIGRATE_MOVABLE) {
		alike_pages = movable_pages;
	} else {
		/*
		 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
		 * to MOVABLE pageblock, consider all non-movable pages as
		 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
		 * vice versa, be conservative since we can't distinguish the
		 * exact migratetype of non-movable pages.
		 */
		if (old_block_type == MIGRATE_MOVABLE)
			alike_pages = pageblock_nr_pages
						- (free_pages + movable_pages);
		else
			alike_pages = 0;
	}

	/* moving whole block can fail due to zone boundary conditions */
	if (!free_pages)
		goto single_page;

	/*
	 * If a sufficient number of pages in the block are either free or of
	 * comparable migratability as our allocation, claim the whole block.
	 */
	if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
			page_group_by_mobility_disabled)
		set_pageblock_migratetype(page, start_type);

	return;

single_page:
	/* Steal only this one buddy block, leaving the pageblock type alone */
	move_to_free_list(page, zone, current_order, start_type);
}

/*
 * Check whether there is a suitable fallback freepage with requested order.
 * If only_stealable is true, this function returns fallback_mt only if
 * we can steal other freepages all together. This would help to reduce
 * fragmentation due to mixed migratetype pages in one pageblock.
 *
 * Returns the fallback migratetype, or -1 if none is suitable. *can_steal
 * is set to whether whole-pageblock stealing is allowed for this order.
 */
int find_suitable_fallback(struct free_area *area, unsigned int order,
			int migratetype, bool only_stealable, bool *can_steal)
{
	int i;
	int fallback_mt;

	if (area->nr_free == 0)
		return -1;

	*can_steal = false;
	for (i = 0;; i++) {
		/* Walk this migratetype's fallback row until the sentinel */
		fallback_mt = fallbacks[migratetype][i];
		if (fallback_mt == MIGRATE_TYPES)
			break;

		if (free_area_empty(area, fallback_mt))
			continue;

		if (can_steal_fallback(order, migratetype))
			*can_steal = true;

		if (!only_stealable)
			return fallback_mt;

		if (*can_steal)
			return fallback_mt;
	}

	return -1;
}

/*
 * Reserve a pageblock for exclusive use of high-order atomic allocations if
 * there are no empty page blocks that contain a page with a suitable order
 */
static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
				unsigned int alloc_order)
{
	int mt;
	unsigned long max_managed, flags;

	/*
	 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
	 * Check is race-prone but harmless.
	 */
	max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
	if (zone->nr_reserved_highatomic >= max_managed)
		return;

	spin_lock_irqsave(&zone->lock, flags);

	/* Recheck the nr_reserved_highatomic limit under the lock */
	if (zone->nr_reserved_highatomic >= max_managed)
		goto out_unlock;

	/* Yoink! */
	mt = get_pageblock_migratetype(page);
	/* Only normal pageblocks may become the highatomic reserve */
	if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
	    && !is_migrate_cma(mt)) {
		zone->nr_reserved_highatomic += pageblock_nr_pages;
		set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
		move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
	}

out_unlock:
	spin_unlock_irqrestore(&zone->lock, flags);
}

/*
 * Used when an allocation is about to fail under memory pressure. This
 * potentially hurts the reliability of high-order allocations when under
 * intense memory pressure but failed atomic allocations should be easier
 * to recover from than an OOM.
 *
 * If @force is true, try to unreserve a pageblock even though highatomic
 * pageblock is exhausted.
 *
 * Returns true if at least one page was moved back to the allocation
 * context's migratetype, false otherwise.
 */
static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
						bool force)
{
	struct zonelist *zonelist = ac->zonelist;
	unsigned long flags;
	struct zoneref *z;
	struct zone *zone;
	struct page *page;
	int order;
	bool ret;

	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
								ac->nodemask) {
		/*
		 * Preserve at least one pageblock unless memory pressure
		 * is really high.
		 */
		if (!force && zone->nr_reserved_highatomic <=
					pageblock_nr_pages)
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			struct free_area *area = &(zone->free_area[order]);

			page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
			if (!page)
				continue;

			/*
			 * In page freeing path, migratetype change is racy so
			 * we can encounter several free pages in a pageblock
			 * in this loop although we changed the pageblock type
			 * from highatomic to ac->migratetype. So we should
			 * adjust the count once.
			 */
			if (is_migrate_highatomic_page(page)) {
				/*
				 * It should never happen but changes to
				 * locking could inadvertently allow a per-cpu
				 * drain to add pages to MIGRATE_HIGHATOMIC
				 * while unreserving so be safe and watch for
				 * underflows.
				 */
				zone->nr_reserved_highatomic -= min(
						pageblock_nr_pages,
						zone->nr_reserved_highatomic);
			}

			/*
			 * Convert to ac->migratetype and avoid the normal
			 * pageblock stealing heuristics. Minimally, the caller
			 * is doing the work and needs the pages. More
			 * importantly, if the block was always converted to
			 * MIGRATE_UNMOVABLE or another type then the number
			 * of pageblocks that cannot be completely freed
			 * may increase.
			 */
			set_pageblock_migratetype(page, ac->migratetype);
			ret = move_freepages_block(zone, page, ac->migratetype,
									NULL);
			if (ret) {
				spin_unlock_irqrestore(&zone->lock, flags);
				return ret;
			}
		}
		spin_unlock_irqrestore(&zone->lock, flags);
	}

	return false;
}

/*
 * Try finding a free buddy page on the fallback list and put it on the free
 * list of requested migratetype, possibly along with other pages from the same
 * block, depending on fragmentation avoidance heuristics. Returns true if
 * fallback was found so that __rmqueue_smallest() can grab it.
 *
 * The use of signed ints for order and current_order is a deliberate
 * deviation from the rest of this file, to make the for loop
 * condition simpler.
 */
static __always_inline bool
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
						unsigned int alloc_flags)
{
	struct free_area *area;
	int current_order;
	int min_order = order;
	struct page *page;
	int fallback_mt;
	bool can_steal;

	/*
	 * Do not steal pages from freelists belonging to other pageblocks
	 * i.e. orders < pageblock_order. If there are no local zones free,
	 * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
	 */
	if (alloc_flags & ALLOC_NOFRAGMENT)
		min_order = pageblock_order;

	/*
	 * Find the largest available free page in the other list. This roughly
	 * approximates finding the pageblock with the most free pages, which
	 * would be too costly to do exactly.
	 */
	for (current_order = MAX_ORDER - 1; current_order >= min_order;
				--current_order) {
		area = &(zone->free_area[current_order]);
		fallback_mt = find_suitable_fallback(area, current_order,
				start_migratetype, false, &can_steal);
		if (fallback_mt == -1)
			continue;

		/*
		 * We cannot steal all free pages from the pageblock and the
		 * requested migratetype is movable. In that case it's better to
		 * steal and split the smallest available page instead of the
		 * largest available page, because even if the next movable
		 * allocation falls back into a different pageblock than this
		 * one, it won't cause permanent fragmentation.
		 */
		if (!can_steal && start_migratetype == MIGRATE_MOVABLE
					&& current_order > order)
			goto find_smallest;

		goto do_steal;
	}

	return false;

find_smallest:
	/* Movable + cannot steal whole block: take the smallest fit instead */
	for (current_order = order; current_order < MAX_ORDER;
							current_order++) {
		area = &(zone->free_area[current_order]);
		fallback_mt = find_suitable_fallback(area, current_order,
				start_migratetype, false, &can_steal);
		if (fallback_mt != -1)
			break;
	}

	/*
	 * This should not happen - we already found a suitable fallback
	 * when looking for the largest page.
	 */
	VM_BUG_ON(current_order == MAX_ORDER);

do_steal:
	page = get_page_from_free_area(area, fallback_mt);

	steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
								can_steal);

	trace_mm_page_alloc_extfrag(page, order, current_order,
		start_migratetype, fallback_mt);

	return true;

}

/*
 * CMA-reuse allocation path (CONFIG_CMA_REUSE): first try the requested
 * migratetype; when that was MIGRATE_CMA and is empty, retarget the request
 * to MIGRATE_MOVABLE (dropping ALLOC_CMA) and, as a last resort, run the
 * normal fallback stealing and retry.
 */
static __always_inline struct page *
__rmqueue_with_cma_reuse(struct zone *zone, unsigned int order,
			int migratetype, unsigned int alloc_flags)
{
	struct page *page = NULL;
retry:
	page = __rmqueue_smallest(zone, order, migratetype);

	if (unlikely(!page) && is_migrate_cma(migratetype)) {
		migratetype = MIGRATE_MOVABLE;
		alloc_flags &= ~ALLOC_CMA;
		page = __rmqueue_smallest(zone, order, migratetype);
	}

	if (unlikely(!page) &&
	    __rmqueue_fallback(zone, order, migratetype, alloc_flags))
		goto retry;

	return page;
}

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
						unsigned int alloc_flags)
{
	struct page *page;

#ifdef CONFIG_CMA_REUSE
	/* With CMA reuse enabled, the dedicated path replaces all of the below */
	page = __rmqueue_with_cma_reuse(zone, order, migratetype, alloc_flags);
	goto out;
#endif

	if (IS_ENABLED(CONFIG_CMA)) {
		/*
		 * Balance movable allocations between regular and CMA areas by
		 * allocating from CMA when over half of the zone's free memory
		 * is in the CMA area.
		 */
		if (alloc_flags & ALLOC_CMA &&
		    zone_page_state(zone, NR_FREE_CMA_PAGES) >
		    zone_page_state(zone, NR_FREE_PAGES) / 2) {
			page = __rmqueue_cma_fallback(zone, order);
			if (page)
				goto out;
		}
	}
retry:
	page = __rmqueue_smallest(zone, order, migratetype);
	if (unlikely(!page)) {
		if (alloc_flags & ALLOC_CMA)
			page = __rmqueue_cma_fallback(zone, order);

		/* Fallback moved pages onto our freelist; grab them via retry */
		if (!page && __rmqueue_fallback(zone, order, migratetype,
								alloc_flags))
			goto retry;
	}
out:
	if (page)
		trace_mm_page_alloc_zone_locked(page, order, migratetype);
	return page;
}

/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency. Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 *
 * NOTE(review): uses plain spin_lock on zone->lock — presumably callers
 * run with interrupts already disabled; confirm against callers.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
			unsigned long count, struct list_head *list,
			int migratetype, unsigned int alloc_flags)
{
	int i, alloced = 0;

	spin_lock(&zone->lock);
	for (i = 0; i < count; ++i) {
		struct page *page = __rmqueue(zone, order, migratetype,
								alloc_flags);
		if (unlikely(page == NULL))
			break;

		/* Bad pages are skipped (leaked), but still accounted in i below */
		if (unlikely(check_pcp_refill(page)))
			continue;

		/*
		 * Split buddy pages returned by expand() are received here in
		 * physical page order. The page is added to the tail of
		 * caller's list. From the callers perspective, the linked list
		 * is ordered by page number under some conditions. This is
		 * useful for IO devices that can forward direction from the
		 * head, thus also in the physical page order. This is useful
		 * for IO devices that can merge IO requests if the physical
		 * pages are ordered properly.
		 */
		list_add_tail(&page->lru, list);
		alloced++;
		if (is_migrate_cma(get_pcppage_migratetype(page)))
			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
					      -(1 << order));
	}

	/*
	 * i pages were removed from the buddy list even if some leak due
	 * to check_pcp_refill failing so adjust NR_FREE_PAGES based
	 * on i. Do not confuse with 'alloced' which is the number of
	 * pages added to the pcp list.
	 */
	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
	spin_unlock(&zone->lock);
	return alloced;
}

#ifdef CONFIG_NUMA
/*
 * Called from the vmstat counter updater to drain pagesets of this
 * currently executing processor on remote nodes after they have
 * expired.
 *
 * Note that this function must be called with the thread pinned to
 * a single processor.
29678c2ecf20Sopenharmony_ci */ 29688c2ecf20Sopenharmony_civoid drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 29698c2ecf20Sopenharmony_ci{ 29708c2ecf20Sopenharmony_ci unsigned long flags; 29718c2ecf20Sopenharmony_ci int to_drain, batch; 29728c2ecf20Sopenharmony_ci 29738c2ecf20Sopenharmony_ci local_irq_save(flags); 29748c2ecf20Sopenharmony_ci batch = READ_ONCE(pcp->batch); 29758c2ecf20Sopenharmony_ci to_drain = min(pcp->count, batch); 29768c2ecf20Sopenharmony_ci if (to_drain > 0) 29778c2ecf20Sopenharmony_ci free_pcppages_bulk(zone, to_drain, pcp); 29788c2ecf20Sopenharmony_ci local_irq_restore(flags); 29798c2ecf20Sopenharmony_ci} 29808c2ecf20Sopenharmony_ci#endif 29818c2ecf20Sopenharmony_ci 29828c2ecf20Sopenharmony_ci/* 29838c2ecf20Sopenharmony_ci * Drain pcplists of the indicated processor and zone. 29848c2ecf20Sopenharmony_ci * 29858c2ecf20Sopenharmony_ci * The processor must either be the current processor and the 29868c2ecf20Sopenharmony_ci * thread pinned to the current processor or a processor that 29878c2ecf20Sopenharmony_ci * is not online. 29888c2ecf20Sopenharmony_ci */ 29898c2ecf20Sopenharmony_cistatic void drain_pages_zone(unsigned int cpu, struct zone *zone) 29908c2ecf20Sopenharmony_ci{ 29918c2ecf20Sopenharmony_ci unsigned long flags; 29928c2ecf20Sopenharmony_ci struct per_cpu_pageset *pset; 29938c2ecf20Sopenharmony_ci struct per_cpu_pages *pcp; 29948c2ecf20Sopenharmony_ci 29958c2ecf20Sopenharmony_ci local_irq_save(flags); 29968c2ecf20Sopenharmony_ci pset = per_cpu_ptr(zone->pageset, cpu); 29978c2ecf20Sopenharmony_ci 29988c2ecf20Sopenharmony_ci pcp = &pset->pcp; 29998c2ecf20Sopenharmony_ci if (pcp->count) 30008c2ecf20Sopenharmony_ci free_pcppages_bulk(zone, pcp->count, pcp); 30018c2ecf20Sopenharmony_ci local_irq_restore(flags); 30028c2ecf20Sopenharmony_ci} 30038c2ecf20Sopenharmony_ci 30048c2ecf20Sopenharmony_ci/* 30058c2ecf20Sopenharmony_ci * Drain pcplists of all zones on the indicated processor. 
30068c2ecf20Sopenharmony_ci * 30078c2ecf20Sopenharmony_ci * The processor must either be the current processor and the 30088c2ecf20Sopenharmony_ci * thread pinned to the current processor or a processor that 30098c2ecf20Sopenharmony_ci * is not online. 30108c2ecf20Sopenharmony_ci */ 30118c2ecf20Sopenharmony_cistatic void drain_pages(unsigned int cpu) 30128c2ecf20Sopenharmony_ci{ 30138c2ecf20Sopenharmony_ci struct zone *zone; 30148c2ecf20Sopenharmony_ci 30158c2ecf20Sopenharmony_ci for_each_populated_zone(zone) { 30168c2ecf20Sopenharmony_ci drain_pages_zone(cpu, zone); 30178c2ecf20Sopenharmony_ci } 30188c2ecf20Sopenharmony_ci} 30198c2ecf20Sopenharmony_ci 30208c2ecf20Sopenharmony_ci/* 30218c2ecf20Sopenharmony_ci * Spill all of this CPU's per-cpu pages back into the buddy allocator. 30228c2ecf20Sopenharmony_ci * 30238c2ecf20Sopenharmony_ci * The CPU has to be pinned. When zone parameter is non-NULL, spill just 30248c2ecf20Sopenharmony_ci * the single zone's pages. 30258c2ecf20Sopenharmony_ci */ 30268c2ecf20Sopenharmony_civoid drain_local_pages(struct zone *zone) 30278c2ecf20Sopenharmony_ci{ 30288c2ecf20Sopenharmony_ci int cpu = smp_processor_id(); 30298c2ecf20Sopenharmony_ci 30308c2ecf20Sopenharmony_ci if (zone) 30318c2ecf20Sopenharmony_ci drain_pages_zone(cpu, zone); 30328c2ecf20Sopenharmony_ci else 30338c2ecf20Sopenharmony_ci drain_pages(cpu); 30348c2ecf20Sopenharmony_ci} 30358c2ecf20Sopenharmony_ci 30368c2ecf20Sopenharmony_cistatic void drain_local_pages_wq(struct work_struct *work) 30378c2ecf20Sopenharmony_ci{ 30388c2ecf20Sopenharmony_ci struct pcpu_drain *drain; 30398c2ecf20Sopenharmony_ci 30408c2ecf20Sopenharmony_ci drain = container_of(work, struct pcpu_drain, work); 30418c2ecf20Sopenharmony_ci 30428c2ecf20Sopenharmony_ci /* 30438c2ecf20Sopenharmony_ci * drain_all_pages doesn't use proper cpu hotplug protection so 30448c2ecf20Sopenharmony_ci * we can race with cpu offline when the WQ can move this from 30458c2ecf20Sopenharmony_ci * a cpu pinned worker to an 
unbound one. We can operate on a different 30468c2ecf20Sopenharmony_ci * cpu which is allright but we also have to make sure to not move to 30478c2ecf20Sopenharmony_ci * a different one. 30488c2ecf20Sopenharmony_ci */ 30498c2ecf20Sopenharmony_ci preempt_disable(); 30508c2ecf20Sopenharmony_ci drain_local_pages(drain->zone); 30518c2ecf20Sopenharmony_ci preempt_enable(); 30528c2ecf20Sopenharmony_ci} 30538c2ecf20Sopenharmony_ci 30548c2ecf20Sopenharmony_ci/* 30558c2ecf20Sopenharmony_ci * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 30568c2ecf20Sopenharmony_ci * 30578c2ecf20Sopenharmony_ci * When zone parameter is non-NULL, spill just the single zone's pages. 30588c2ecf20Sopenharmony_ci * 30598c2ecf20Sopenharmony_ci * Note that this can be extremely slow as the draining happens in a workqueue. 30608c2ecf20Sopenharmony_ci */ 30618c2ecf20Sopenharmony_civoid drain_all_pages(struct zone *zone) 30628c2ecf20Sopenharmony_ci{ 30638c2ecf20Sopenharmony_ci int cpu; 30648c2ecf20Sopenharmony_ci 30658c2ecf20Sopenharmony_ci /* 30668c2ecf20Sopenharmony_ci * Allocate in the BSS so we wont require allocation in 30678c2ecf20Sopenharmony_ci * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 30688c2ecf20Sopenharmony_ci */ 30698c2ecf20Sopenharmony_ci static cpumask_t cpus_with_pcps; 30708c2ecf20Sopenharmony_ci 30718c2ecf20Sopenharmony_ci /* 30728c2ecf20Sopenharmony_ci * Make sure nobody triggers this path before mm_percpu_wq is fully 30738c2ecf20Sopenharmony_ci * initialized. 30748c2ecf20Sopenharmony_ci */ 30758c2ecf20Sopenharmony_ci if (WARN_ON_ONCE(!mm_percpu_wq)) 30768c2ecf20Sopenharmony_ci return; 30778c2ecf20Sopenharmony_ci 30788c2ecf20Sopenharmony_ci /* 30798c2ecf20Sopenharmony_ci * Do not drain if one is already in progress unless it's specific to 30808c2ecf20Sopenharmony_ci * a zone. Such callers are primarily CMA and memory hotplug and need 30818c2ecf20Sopenharmony_ci * the drain to be complete when the call returns. 
30828c2ecf20Sopenharmony_ci */ 30838c2ecf20Sopenharmony_ci if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { 30848c2ecf20Sopenharmony_ci if (!zone) 30858c2ecf20Sopenharmony_ci return; 30868c2ecf20Sopenharmony_ci mutex_lock(&pcpu_drain_mutex); 30878c2ecf20Sopenharmony_ci } 30888c2ecf20Sopenharmony_ci 30898c2ecf20Sopenharmony_ci /* 30908c2ecf20Sopenharmony_ci * We don't care about racing with CPU hotplug event 30918c2ecf20Sopenharmony_ci * as offline notification will cause the notified 30928c2ecf20Sopenharmony_ci * cpu to drain that CPU pcps and on_each_cpu_mask 30938c2ecf20Sopenharmony_ci * disables preemption as part of its processing 30948c2ecf20Sopenharmony_ci */ 30958c2ecf20Sopenharmony_ci for_each_online_cpu(cpu) { 30968c2ecf20Sopenharmony_ci struct per_cpu_pageset *pcp; 30978c2ecf20Sopenharmony_ci struct zone *z; 30988c2ecf20Sopenharmony_ci bool has_pcps = false; 30998c2ecf20Sopenharmony_ci 31008c2ecf20Sopenharmony_ci if (zone) { 31018c2ecf20Sopenharmony_ci pcp = per_cpu_ptr(zone->pageset, cpu); 31028c2ecf20Sopenharmony_ci if (pcp->pcp.count) 31038c2ecf20Sopenharmony_ci has_pcps = true; 31048c2ecf20Sopenharmony_ci } else { 31058c2ecf20Sopenharmony_ci for_each_populated_zone(z) { 31068c2ecf20Sopenharmony_ci pcp = per_cpu_ptr(z->pageset, cpu); 31078c2ecf20Sopenharmony_ci if (pcp->pcp.count) { 31088c2ecf20Sopenharmony_ci has_pcps = true; 31098c2ecf20Sopenharmony_ci break; 31108c2ecf20Sopenharmony_ci } 31118c2ecf20Sopenharmony_ci } 31128c2ecf20Sopenharmony_ci } 31138c2ecf20Sopenharmony_ci 31148c2ecf20Sopenharmony_ci if (has_pcps) 31158c2ecf20Sopenharmony_ci cpumask_set_cpu(cpu, &cpus_with_pcps); 31168c2ecf20Sopenharmony_ci else 31178c2ecf20Sopenharmony_ci cpumask_clear_cpu(cpu, &cpus_with_pcps); 31188c2ecf20Sopenharmony_ci } 31198c2ecf20Sopenharmony_ci 31208c2ecf20Sopenharmony_ci for_each_cpu(cpu, &cpus_with_pcps) { 31218c2ecf20Sopenharmony_ci struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu); 31228c2ecf20Sopenharmony_ci 31238c2ecf20Sopenharmony_ci 
drain->zone = zone; 31248c2ecf20Sopenharmony_ci INIT_WORK(&drain->work, drain_local_pages_wq); 31258c2ecf20Sopenharmony_ci queue_work_on(cpu, mm_percpu_wq, &drain->work); 31268c2ecf20Sopenharmony_ci } 31278c2ecf20Sopenharmony_ci for_each_cpu(cpu, &cpus_with_pcps) 31288c2ecf20Sopenharmony_ci flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work); 31298c2ecf20Sopenharmony_ci 31308c2ecf20Sopenharmony_ci mutex_unlock(&pcpu_drain_mutex); 31318c2ecf20Sopenharmony_ci} 31328c2ecf20Sopenharmony_ci 31338c2ecf20Sopenharmony_ci#ifdef CONFIG_HIBERNATION 31348c2ecf20Sopenharmony_ci 31358c2ecf20Sopenharmony_ci/* 31368c2ecf20Sopenharmony_ci * Touch the watchdog for every WD_PAGE_COUNT pages. 31378c2ecf20Sopenharmony_ci */ 31388c2ecf20Sopenharmony_ci#define WD_PAGE_COUNT (128*1024) 31398c2ecf20Sopenharmony_ci 31408c2ecf20Sopenharmony_civoid mark_free_pages(struct zone *zone) 31418c2ecf20Sopenharmony_ci{ 31428c2ecf20Sopenharmony_ci unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; 31438c2ecf20Sopenharmony_ci unsigned long flags; 31448c2ecf20Sopenharmony_ci unsigned int order, t; 31458c2ecf20Sopenharmony_ci struct page *page; 31468c2ecf20Sopenharmony_ci 31478c2ecf20Sopenharmony_ci if (zone_is_empty(zone)) 31488c2ecf20Sopenharmony_ci return; 31498c2ecf20Sopenharmony_ci 31508c2ecf20Sopenharmony_ci spin_lock_irqsave(&zone->lock, flags); 31518c2ecf20Sopenharmony_ci 31528c2ecf20Sopenharmony_ci max_zone_pfn = zone_end_pfn(zone); 31538c2ecf20Sopenharmony_ci for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 31548c2ecf20Sopenharmony_ci if (pfn_valid(pfn)) { 31558c2ecf20Sopenharmony_ci page = pfn_to_page(pfn); 31568c2ecf20Sopenharmony_ci 31578c2ecf20Sopenharmony_ci if (!--page_count) { 31588c2ecf20Sopenharmony_ci touch_nmi_watchdog(); 31598c2ecf20Sopenharmony_ci page_count = WD_PAGE_COUNT; 31608c2ecf20Sopenharmony_ci } 31618c2ecf20Sopenharmony_ci 31628c2ecf20Sopenharmony_ci if (page_zone(page) != zone) 31638c2ecf20Sopenharmony_ci continue; 31648c2ecf20Sopenharmony_ci 
31658c2ecf20Sopenharmony_ci if (!swsusp_page_is_forbidden(page)) 31668c2ecf20Sopenharmony_ci swsusp_unset_page_free(page); 31678c2ecf20Sopenharmony_ci } 31688c2ecf20Sopenharmony_ci 31698c2ecf20Sopenharmony_ci for_each_migratetype_order(order, t) { 31708c2ecf20Sopenharmony_ci list_for_each_entry(page, 31718c2ecf20Sopenharmony_ci &zone->free_area[order].free_list[t], lru) { 31728c2ecf20Sopenharmony_ci unsigned long i; 31738c2ecf20Sopenharmony_ci 31748c2ecf20Sopenharmony_ci pfn = page_to_pfn(page); 31758c2ecf20Sopenharmony_ci for (i = 0; i < (1UL << order); i++) { 31768c2ecf20Sopenharmony_ci if (!--page_count) { 31778c2ecf20Sopenharmony_ci touch_nmi_watchdog(); 31788c2ecf20Sopenharmony_ci page_count = WD_PAGE_COUNT; 31798c2ecf20Sopenharmony_ci } 31808c2ecf20Sopenharmony_ci swsusp_set_page_free(pfn_to_page(pfn + i)); 31818c2ecf20Sopenharmony_ci } 31828c2ecf20Sopenharmony_ci } 31838c2ecf20Sopenharmony_ci } 31848c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&zone->lock, flags); 31858c2ecf20Sopenharmony_ci} 31868c2ecf20Sopenharmony_ci#endif /* CONFIG_PM */ 31878c2ecf20Sopenharmony_ci 31888c2ecf20Sopenharmony_cistatic bool free_unref_page_prepare(struct page *page, unsigned long pfn) 31898c2ecf20Sopenharmony_ci{ 31908c2ecf20Sopenharmony_ci int migratetype; 31918c2ecf20Sopenharmony_ci 31928c2ecf20Sopenharmony_ci if (!free_pcp_prepare(page)) 31938c2ecf20Sopenharmony_ci return false; 31948c2ecf20Sopenharmony_ci 31958c2ecf20Sopenharmony_ci migratetype = get_pfnblock_migratetype(page, pfn); 31968c2ecf20Sopenharmony_ci set_pcppage_migratetype(page, migratetype); 31978c2ecf20Sopenharmony_ci return true; 31988c2ecf20Sopenharmony_ci} 31998c2ecf20Sopenharmony_ci 32008c2ecf20Sopenharmony_cistatic void free_unref_page_commit(struct page *page, unsigned long pfn) 32018c2ecf20Sopenharmony_ci{ 32028c2ecf20Sopenharmony_ci struct zone *zone = page_zone(page); 32038c2ecf20Sopenharmony_ci struct per_cpu_pages *pcp; 32048c2ecf20Sopenharmony_ci int migratetype; 32058c2ecf20Sopenharmony_ci 
32068c2ecf20Sopenharmony_ci migratetype = get_pcppage_migratetype(page); 32078c2ecf20Sopenharmony_ci __count_vm_event(PGFREE); 32088c2ecf20Sopenharmony_ci 32098c2ecf20Sopenharmony_ci /* 32108c2ecf20Sopenharmony_ci * We only track unmovable, reclaimable and movable on pcp lists. 32118c2ecf20Sopenharmony_ci * Free ISOLATE pages back to the allocator because they are being 32128c2ecf20Sopenharmony_ci * offlined but treat HIGHATOMIC as movable pages so we can get those 32138c2ecf20Sopenharmony_ci * areas back if necessary. Otherwise, we may have to free 32148c2ecf20Sopenharmony_ci * excessively into the page allocator 32158c2ecf20Sopenharmony_ci */ 32168c2ecf20Sopenharmony_ci if (migratetype >= MIGRATE_PCPTYPES) { 32178c2ecf20Sopenharmony_ci if (unlikely(is_migrate_isolate(migratetype))) { 32188c2ecf20Sopenharmony_ci free_one_page(zone, page, pfn, 0, migratetype, 32198c2ecf20Sopenharmony_ci FPI_NONE); 32208c2ecf20Sopenharmony_ci return; 32218c2ecf20Sopenharmony_ci } 32228c2ecf20Sopenharmony_ci migratetype = MIGRATE_MOVABLE; 32238c2ecf20Sopenharmony_ci } 32248c2ecf20Sopenharmony_ci 32258c2ecf20Sopenharmony_ci pcp = &this_cpu_ptr(zone->pageset)->pcp; 32268c2ecf20Sopenharmony_ci list_add(&page->lru, &pcp->lists[migratetype]); 32278c2ecf20Sopenharmony_ci pcp->count++; 32288c2ecf20Sopenharmony_ci if (pcp->count >= pcp->high) { 32298c2ecf20Sopenharmony_ci unsigned long batch = READ_ONCE(pcp->batch); 32308c2ecf20Sopenharmony_ci free_pcppages_bulk(zone, batch, pcp); 32318c2ecf20Sopenharmony_ci } 32328c2ecf20Sopenharmony_ci} 32338c2ecf20Sopenharmony_ci 32348c2ecf20Sopenharmony_ci/* 32358c2ecf20Sopenharmony_ci * Free a 0-order page 32368c2ecf20Sopenharmony_ci */ 32378c2ecf20Sopenharmony_civoid free_unref_page(struct page *page) 32388c2ecf20Sopenharmony_ci{ 32398c2ecf20Sopenharmony_ci unsigned long flags; 32408c2ecf20Sopenharmony_ci unsigned long pfn = page_to_pfn(page); 32418c2ecf20Sopenharmony_ci 32428c2ecf20Sopenharmony_ci if (!free_unref_page_prepare(page, pfn)) 
32438c2ecf20Sopenharmony_ci return; 32448c2ecf20Sopenharmony_ci 32458c2ecf20Sopenharmony_ci local_irq_save(flags); 32468c2ecf20Sopenharmony_ci free_unref_page_commit(page, pfn); 32478c2ecf20Sopenharmony_ci local_irq_restore(flags); 32488c2ecf20Sopenharmony_ci} 32498c2ecf20Sopenharmony_ci 32508c2ecf20Sopenharmony_ci/* 32518c2ecf20Sopenharmony_ci * Free a list of 0-order pages 32528c2ecf20Sopenharmony_ci */ 32538c2ecf20Sopenharmony_civoid free_unref_page_list(struct list_head *list) 32548c2ecf20Sopenharmony_ci{ 32558c2ecf20Sopenharmony_ci struct page *page, *next; 32568c2ecf20Sopenharmony_ci unsigned long flags, pfn; 32578c2ecf20Sopenharmony_ci int batch_count = 0; 32588c2ecf20Sopenharmony_ci 32598c2ecf20Sopenharmony_ci /* Prepare pages for freeing */ 32608c2ecf20Sopenharmony_ci list_for_each_entry_safe(page, next, list, lru) { 32618c2ecf20Sopenharmony_ci pfn = page_to_pfn(page); 32628c2ecf20Sopenharmony_ci if (!free_unref_page_prepare(page, pfn)) 32638c2ecf20Sopenharmony_ci list_del(&page->lru); 32648c2ecf20Sopenharmony_ci set_page_private(page, pfn); 32658c2ecf20Sopenharmony_ci } 32668c2ecf20Sopenharmony_ci 32678c2ecf20Sopenharmony_ci local_irq_save(flags); 32688c2ecf20Sopenharmony_ci list_for_each_entry_safe(page, next, list, lru) { 32698c2ecf20Sopenharmony_ci unsigned long pfn = page_private(page); 32708c2ecf20Sopenharmony_ci 32718c2ecf20Sopenharmony_ci set_page_private(page, 0); 32728c2ecf20Sopenharmony_ci trace_mm_page_free_batched(page); 32738c2ecf20Sopenharmony_ci free_unref_page_commit(page, pfn); 32748c2ecf20Sopenharmony_ci 32758c2ecf20Sopenharmony_ci /* 32768c2ecf20Sopenharmony_ci * Guard against excessive IRQ disabled times when we get 32778c2ecf20Sopenharmony_ci * a large list of pages to free. 
32788c2ecf20Sopenharmony_ci */ 32798c2ecf20Sopenharmony_ci if (++batch_count == SWAP_CLUSTER_MAX) { 32808c2ecf20Sopenharmony_ci local_irq_restore(flags); 32818c2ecf20Sopenharmony_ci batch_count = 0; 32828c2ecf20Sopenharmony_ci local_irq_save(flags); 32838c2ecf20Sopenharmony_ci } 32848c2ecf20Sopenharmony_ci } 32858c2ecf20Sopenharmony_ci local_irq_restore(flags); 32868c2ecf20Sopenharmony_ci} 32878c2ecf20Sopenharmony_ci 32888c2ecf20Sopenharmony_ci/* 32898c2ecf20Sopenharmony_ci * split_page takes a non-compound higher-order page, and splits it into 32908c2ecf20Sopenharmony_ci * n (1<<order) sub-pages: page[0..n] 32918c2ecf20Sopenharmony_ci * Each sub-page must be freed individually. 32928c2ecf20Sopenharmony_ci * 32938c2ecf20Sopenharmony_ci * Note: this is probably too low level an operation for use in drivers. 32948c2ecf20Sopenharmony_ci * Please consult with lkml before using this in your driver. 32958c2ecf20Sopenharmony_ci */ 32968c2ecf20Sopenharmony_civoid split_page(struct page *page, unsigned int order) 32978c2ecf20Sopenharmony_ci{ 32988c2ecf20Sopenharmony_ci int i; 32998c2ecf20Sopenharmony_ci 33008c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(PageCompound(page), page); 33018c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(!page_count(page), page); 33028c2ecf20Sopenharmony_ci 33038c2ecf20Sopenharmony_ci for (i = 1; i < (1 << order); i++) 33048c2ecf20Sopenharmony_ci set_page_refcounted(page + i); 33058c2ecf20Sopenharmony_ci split_page_owner(page, 1 << order); 33068c2ecf20Sopenharmony_ci split_page_memcg(page, 1 << order); 33078c2ecf20Sopenharmony_ci} 33088c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(split_page); 33098c2ecf20Sopenharmony_ci 33108c2ecf20Sopenharmony_ciint __isolate_free_page(struct page *page, unsigned int order) 33118c2ecf20Sopenharmony_ci{ 33128c2ecf20Sopenharmony_ci unsigned long watermark; 33138c2ecf20Sopenharmony_ci struct zone *zone; 33148c2ecf20Sopenharmony_ci int mt; 33158c2ecf20Sopenharmony_ci 33168c2ecf20Sopenharmony_ci BUG_ON(!PageBuddy(page)); 
33178c2ecf20Sopenharmony_ci 33188c2ecf20Sopenharmony_ci zone = page_zone(page); 33198c2ecf20Sopenharmony_ci mt = get_pageblock_migratetype(page); 33208c2ecf20Sopenharmony_ci 33218c2ecf20Sopenharmony_ci if (!is_migrate_isolate(mt)) { 33228c2ecf20Sopenharmony_ci /* 33238c2ecf20Sopenharmony_ci * Obey watermarks as if the page was being allocated. We can 33248c2ecf20Sopenharmony_ci * emulate a high-order watermark check with a raised order-0 33258c2ecf20Sopenharmony_ci * watermark, because we already know our high-order page 33268c2ecf20Sopenharmony_ci * exists. 33278c2ecf20Sopenharmony_ci */ 33288c2ecf20Sopenharmony_ci watermark = zone->_watermark[WMARK_MIN] + (1UL << order); 33298c2ecf20Sopenharmony_ci if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) 33308c2ecf20Sopenharmony_ci return 0; 33318c2ecf20Sopenharmony_ci 33328c2ecf20Sopenharmony_ci __mod_zone_freepage_state(zone, -(1UL << order), mt); 33338c2ecf20Sopenharmony_ci } 33348c2ecf20Sopenharmony_ci 33358c2ecf20Sopenharmony_ci /* Remove page from free list */ 33368c2ecf20Sopenharmony_ci 33378c2ecf20Sopenharmony_ci del_page_from_free_list(page, zone, order); 33388c2ecf20Sopenharmony_ci 33398c2ecf20Sopenharmony_ci /* 33408c2ecf20Sopenharmony_ci * Set the pageblock if the isolated page is at least half of a 33418c2ecf20Sopenharmony_ci * pageblock 33428c2ecf20Sopenharmony_ci */ 33438c2ecf20Sopenharmony_ci if (order >= pageblock_order - 1) { 33448c2ecf20Sopenharmony_ci struct page *endpage = page + (1 << order) - 1; 33458c2ecf20Sopenharmony_ci for (; page < endpage; page += pageblock_nr_pages) { 33468c2ecf20Sopenharmony_ci int mt = get_pageblock_migratetype(page); 33478c2ecf20Sopenharmony_ci if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) 33488c2ecf20Sopenharmony_ci && !is_migrate_highatomic(mt)) 33498c2ecf20Sopenharmony_ci set_pageblock_migratetype(page, 33508c2ecf20Sopenharmony_ci MIGRATE_MOVABLE); 33518c2ecf20Sopenharmony_ci } 33528c2ecf20Sopenharmony_ci } 33538c2ecf20Sopenharmony_ci 
33548c2ecf20Sopenharmony_ci 33558c2ecf20Sopenharmony_ci return 1UL << order; 33568c2ecf20Sopenharmony_ci} 33578c2ecf20Sopenharmony_ci 33588c2ecf20Sopenharmony_ci/** 33598c2ecf20Sopenharmony_ci * __putback_isolated_page - Return a now-isolated page back where we got it 33608c2ecf20Sopenharmony_ci * @page: Page that was isolated 33618c2ecf20Sopenharmony_ci * @order: Order of the isolated page 33628c2ecf20Sopenharmony_ci * @mt: The page's pageblock's migratetype 33638c2ecf20Sopenharmony_ci * 33648c2ecf20Sopenharmony_ci * This function is meant to return a page pulled from the free lists via 33658c2ecf20Sopenharmony_ci * __isolate_free_page back to the free lists they were pulled from. 33668c2ecf20Sopenharmony_ci */ 33678c2ecf20Sopenharmony_civoid __putback_isolated_page(struct page *page, unsigned int order, int mt) 33688c2ecf20Sopenharmony_ci{ 33698c2ecf20Sopenharmony_ci struct zone *zone = page_zone(page); 33708c2ecf20Sopenharmony_ci 33718c2ecf20Sopenharmony_ci /* zone lock should be held when this function is called */ 33728c2ecf20Sopenharmony_ci lockdep_assert_held(&zone->lock); 33738c2ecf20Sopenharmony_ci 33748c2ecf20Sopenharmony_ci /* Return isolated page to tail of freelist. */ 33758c2ecf20Sopenharmony_ci __free_one_page(page, page_to_pfn(page), zone, order, mt, 33768c2ecf20Sopenharmony_ci FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); 33778c2ecf20Sopenharmony_ci} 33788c2ecf20Sopenharmony_ci 33798c2ecf20Sopenharmony_ci/* 33808c2ecf20Sopenharmony_ci * Update NUMA hit/miss statistics 33818c2ecf20Sopenharmony_ci * 33828c2ecf20Sopenharmony_ci * Must be called with interrupts disabled. 
33838c2ecf20Sopenharmony_ci */ 33848c2ecf20Sopenharmony_cistatic inline void zone_statistics(struct zone *preferred_zone, struct zone *z) 33858c2ecf20Sopenharmony_ci{ 33868c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA 33878c2ecf20Sopenharmony_ci enum numa_stat_item local_stat = NUMA_LOCAL; 33888c2ecf20Sopenharmony_ci 33898c2ecf20Sopenharmony_ci /* skip numa counters update if numa stats is disabled */ 33908c2ecf20Sopenharmony_ci if (!static_branch_likely(&vm_numa_stat_key)) 33918c2ecf20Sopenharmony_ci return; 33928c2ecf20Sopenharmony_ci 33938c2ecf20Sopenharmony_ci if (zone_to_nid(z) != numa_node_id()) 33948c2ecf20Sopenharmony_ci local_stat = NUMA_OTHER; 33958c2ecf20Sopenharmony_ci 33968c2ecf20Sopenharmony_ci if (zone_to_nid(z) == zone_to_nid(preferred_zone)) 33978c2ecf20Sopenharmony_ci __inc_numa_state(z, NUMA_HIT); 33988c2ecf20Sopenharmony_ci else { 33998c2ecf20Sopenharmony_ci __inc_numa_state(z, NUMA_MISS); 34008c2ecf20Sopenharmony_ci __inc_numa_state(preferred_zone, NUMA_FOREIGN); 34018c2ecf20Sopenharmony_ci } 34028c2ecf20Sopenharmony_ci __inc_numa_state(z, local_stat); 34038c2ecf20Sopenharmony_ci#endif 34048c2ecf20Sopenharmony_ci} 34058c2ecf20Sopenharmony_ci 34068c2ecf20Sopenharmony_ci/* Remove page from the per-cpu list, caller must protect the list */ 34078c2ecf20Sopenharmony_cistatic struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, 34088c2ecf20Sopenharmony_ci unsigned int alloc_flags, 34098c2ecf20Sopenharmony_ci struct per_cpu_pages *pcp, 34108c2ecf20Sopenharmony_ci struct list_head *list) 34118c2ecf20Sopenharmony_ci{ 34128c2ecf20Sopenharmony_ci struct page *page; 34138c2ecf20Sopenharmony_ci 34148c2ecf20Sopenharmony_ci do { 34158c2ecf20Sopenharmony_ci if (list_empty(list)) { 34168c2ecf20Sopenharmony_ci pcp->count += rmqueue_bulk(zone, 0, 34178c2ecf20Sopenharmony_ci pcp->batch, list, 34188c2ecf20Sopenharmony_ci migratetype, alloc_flags); 34198c2ecf20Sopenharmony_ci if (unlikely(list_empty(list))) 34208c2ecf20Sopenharmony_ci return NULL; 
34218c2ecf20Sopenharmony_ci } 34228c2ecf20Sopenharmony_ci 34238c2ecf20Sopenharmony_ci page = list_first_entry(list, struct page, lru); 34248c2ecf20Sopenharmony_ci list_del(&page->lru); 34258c2ecf20Sopenharmony_ci pcp->count--; 34268c2ecf20Sopenharmony_ci } while (check_new_pcp(page)); 34278c2ecf20Sopenharmony_ci 34288c2ecf20Sopenharmony_ci return page; 34298c2ecf20Sopenharmony_ci} 34308c2ecf20Sopenharmony_ci 34318c2ecf20Sopenharmony_ci/* Lock and remove page from the per-cpu list */ 34328c2ecf20Sopenharmony_cistatic struct page *rmqueue_pcplist(struct zone *preferred_zone, 34338c2ecf20Sopenharmony_ci struct zone *zone, gfp_t gfp_flags, 34348c2ecf20Sopenharmony_ci int migratetype, unsigned int alloc_flags) 34358c2ecf20Sopenharmony_ci{ 34368c2ecf20Sopenharmony_ci struct per_cpu_pages *pcp; 34378c2ecf20Sopenharmony_ci struct list_head *list; 34388c2ecf20Sopenharmony_ci struct page *page; 34398c2ecf20Sopenharmony_ci unsigned long flags; 34408c2ecf20Sopenharmony_ci 34418c2ecf20Sopenharmony_ci local_irq_save(flags); 34428c2ecf20Sopenharmony_ci pcp = &this_cpu_ptr(zone->pageset)->pcp; 34438c2ecf20Sopenharmony_ci list = &pcp->lists[migratetype]; 34448c2ecf20Sopenharmony_ci page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); 34458c2ecf20Sopenharmony_ci if (page) { 34468c2ecf20Sopenharmony_ci __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); 34478c2ecf20Sopenharmony_ci zone_statistics(preferred_zone, zone); 34488c2ecf20Sopenharmony_ci } 34498c2ecf20Sopenharmony_ci local_irq_restore(flags); 34508c2ecf20Sopenharmony_ci return page; 34518c2ecf20Sopenharmony_ci} 34528c2ecf20Sopenharmony_ci 34538c2ecf20Sopenharmony_ci/* 34548c2ecf20Sopenharmony_ci * Allocate a page from the given zone. Use pcplists for order-0 allocations. 
34558c2ecf20Sopenharmony_ci */ 34568c2ecf20Sopenharmony_cistatic inline 34578c2ecf20Sopenharmony_cistruct page *rmqueue(struct zone *preferred_zone, 34588c2ecf20Sopenharmony_ci struct zone *zone, unsigned int order, 34598c2ecf20Sopenharmony_ci gfp_t gfp_flags, unsigned int alloc_flags, 34608c2ecf20Sopenharmony_ci int migratetype) 34618c2ecf20Sopenharmony_ci{ 34628c2ecf20Sopenharmony_ci unsigned long flags; 34638c2ecf20Sopenharmony_ci struct page *page; 34648c2ecf20Sopenharmony_ci 34658c2ecf20Sopenharmony_ci if (likely(order == 0)) { 34668c2ecf20Sopenharmony_ci /* 34678c2ecf20Sopenharmony_ci * MIGRATE_MOVABLE pcplist could have the pages on CMA area and 34688c2ecf20Sopenharmony_ci * we need to skip it when CMA area isn't allowed. 34698c2ecf20Sopenharmony_ci */ 34708c2ecf20Sopenharmony_ci if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || 34718c2ecf20Sopenharmony_ci migratetype != MIGRATE_MOVABLE || 34728c2ecf20Sopenharmony_ci IS_ENABLED(CONFIG_CMA_REUSE)) { 34738c2ecf20Sopenharmony_ci page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, 34748c2ecf20Sopenharmony_ci migratetype, alloc_flags); 34758c2ecf20Sopenharmony_ci goto out; 34768c2ecf20Sopenharmony_ci } 34778c2ecf20Sopenharmony_ci } 34788c2ecf20Sopenharmony_ci 34798c2ecf20Sopenharmony_ci /* 34808c2ecf20Sopenharmony_ci * We most definitely don't want callers attempting to 34818c2ecf20Sopenharmony_ci * allocate greater than order-1 page units with __GFP_NOFAIL. 34828c2ecf20Sopenharmony_ci */ 34838c2ecf20Sopenharmony_ci WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); 34848c2ecf20Sopenharmony_ci spin_lock_irqsave(&zone->lock, flags); 34858c2ecf20Sopenharmony_ci 34868c2ecf20Sopenharmony_ci do { 34878c2ecf20Sopenharmony_ci page = NULL; 34888c2ecf20Sopenharmony_ci /* 34898c2ecf20Sopenharmony_ci * order-0 request can reach here when the pcplist is skipped 34908c2ecf20Sopenharmony_ci * due to non-CMA allocation context. 
HIGHATOMIC area is 34918c2ecf20Sopenharmony_ci * reserved for high-order atomic allocation, so order-0 34928c2ecf20Sopenharmony_ci * request should skip it. 34938c2ecf20Sopenharmony_ci */ 34948c2ecf20Sopenharmony_ci if (order > 0 && alloc_flags & ALLOC_HARDER) { 34958c2ecf20Sopenharmony_ci page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 34968c2ecf20Sopenharmony_ci if (page) 34978c2ecf20Sopenharmony_ci trace_mm_page_alloc_zone_locked(page, order, migratetype); 34988c2ecf20Sopenharmony_ci } 34998c2ecf20Sopenharmony_ci if (!page) 35008c2ecf20Sopenharmony_ci page = __rmqueue(zone, order, migratetype, alloc_flags); 35018c2ecf20Sopenharmony_ci } while (page && check_new_pages(page, order)); 35028c2ecf20Sopenharmony_ci spin_unlock(&zone->lock); 35038c2ecf20Sopenharmony_ci if (!page) 35048c2ecf20Sopenharmony_ci goto failed; 35058c2ecf20Sopenharmony_ci __mod_zone_freepage_state(zone, -(1 << order), 35068c2ecf20Sopenharmony_ci get_pcppage_migratetype(page)); 35078c2ecf20Sopenharmony_ci 35088c2ecf20Sopenharmony_ci __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 35098c2ecf20Sopenharmony_ci zone_statistics(preferred_zone, zone); 35108c2ecf20Sopenharmony_ci local_irq_restore(flags); 35118c2ecf20Sopenharmony_ci 35128c2ecf20Sopenharmony_ciout: 35138c2ecf20Sopenharmony_ci /* Separate test+clear to avoid unnecessary atomics */ 35148c2ecf20Sopenharmony_ci if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { 35158c2ecf20Sopenharmony_ci clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); 35168c2ecf20Sopenharmony_ci wakeup_kswapd(zone, 0, 0, zone_idx(zone)); 35178c2ecf20Sopenharmony_ci } 35188c2ecf20Sopenharmony_ci 35198c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(page && bad_range(zone, page), page); 35208c2ecf20Sopenharmony_ci return page; 35218c2ecf20Sopenharmony_ci 35228c2ecf20Sopenharmony_cifailed: 35238c2ecf20Sopenharmony_ci local_irq_restore(flags); 35248c2ecf20Sopenharmony_ci return NULL; 35258c2ecf20Sopenharmony_ci} 35268c2ecf20Sopenharmony_ci 
#ifdef CONFIG_FAIL_PAGE_ALLOC

/*
 * Fault-injection configuration for page allocation failures
 * (see Documentation/fault-injection). Tunable at boot via
 * "fail_page_alloc=" and at runtime through debugfs below.
 */
static struct {
	struct fault_attr attr;

	bool ignore_gfp_highmem;	/* don't fail __GFP_HIGHMEM allocations */
	bool ignore_gfp_reclaim;	/* don't fail allocations that may reclaim */
	u32 min_order;			/* only fail allocations of at least this order */
} fail_page_alloc = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_gfp_reclaim = true,
	.ignore_gfp_highmem = true,
	.min_order = 1,
};

/* Parse the "fail_page_alloc=" boot parameter into fail_page_alloc.attr. */
static int __init setup_fail_page_alloc(char *str)
{
	return setup_fault_attr(&fail_page_alloc.attr, str);
}
__setup("fail_page_alloc=", setup_fail_page_alloc);

/*
 * Decide whether this allocation should be artificially failed.
 * Requests below min_order, __GFP_NOFAIL requests, and (per the
 * ignore_* knobs) highmem/reclaimable requests are exempt.
 */
static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	if (order < fail_page_alloc.min_order)
		return false;
	if (gfp_mask & __GFP_NOFAIL)
		return false;
	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
		return false;
	if (fail_page_alloc.ignore_gfp_reclaim &&
			(gfp_mask & __GFP_DIRECT_RECLAIM))
		return false;

	return should_fail(&fail_page_alloc.attr, 1 << order);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

/* Expose the fail_page_alloc knobs under debugfs. */
static int __init fail_page_alloc_debugfs(void)
{
	umode_t mode = S_IFREG | 0600;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
					&fail_page_alloc.attr);

	debugfs_create_bool("ignore-gfp-wait", mode, dir,
			    &fail_page_alloc.ignore_gfp_reclaim);
	debugfs_create_bool("ignore-gfp-highmem", mode, dir,
			    &fail_page_alloc.ignore_gfp_highmem);
	debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);

	return 0;
}

late_initcall(fail_page_alloc_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#else /* CONFIG_FAIL_PAGE_ALLOC */

/* Fault injection disabled: never fail artificially. */
static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	return false;
}

#endif /* CONFIG_FAIL_PAGE_ALLOC */

/*
 * noinline wrapper so the symbol exists as an error-injection point
 * (see ALLOW_ERROR_INJECTION below) regardless of inlining decisions.
 */
noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	return __should_fail_alloc_page(gfp_mask, order);
}
ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);

/*
 * Return the number of free pages that are NOT usable for a request of
 * @order with @alloc_flags: the pages needed to build one order-@order
 * block minus one, plus (when the caller may not dip into them) the
 * high-atomic reserves and the free CMA pages.
 */
static inline long __zone_watermark_unusable_free(struct zone *z,
				unsigned int order, unsigned int alloc_flags)
{
	const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
	long unusable_free = (1 << order) - 1;

	/*
	 * If the caller does not have rights to ALLOC_HARDER then subtract
	 * the high-atomic reserves. This will over-estimate the size of the
	 * atomic reserve but it avoids a search.
	 */
	if (likely(!alloc_harder))
		unusable_free += z->nr_reserved_highatomic;

#ifdef CONFIG_CMA
	/* If allocation can't use CMA areas don't use free CMA pages */
	if (!(alloc_flags & ALLOC_CMA))
		unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
#endif

	return unusable_free;
}

/*
 * Return true if free base pages are above 'mark'. For high-order checks it
 * will return true if the order-0 watermark is reached and there is at least
 * one free page of a suitable size. Checking now avoids taking the zone lock
 * to check in the allocation paths if no pages are free.
 */
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
			 int highest_zoneidx, unsigned int alloc_flags,
			 long free_pages)
{
	long min = mark;
	int o;
	const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));

	/* free_pages may go negative - that's OK */
	free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);

	/* __GFP_HIGH callers may consume down to half the watermark. */
	if (alloc_flags & ALLOC_HIGH)
		min -= min / 2;

	if (unlikely(alloc_harder)) {
		/*
		 * OOM victims can try even harder than normal ALLOC_HARDER
		 * users on the grounds that it's definitely going to be in
		 * the exit path shortly and free memory. Any allocation it
		 * makes during the free path will be small and short-lived.
		 */
		if (alloc_flags & ALLOC_OOM)
			min -= min / 2;
		else
			min -= min / 4;
	}

	/*
	 * Check watermarks for an order-0 allocation request. If these
	 * are not met, then a high-order request also cannot go ahead
	 * even if a suitable page happened to be free.
	 */
	if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
		return false;

	/* If this is an order-0 request then the watermark is fine */
	if (!order)
		return true;

	/* For a high-order request, check at least one suitable page is free */
	for (o = order; o < MAX_ORDER; o++) {
		struct free_area *area = &z->free_area[o];
		int mt;

		if (!area->nr_free)
			continue;

		/* Any free block in the regular pcp migratetypes will do. */
		for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
			if (!free_area_empty(area, mt))
				return true;
		}

#ifdef CONFIG_CMA
		/* CMA blocks only count if this request may use CMA. */
		if ((alloc_flags & ALLOC_CMA) &&
		    !free_area_empty(area, MIGRATE_CMA)) {
			return true;
		}
#endif
		/* Harder allocations may also take from the high-atomic reserve. */
		if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
			return true;
	}
	return false;
}

/*
 * Convenience wrapper around __zone_watermark_ok() using the zone's
 * current NR_FREE_PAGES counter.
 */
bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
		      int highest_zoneidx, unsigned int alloc_flags)
{
	return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
								zone_page_state(z, NR_FREE_PAGES));
}

/*
 * Fast-path watermark check used by get_page_from_freelist(). Tries a
 * cheap order-0 shortcut before falling back to the full
 * __zone_watermark_ok() calculation.
 */
static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
				unsigned long mark, int highest_zoneidx,
				unsigned int alloc_flags, gfp_t gfp_mask)
{
	long free_pages;

	free_pages = zone_page_state(z, NR_FREE_PAGES);

	/*
	 * Fast check for order-0 only. If this fails then the reserves
	 * need to be calculated.
	 */
	if (!order) {
		long usable_free;
		long reserved;

		usable_free = free_pages;
		reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);

		/* reserved may over estimate high-atomic reserves. */
		usable_free -= min(usable_free, reserved);
		if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
			return true;
	}

	if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
					free_pages))
		return true;
	/*
	 * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
	 * when checking the min watermark. The min watermark is the
	 * point where boosting is ignored so that kswapd is woken up
	 * when below the low watermark.
	 */
	if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
		&& ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
		mark = z->_watermark[WMARK_MIN];
		return __zone_watermark_ok(z, order, mark, highest_zoneidx,
					alloc_flags, free_pages);
	}

	return false;
}

/*
 * Watermark check that compensates for per-cpu counter drift: when the
 * raw NR_FREE_PAGES is below percpu_drift_mark, take the more expensive
 * exact snapshot before deciding.
 */
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
			unsigned long mark, int highest_zoneidx)
{
	long free_pages = zone_page_state(z, NR_FREE_PAGES);

	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);

	return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
								free_pages);
}

#ifdef CONFIG_NUMA
/*
 * Only reclaim from @zone on behalf of @local_zone if the two nodes are
 * within node_reclaim_distance of each other.
 */
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
				node_reclaim_distance;
}
#else	/* CONFIG_NUMA */
/* Single node: reclaim is always allowed. */
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
	return true;
}
#endif	/* CONFIG_NUMA */

/*
 * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
 * fragmentation is subtle. If the preferred zone was HIGHMEM then
 * premature use of a lower zone may cause lowmem pressure problems that
 * are worse than fragmentation. If the next zone is ZONE_DMA then it is
 * probably too small. It only makes sense to spread allocations to avoid
 * fragmentation between the Normal and DMA32 zones.
 */
static inline unsigned int
alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
{
	unsigned int alloc_flags;

	/*
	 * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
	 * to save a branch.
	 */
	alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);

#ifdef CONFIG_ZONE_DMA32
	if (!zone)
		return alloc_flags;

	/* ALLOC_NOFRAGMENT only makes sense for ZONE_NORMAL requests. */
	if (zone_idx(zone) != ZONE_NORMAL)
		return alloc_flags;

	/*
	 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
	 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
	 * on UMA that if Normal is populated then so is DMA32.
	 */
	BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
	if (nr_online_nodes > 1 && !populated_zone(--zone))
		return alloc_flags;

	alloc_flags |= ALLOC_NOFRAGMENT;
#endif /* CONFIG_ZONE_DMA32 */
	return alloc_flags;
}

/*
 * Add ALLOC_CMA to @alloc_flags when the current task context and the
 * request's migratetype permit allocating from CMA areas.
 */
static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
					unsigned int alloc_flags)
{
#ifdef CONFIG_CMA
	unsigned int pflags = current->flags;

	if (!(pflags & PF_MEMALLOC_NOCMA) &&
	    gfp_migratetype(gfp_mask) == get_cma_migratetype())
		alloc_flags |= ALLOC_CMA;

#endif
	return alloc_flags;
}

/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
						const struct alloc_context *ac)
{
	struct zoneref *z;
	struct zone *zone;
	struct pglist_data *last_pgdat_dirty_limit = NULL;
	bool no_fallback;

retry:
	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
	 */
	no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
	z = ac->preferred_zoneref;
	for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
					ac->nodemask) {
		struct page *page;
		unsigned long mark;

		/* Skip zones this task's cpuset does not allow. */
		if (cpusets_enabled() &&
			(alloc_flags & ALLOC_CPUSET) &&
			!__cpuset_zone_allowed(zone, gfp_mask))
				continue;
		/*
		 * When allocating a page cache page for writing, we
		 * want to get it from a node that is within its dirty
		 * limit, such that no single node holds more than its
		 * proportional share of globally allowed dirty pages.
		 * The dirty limits take into account the node's
		 * lowmem reserves and high watermark so that kswapd
		 * should be able to balance it without having to
		 * write pages from its LRU list.
		 *
		 * XXX: For now, allow allocations to potentially
		 * exceed the per-node dirty limit in the slowpath
		 * (spread_dirty_pages unset) before going into reclaim,
		 * which is important when on a NUMA setup the allowed
		 * nodes are together not big enough to reach the
		 * global limit. The proper fix for these situations
		 * will require awareness of nodes in the
		 * dirty-throttling and the flusher threads.
		 */
		if (ac->spread_dirty_pages) {
			if (last_pgdat_dirty_limit == zone->zone_pgdat)
				continue;

			if (!node_dirty_ok(zone->zone_pgdat)) {
				last_pgdat_dirty_limit = zone->zone_pgdat;
				continue;
			}
		}

		if (no_fallback && nr_online_nodes > 1 &&
		    zone != ac->preferred_zoneref->zone) {
			int local_nid;

			/*
			 * If moving to a remote node, retry but allow
			 * fragmenting fallbacks. Locality is more important
			 * than fragmentation avoidance.
			 */
			local_nid = zone_to_nid(ac->preferred_zoneref->zone);
			if (zone_to_nid(zone) != local_nid) {
				alloc_flags &= ~ALLOC_NOFRAGMENT;
				goto retry;
			}
		}

		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
		if (!zone_watermark_fast(zone, order, mark,
				       ac->highest_zoneidx, alloc_flags,
				       gfp_mask)) {
			int ret;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
			/*
			 * Watermark failed for this zone, but see if we can
			 * grow this zone if it contains deferred pages.
			 */
			if (static_branch_unlikely(&deferred_pages)) {
				if (_deferred_grow_zone(zone, order))
					goto try_this_zone;
			}
#endif
			/* Checked here to keep the fast path fast */
			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
			if (alloc_flags & ALLOC_NO_WATERMARKS)
				goto try_this_zone;

			if (node_reclaim_mode == 0 ||
			    !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
				continue;

			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
			switch (ret) {
			case NODE_RECLAIM_NOSCAN:
				/* did not scan */
				continue;
			case NODE_RECLAIM_FULL:
				/* scanned but unreclaimable */
				continue;
			default:
				/* did we reclaim enough */
				if (zone_watermark_ok(zone, order, mark,
					ac->highest_zoneidx, alloc_flags))
					goto try_this_zone;

				continue;
			}
		}

try_this_zone:
		page = rmqueue(ac->preferred_zoneref->zone, zone, order,
				gfp_mask, alloc_flags, ac->migratetype);
		if (page) {
			prep_new_page(page, order, gfp_mask, alloc_flags);

			/*
			 * If this is a high-order atomic allocation then check
			 * if the pageblock should be reserved for the future
			 */
			if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
				reserve_highatomic_pageblock(page, zone, order);

			return page;
		} else {
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
			/* Try again if zone has deferred pages */
			if (static_branch_unlikely(&deferred_pages)) {
				if (_deferred_grow_zone(zone, order))
					goto try_this_zone;
			}
#endif
		}
	}

	/*
	 * It's possible on a UMA machine to get through all zones that are
	 * fragmented. If avoiding fragmentation, reset and try again.
	 */
	if (no_fallback) {
		alloc_flags &= ~ALLOC_NOFRAGMENT;
		goto retry;
	}

	return NULL;
}

/* Dump memory state after a failed allocation, honouring node filtering. */
static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
{
	unsigned int filter = SHOW_MEM_FILTER_NODES;

	/*
	 * This documents exceptions given to allocations in certain
	 * contexts that are allowed to allocate outside current's set
	 * of allowed nodes.
	 */
	if (!(gfp_mask & __GFP_NOMEMALLOC))
		if (tsk_is_oom_victim(current) ||
		    (current->flags & (PF_MEMALLOC | PF_EXITING)))
			filter &= ~SHOW_MEM_FILTER_NODES;
	if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
		filter &= ~SHOW_MEM_FILTER_NODES;

	show_mem(filter, nodemask);
}

/*
 * Rate-limited warning emitted when a page allocation fails; prints the
 * caller-supplied message, the gfp mask/nodemask, a stack trace and a
 * memory dump.
 */
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;
	static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);

	/*
	 * Stay silent for __GFP_NOWARN, when rate-limited, or for DMA-zone
	 * requests on systems with no managed DMA memory.
	 */
	if ((gfp_mask & __GFP_NOWARN) ||
	     !__ratelimit(&nopage_rs) ||
	     ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
		return;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
			current->comm, &vaf, gfp_mask, &gfp_mask,
			nodemask_pr_args(nodemask));
	va_end(args);

	cpuset_print_current_mems_allowed();
	pr_cont("\n");
	dump_stack();
	warn_alloc_show_mem(gfp_mask, nodemask);
}

/*
 * Try the freelists with the cpuset restriction applied first; if the
 * allowed nodes are depleted, retry ignoring the cpuset.
 */
static inline struct page *
__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
			      unsigned int alloc_flags,
			      const struct alloc_context *ac)
{
	struct page *page;

	page = get_page_from_freelist(gfp_mask, order,
			alloc_flags|ALLOC_CPUSET, ac);
	/*
	 * fallback to ignore cpuset restriction if our nodes
	 * are depleted
	 */
	if (!page)
		page = get_page_from_freelist(gfp_mask, order,
				alloc_flags, ac);

	return page;
}

/*
 * Last-resort allocation path: invoke the OOM killer (under oom_lock)
 * and retry. *did_some_progress is set when the caller should retry the
 * allocation. Returns a page on success, NULL otherwise.
 */
static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
	const struct alloc_context *ac, unsigned long *did_some_progress)
{
	struct oom_control oc = {
		.zonelist = ac->zonelist,
		.nodemask = ac->nodemask,
		.memcg = NULL,
		.gfp_mask = gfp_mask,
		.order = order,
	};
	struct page *page;

	*did_some_progress = 0;

	/*
	 * Acquire the oom lock. If that fails, somebody else is
	 * making progress for us.
	 */
	if (!mutex_trylock(&oom_lock)) {
		*did_some_progress = 1;
		schedule_timeout_uninterruptible(1);
		return NULL;
	}

	/*
	 * Go through the zonelist yet one more time, keep very high watermark
	 * here, this is only to catch a parallel oom killing, we must fail if
	 * we're still under heavy pressure. But make sure that this reclaim
	 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
	 * allocation which will never fail due to oom_lock already held.
	 */
	page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
				      ~__GFP_DIRECT_RECLAIM, order,
				      ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
	if (page)
		goto out;

	/* Coredumps can quickly deplete all memory reserves */
	if (current->flags & PF_DUMPCORE)
		goto out;
	/* The OOM killer will not help higher order allocs */
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		goto out;
	/*
	 * We have already exhausted all our reclaim opportunities without any
	 * success so it is time to admit defeat. We will skip the OOM killer
	 * because it is very likely that the caller has a more reasonable
	 * fallback than shooting a random task.
	 *
	 * The OOM killer may not free memory on a specific node.
	 */
	if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
		goto out;
	/* The OOM killer does not needlessly kill tasks for lowmem */
	if (ac->highest_zoneidx < ZONE_NORMAL)
		goto out;
	if (pm_suspended_storage())
		goto out;
	/*
	 * XXX: GFP_NOFS allocations should rather fail than rely on
	 * other request to make a forward progress.
	 * We are in an unfortunate situation where out_of_memory cannot
	 * do much for this context but let's try it to at least get
	 * access to memory reserved if the current task is killed (see
	 * out_of_memory). Once filesystems are ready to handle allocation
	 * failures more gracefully we should just bail out here.
	 */

	/* Exhausted what can be done so it's blame time */
	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
		*did_some_progress = 1;

		/*
		 * Help non-failing allocations by giving them access to memory
		 * reserves
		 */
		if (gfp_mask & __GFP_NOFAIL)
			page = __alloc_pages_cpuset_fallback(gfp_mask, order,
					ALLOC_NO_WATERMARKS, ac);
	}
out:
	mutex_unlock(&oom_lock);
	return page;
}

/*
 * Maximum number of compaction retries with progress before the OOM
 * killer is considered as the only way to move forward.
 */
#define MAX_COMPACT_RETRIES 16

#ifdef CONFIG_COMPACTION
/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		enum compact_priority prio, enum compact_result *compact_result)
{
	struct page *page = NULL;
	unsigned long pflags;
	unsigned int noreclaim_flag;

	/* Order-0 requests gain nothing from compaction */
	if (!order)
		return NULL;

	/* Account the stall to PSI and forbid nested reclaim while compacting */
	psi_memstall_enter(&pflags);
	noreclaim_flag = memalloc_noreclaim_save();

	*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
								prio, &page);

	memalloc_noreclaim_restore(noreclaim_flag);
	psi_memstall_leave(&pflags);

	/*
	 * At least in one zone compaction wasn't deferred or skipped, so let's
	 * count a compaction stall
	 */
	count_vm_event(COMPACTSTALL);

	/* Prep a captured page if available */
	if (page)
		prep_new_page(page, order, gfp_mask, alloc_flags);

	/* Try get a page from the freelist if available */
	if (!page)
		page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);

	if (page) {
		struct zone *zone = page_zone(page);

		/* Success resets the zone's compaction deferral state */
		zone->compact_blockskip_flush = false;
		compaction_defer_reset(zone, order, true);
		count_vm_event(COMPACTSUCCESS);
		return page;
	}

	/*
	 * It's bad if compaction run occurs and fails. The most likely reason
	 * is that pages exist, but not enough to satisfy watermarks.
	 */
	count_vm_event(COMPACTFAIL);

	cond_resched();

	return NULL;
}

/*
 * Decide whether another compaction attempt is worthwhile for this
 * allocation, updating *compaction_retries and possibly raising the
 * priority in *compact_priority. Returns true to retry, false to let the
 * caller move towards the OOM path.
 */
static inline bool
should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
		     enum compact_result compact_result,
		     enum compact_priority *compact_priority,
		     int *compaction_retries)
{
	int max_retries = MAX_COMPACT_RETRIES;
	int min_priority;
	bool ret = false;
	/* snapshot in/out state for the tracepoint at "out:" */
	int retries = *compaction_retries;
	enum compact_priority priority = *compact_priority;

	if (!order)
		return false;

	if (compaction_made_progress(compact_result))
		(*compaction_retries)++;

	/*
	 * compaction considers all the zones as desperately out of memory
	 * so it doesn't really make much sense to retry except when the
	 * failure could be caused by insufficient priority
	 */
	if (compaction_failed(compact_result))
		goto check_priority;

	/*
	 * compaction was skipped because there are not enough order-0 pages
	 * to work with, so we retry only if it looks like reclaim can help.
	 */
	if (compaction_needs_reclaim(compact_result)) {
		ret = compaction_zonelist_suitable(ac, order, alloc_flags);
		goto out;
	}

	/*
	 * make sure the compaction wasn't deferred or didn't bail out early
	 * due to locks contention before we declare that we should give up.
	 * But the next retry should use a higher priority if allowed, so
	 * we don't just keep bailing out endlessly.
	 */
	if (compaction_withdrawn(compact_result)) {
		goto check_priority;
	}

	/*
	 * !costly requests are much more important than __GFP_RETRY_MAYFAIL
	 * costly ones because they are de facto nofail and invoke OOM
	 * killer to move on while costly can fail and users are ready
	 * to cope with that. 1/4 retries is rather arbitrary but we
	 * would need much more detailed feedback from compaction to
	 * make a better decision.
	 */
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		max_retries /= 4;
	if (*compaction_retries <= max_retries) {
		ret = true;
		goto out;
	}

	/*
	 * Make sure there are attempts at the highest priority if we exhausted
	 * all retries or failed at the lower priorities.
	 */
check_priority:
	min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
			MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;

	if (*compact_priority > min_priority) {
		/* Escalate priority and restart the retry budget */
		(*compact_priority)--;
		*compaction_retries = 0;
		ret = true;
	}
out:
	trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
	return ret;
}
#else
/* !CONFIG_COMPACTION: compaction is never attempted, report it as skipped */
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		enum compact_priority prio, enum compact_result *compact_result)
{
	*compact_result = COMPACT_SKIPPED;
	return NULL;
}

/*
 * !CONFIG_COMPACTION fallback: keep retrying small high-order requests as
 * long as some eligible zone still passes its order-0 min watermark, since
 * plain reclaim is the only tool available.
 */
static inline bool
should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
		     enum compact_result compact_result,
		     enum compact_priority *compact_priority,
		     int *compaction_retries)
{
	struct zone *zone;
	struct zoneref *z;

	/* order-0 never needs compaction; costly orders are allowed to fail */
	if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
		return false;

	/*
	 * There are setups with compaction disabled which would prefer to loop
	 * inside the allocator rather than hit the oom killer prematurely.
	 * Let's give them a good hope and keep retrying while the order-0
	 * watermarks are OK.
	 */
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
				ac->highest_zoneidx, ac->nodemask) {
		if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
					ac->highest_zoneidx, alloc_flags))
			return true;
	}
	return false;
}
#endif /* CONFIG_COMPACTION */

#ifdef CONFIG_LOCKDEP
/* Pseudo-lock used to teach lockdep about fs-reclaim recursion deadlocks */
static struct lockdep_map __fs_reclaim_map =
	STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);

/*
 * Should this allocation be tracked by the fs_reclaim lockdep map?
 * Only sleeping __GFP_FS allocations from contexts that may actually
 * enter reclaim are interesting.
 */
static bool __need_fs_reclaim(gfp_t gfp_mask)
{
	/* apply the task's scoped gfp restrictions (e.g. memalloc_nofs) */
	gfp_mask = current_gfp_context(gfp_mask);

	/* no reclaim without waiting on it */
	if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
		return false;

	/* this guy won't enter reclaim */
	if (current->flags & PF_MEMALLOC)
		return false;

	/* We're only interested __GFP_FS allocations for now */
	if (!(gfp_mask & __GFP_FS))
		return false;

	/* caller explicitly opted out of lockdep tracking */
	if (gfp_mask & __GFP_NOLOCKDEP)
		return false;

	return true;
}

void __fs_reclaim_acquire(void)
{
	lock_map_acquire(&__fs_reclaim_map);
}

void __fs_reclaim_release(void)
{
	lock_map_release(&__fs_reclaim_map);
}

/* Conditionally take the fs_reclaim lockdep annotation for this gfp mask */
void fs_reclaim_acquire(gfp_t gfp_mask)
{
	if (__need_fs_reclaim(gfp_mask))
		__fs_reclaim_acquire();
}
EXPORT_SYMBOL_GPL(fs_reclaim_acquire);

/* Release counterpart of fs_reclaim_acquire(); must use the same gfp mask */
void fs_reclaim_release(gfp_t gfp_mask)
{
	if (__need_fs_reclaim(gfp_mask))
		__fs_reclaim_release();
}
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif

/*
 * Zonelists may change due to hotplug during allocation. Detect when zonelists
 * have been rebuilt so allocation retries. Reader side does not lock and
 * retries the allocation if zonelist changes. Writer side is protected by the
 * embedded spin_lock.
 */
static DEFINE_SEQLOCK(zonelist_update_seq);

/* Snapshot the zonelist seqcount before an allocation attempt */
static unsigned int zonelist_iter_begin(void)
{
	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
		return read_seqbegin(&zonelist_update_seq);

	/* zonelists never shrink without hot-remove, nothing to detect */
	return 0;
}

/* Nonzero when the zonelist changed since zonelist_iter_begin() -> retry */
static unsigned int check_retry_zonelist(unsigned int seq)
{
	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
		return read_seqretry(&zonelist_update_seq, seq);

	return seq;
}

/* Perform direct synchronous page reclaim */
static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
		  const struct alloc_context *ac)
{
	unsigned int noreclaim_flag;
	unsigned long pflags, progress;

	cond_resched();

	/* We now go into synchronous reclaim */
	cpuset_memory_pressure_bump();
	/* PSI accounting and lockdep/recursion guards bracket the reclaim */
	psi_memstall_enter(&pflags);
	fs_reclaim_acquire(gfp_mask);
	noreclaim_flag = memalloc_noreclaim_save();

	progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
								ac->nodemask);

	/* release in reverse order of acquisition */
	memalloc_noreclaim_restore(noreclaim_flag);
	fs_reclaim_release(gfp_mask);
	psi_memstall_leave(&pflags);

	cond_resched();

	/* number of pages reclaimed, as reported by try_to_free_pages() */
	return progress;
}

/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		unsigned long *did_some_progress)
{
	struct page *page = NULL;
	bool drained = false;

	*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
	if (unlikely(!(*did_some_progress)))
		return NULL;

retry:
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);

	/*
	 * If an allocation failed after direct reclaim, it could be because
	 * pages are pinned on the per-cpu lists or in high alloc reserves.
	 * Shrink them and try again
	 */
	if (!page && !drained) {
		unreserve_highatomic_pageblock(ac, false);
#ifdef CONFIG_RECLAIM_ACCT
		reclaimacct_substage_start(RA_DRAINALLPAGES);
#endif
		drain_all_pages(NULL);
#ifdef CONFIG_RECLAIM_ACCT
		reclaimacct_substage_end(RA_DRAINALLPAGES, 0, NULL);
#endif
		/* drain only once per reclaim round */
		drained = true;
		goto retry;
	}

	return page;
}

/* Wake kswapd on every node covered by the allocation's zonelist (once each) */
static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
			     const struct alloc_context *ac)
{
	struct zoneref *z;
	struct zone *zone;
	pg_data_t *last_pgdat = NULL;
	enum zone_type highest_zoneidx = ac->highest_zoneidx;

	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
					ac->nodemask) {
		/* zones of one node are adjacent; skip repeat wakeups */
		if (last_pgdat != zone->zone_pgdat)
			wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
		last_pgdat = zone->zone_pgdat;
	}
}

/* Translate the gfp mask into internal ALLOC_* flags for the slow path */
static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
	unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

	/*
	 * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
	 * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
	 * to save two branches.
	 */
	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
	BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);

	/*
	 * The caller may dip into page reserves a bit more if the caller
	 * cannot run direct reclaim, or if the caller has realtime scheduling
	 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
	 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
	 */
	alloc_flags |= (__force int)
		(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));

	if (gfp_mask & __GFP_ATOMIC) {
		/*
		 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
		 * if it can't schedule.
		 */
		if (!(gfp_mask & __GFP_NOMEMALLOC))
			alloc_flags |= ALLOC_HARDER;
		/*
		 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
		 * comment for __cpuset_node_allowed().
		 */
		alloc_flags &= ~ALLOC_CPUSET;
	} else if (unlikely(rt_task(current)) && !in_interrupt())
		alloc_flags |= ALLOC_HARDER;

	/* let the current context (e.g. CMA restrictions) adjust the flags */
	alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);

	return alloc_flags;
}

/* May this OOM-victim task dip into memory reserves? */
static bool oom_reserves_allowed(struct task_struct *tsk)
{
	if (!tsk_is_oom_victim(tsk))
		return false;

	/*
	 * !MMU doesn't have oom reaper so give access to memory reserves
	 * only to the thread with TIF_MEMDIE set
	 */
	if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
		return false;

	return true;
}

/*
 * Distinguish requests which really need access to full memory
 * reserves from oom victims which can live with a portion of it
 */
static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
{
	/* explicit opt-out wins over everything else */
	if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
		return 0;
	if (gfp_mask & __GFP_MEMALLOC)
		return ALLOC_NO_WATERMARKS;
	/* softirq running on behalf of a PF_MEMALLOC task */
	if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
		return ALLOC_NO_WATERMARKS;
	if (!in_interrupt()) {
		if (current->flags & PF_MEMALLOC)
			return ALLOC_NO_WATERMARKS;
		else if (oom_reserves_allowed(current))
			/* OOM victims get partial, not full, reserve access */
			return ALLOC_OOM;
	}

	return 0;
}

/* True when the current context may ignore watermarks at least partially */
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{
	return !!__gfp_pfmemalloc_flags(gfp_mask);
}

/*
 * Checks whether it makes sense to retry the reclaim to make a forward progress
 * for the given allocation request.
 *
 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
 * without success, or when we couldn't even meet the watermark if we
 * reclaimed all remaining pages on the LRU lists.
 *
 * Returns true if a retry is viable or false to enter the oom path.
 */
static inline bool
should_reclaim_retry(gfp_t gfp_mask, unsigned order,
		     struct alloc_context *ac, int alloc_flags,
		     bool did_some_progress, int *no_progress_loops)
{
	struct zone *zone;
	struct zoneref *z;
	bool ret = false;

	/*
	 * Costly allocations might have made a progress but this doesn't mean
	 * their order will become available due to high fragmentation so
	 * always increment the no progress counter for them
	 */
	if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
		*no_progress_loops = 0;
	else
		(*no_progress_loops)++;

	/*
	 * Make sure we converge to OOM if we cannot make any progress
	 * several times in the row.
	 */
	if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
		/* Before OOM, exhaust highatomic_reserve */
		return unreserve_highatomic_pageblock(ac, true);
	}

	/*
	 * Keep reclaiming pages while there is a chance this will lead
	 * somewhere. If none of the target zones can satisfy our allocation
	 * request even if all reclaimable pages are considered then we are
	 * screwed and have to go OOM.
	 */
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
				ac->highest_zoneidx, ac->nodemask) {
		unsigned long available;
		unsigned long reclaimable;
		unsigned long min_wmark = min_wmark_pages(zone);
		bool wmark;

		/* best case: free pages plus everything reclaimable */
		available = reclaimable = zone_reclaimable_pages(zone);
		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);

		/*
		 * Would the allocation succeed if we reclaimed all
		 * reclaimable pages?
		 */
		wmark = __zone_watermark_ok(zone, order, min_wmark,
				ac->highest_zoneidx, alloc_flags, available);
		trace_reclaim_retry_zone(z, order, reclaimable,
				available, min_wmark, *no_progress_loops, wmark);
		if (wmark) {
			/*
			 * If we didn't make any progress and have a lot of
			 * dirty + writeback pages then we should wait for
			 * an IO to complete to slow down the reclaim and
			 * prevent from pre mature OOM
			 */
			if (!did_some_progress) {
				unsigned long write_pending;

				write_pending = zone_page_state_snapshot(zone,
							NR_ZONE_WRITE_PENDING);

				/* more than half of reclaimable is in flight */
				if (2 * write_pending > reclaimable) {
					congestion_wait(BLK_RW_ASYNC, HZ/10);
					return true;
				}
			}

			ret = true;
			goto out;
		}
	}

out:
	/*
	 * Memory allocation/reclaim might be called from a WQ context and the
	 * current implementation of the WQ concurrency control doesn't
	 * recognize that a particular WQ is congested if the worker thread is
	 * looping without ever sleeping. Therefore we have to do a short sleep
	 * here rather than calling cond_resched().
	 */
	if (current->flags & PF_WQ_WORKER)
		schedule_timeout_uninterruptible(1);
	else
		cond_resched();
	return ret;
}

/*
 * Decide whether a failed allocation should be retried because of a cpuset
 * race. May clear ac->nodemask when it no longer intersects the cpuset.
 */
static inline bool
check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
{
	/*
	 * It's possible that cpuset's mems_allowed and the nodemask from
	 * mempolicy don't intersect. This should be normally dealt with by
	 * policy_nodemask(), but it's possible to race with cpuset update in
	 * such a way the check therein was true, and then it became false
	 * before we got our cpuset_mems_cookie here.
	 * This assumes that for all allocations, ac->nodemask can come only
	 * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
	 * when it does not intersect with the cpuset restrictions) or the
	 * caller can deal with a violated nodemask.
	 */
	if (cpusets_enabled() && ac->nodemask &&
			!cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
		ac->nodemask = NULL;
		return true;
	}

	/*
	 * When updating a task's mems_allowed or mempolicy nodemask, it is
	 * possible to race with parallel threads in such a way that our
	 * allocation can fail while the mask is being updated. If we are about
	 * to fail, check if the cpuset changed during allocation and if so,
	 * retry.
 */
	if (read_mems_allowed_retry(cpuset_mems_cookie))
		return true;

	return false;
}

/*
 * The allocator slow path: entered when the initial low-watermark attempt
 * fails.  Escalates through kswapd wakeups, direct compaction, direct
 * reclaim and finally the OOM killer, retrying for as long as forward
 * progress is being made (or __GFP_NOFAIL demands it).
 */
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
						struct alloc_context *ac)
{
	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
	const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
	struct page *page = NULL;
	unsigned int alloc_flags;
	unsigned long did_some_progress;
	enum compact_priority compact_priority;
	enum compact_result compact_result;
	int compaction_retries;
	int no_progress_loops;
	unsigned int cpuset_mems_cookie;
	unsigned int zonelist_iter_cookie;
	int reserve_flags;
#ifdef CONFIG_RECLAIM_ACCT
	struct reclaim_acct ra = {0};
#endif

	/*
	 * We also sanity check to catch abuse of atomic reserves being used by
	 * callers that are not in atomic context.
	 */
	if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
				(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
		gfp_mask &= ~__GFP_ATOMIC;

restart:
	compaction_retries = 0;
	no_progress_loops = 0;
	compact_priority = DEF_COMPACT_PRIORITY;
	/* Snapshot cpuset/zonelist generations to detect concurrent updates */
	cpuset_mems_cookie = read_mems_allowed_begin();
	zonelist_iter_cookie = zonelist_iter_begin();

	/*
	 * The fast path uses conservative alloc_flags to succeed only until
	 * kswapd needs to be woken up, and to avoid the cost of setting up
	 * alloc_flags precisely. So we do that now.
	 */
	alloc_flags = gfp_to_alloc_flags(gfp_mask);

	/*
	 * We need to recalculate the starting point for the zonelist iterator
	 * because we might have used different nodemask in the fast path, or
	 * there was a cpuset modification and we are retrying - otherwise we
	 * could end up iterating over non-eligible zones endlessly.
	 */
	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->highest_zoneidx, ac->nodemask);
	if (!ac->preferred_zoneref->zone)
		goto nopage;

	if (alloc_flags & ALLOC_KSWAPD)
		wake_all_kswapds(order, gfp_mask, ac);

	/*
	 * The adjusted alloc_flags might result in immediate success, so try
	 * that first
	 */
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page)
		goto got_pg;

	/*
	 * For costly allocations, try direct compaction first, as it's likely
	 * that we have enough base pages and don't need to reclaim. For non-
	 * movable high-order allocations, do that as well, as compaction will
	 * try to prevent permanent fragmentation by migrating from blocks of
	 * the same migratetype.
	 * Don't try this for allocations that are allowed to ignore
	 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
	 */
	if (can_direct_reclaim &&
			(costly_order ||
			   (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
			&& !gfp_pfmemalloc_allowed(gfp_mask)) {
		page = __alloc_pages_direct_compact(gfp_mask, order,
						alloc_flags, ac,
						INIT_COMPACT_PRIORITY,
						&compact_result);
		if (page)
			goto got_pg;

		/*
		 * Checks for costly allocations with __GFP_NORETRY, which
		 * includes some THP page fault allocations
		 */
		if (costly_order && (gfp_mask & __GFP_NORETRY)) {
			/*
			 * If allocating entire pageblock(s) and compaction
			 * failed because all zones are below low watermarks
			 * or is prohibited because it recently failed at this
			 * order, fail immediately unless the allocator has
			 * requested compaction and reclaim retry.
			 *
			 * Reclaim is
			 *  - potentially very expensive because zones are far
			 *    below their low watermarks or this is part of very
			 *    bursty high order allocations,
			 *  - not guaranteed to help because isolate_freepages()
			 *    may not iterate over freed pages as part of its
			 *    linear scan, and
			 *  - unlikely to make entire pageblocks free on its
			 *    own.
			 */
			if (compact_result == COMPACT_SKIPPED ||
			    compact_result == COMPACT_DEFERRED)
				goto nopage;

			/*
			 * Looks like reclaim/compaction is worth trying, but
			 * sync compaction could be very expensive, so keep
			 * using async compaction.
			 */
			compact_priority = INIT_COMPACT_PRIORITY;
		}
	}

retry:
	/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
	if (alloc_flags & ALLOC_KSWAPD)
		wake_all_kswapds(order, gfp_mask, ac);

	/* May grant access to memory reserves (e.g. for OOM victims) */
	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
	if (reserve_flags)
		alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);

	/*
	 * Reset the nodemask and zonelist iterators if memory policies can be
	 * ignored. These allocations are high priority and system rather than
	 * user oriented.
	 */
	if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
		ac->nodemask = NULL;
		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->highest_zoneidx, ac->nodemask);
	}

	/* Attempt with potentially adjusted zonelist and alloc_flags */
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page)
		goto got_pg;

	/* Caller is not willing to reclaim, we can't balance anything */
	if (!can_direct_reclaim)
		goto nopage;

	/* Avoid recursion of direct reclaim */
	if (current->flags & PF_MEMALLOC)
		goto nopage;

	/* Try direct reclaim and then allocating */
#ifdef CONFIG_RECLAIM_ACCT
	reclaimacct_start(DIRECT_RECLAIMS, &ra);
#endif
	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
							&did_some_progress);
#ifdef CONFIG_RECLAIM_ACCT
	reclaimacct_end(DIRECT_RECLAIMS);
#endif
	if (page)
		goto got_pg;

	/* Try direct compaction and then allocating */
	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
					compact_priority, &compact_result);
	if (page)
		goto got_pg;

	/* Do not loop if specifically requested */
	if (gfp_mask & __GFP_NORETRY)
		goto nopage;

	/*
	 * Do not retry costly high order allocations unless they are
	 * __GFP_RETRY_MAYFAIL
	 */
	if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
		goto nopage;

	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
				 did_some_progress > 0, &no_progress_loops))
		goto retry;

	/*
	 * It doesn't make any sense to retry for the compaction if the order-0
	 * reclaim is not able to make any progress because the current
	 * implementation of the compaction depends on the sufficient amount
	 * of free memory (see __compaction_suitable)
	 */
	if (did_some_progress > 0 &&
			should_compact_retry(ac, order, alloc_flags,
				compact_result, &compact_priority,
				&compaction_retries))
		goto retry;


	/*
	 * Deal with possible cpuset update races or zonelist updates to avoid
	 * an unnecessary OOM kill.
	 */
	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
	    check_retry_zonelist(zonelist_iter_cookie))
		goto restart;

	/* Reclaim has failed us, start killing things */
	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
	if (page)
		goto got_pg;

	/* Avoid allocations with no watermarks from looping endlessly */
	if (tsk_is_oom_victim(current) &&
	    (alloc_flags & ALLOC_OOM ||
	     (gfp_mask & __GFP_NOMEMALLOC)))
		goto nopage;

	/* Retry as long as the OOM killer is making progress */
	if (did_some_progress) {
		no_progress_loops = 0;
		goto retry;
	}

nopage:
	/*
	 * Deal with possible cpuset update races or zonelist updates to avoid
	 * an unnecessary OOM kill.
	 */
	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
	    check_retry_zonelist(zonelist_iter_cookie))
		goto restart;

	/*
	 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
	 * we always retry
	 */
	if (gfp_mask & __GFP_NOFAIL) {
		/*
		 * All existing users of the __GFP_NOFAIL are blockable, so warn
		 * of any new users that actually require GFP_NOWAIT
		 */
		if (WARN_ON_ONCE(!can_direct_reclaim))
			goto fail;

		/*
		 * PF_MEMALLOC request from this context is rather bizarre
		 * because we cannot reclaim anything and only can loop waiting
		 * for somebody to do a work for us
		 */
		WARN_ON_ONCE(current->flags & PF_MEMALLOC);

		/*
		 * non failing costly orders are a hard requirement which we
		 * are not prepared for much so let's warn about these users
		 * so that we can identify them and convert them to something
		 * else.
		 */
		WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);

		/*
		 * Help non-failing allocations by giving them access to memory
		 * reserves but do not use ALLOC_NO_WATERMARKS because this
		 * could deplete whole memory reserves which would just make
		 * the situation worse
		 */
		page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
		if (page)
			goto got_pg;

		cond_resched();
		goto retry;
	}
fail:
	warn_alloc(gfp_mask, ac->nodemask,
			"page allocation failure: order:%u", order);
got_pg:
	return page;
}

/*
 * Fill in the alloc_context from the gfp mask, preferred node and nodemask
 * and compute the initial alloc_flags for the fast path.  Returns false when
 * the allocation is rejected (e.g. by fault injection).
 */
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
		int preferred_nid, nodemask_t *nodemask,
		struct alloc_context *ac, gfp_t *alloc_mask,
		unsigned int *alloc_flags)
{
	ac->highest_zoneidx = gfp_zone(gfp_mask);
	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
	ac->nodemask = nodemask;
	ac->migratetype = gfp_migratetype(gfp_mask);

	if (cpusets_enabled()) {
		*alloc_mask |= __GFP_HARDWALL;
		/*
		 * When we are in the interrupt context, it is irrelevant
		 * to the current task context. It means that any node ok.
		 */
		if (!in_interrupt() && !ac->nodemask)
			ac->nodemask = &cpuset_current_mems_allowed;
		else
			*alloc_flags |= ALLOC_CPUSET;
	}

	/* Acquire/release pair annotates that this context may enter fs reclaim */
	fs_reclaim_acquire(gfp_mask);
	fs_reclaim_release(gfp_mask);

	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);

#ifdef CONFIG_HYPERHOLD_ZSWAPD
	/* Kick zswapd alongside kswapd-style background reclaim */
	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
		wake_all_zswapd();
#endif

	/* Fault injection hook: may deliberately fail this allocation */
	if (should_fail_alloc_page(gfp_mask, order))
		return false;

	*alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);

	/* Dirty zone balancing only done in the fast path */
	ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);

	/*
	 * The preferred zone is used for statistics but crucially it is
	 * also used as the starting point for the zonelist iterator. It
	 * may get reset for allocations that ignore memory policies.
	 */
	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->highest_zoneidx, ac->nodemask);

	return true;
}

/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
							nodemask_t *nodemask)
{
	struct page *page;
	unsigned int alloc_flags = ALLOC_WMARK_LOW;
	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
	struct alloc_context ac = { };

	/*
	 * There are several places where we assume that the order value is sane
	 * so bail out early if the request is out of bound.
	 */
	if (unlikely(order >= MAX_ORDER)) {
		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
		return NULL;
	}

	gfp_mask &= gfp_allowed_mask;
	alloc_mask = gfp_mask;
	if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
		return NULL;

	/*
	 * Forbid the first pass from falling back to types that fragment
	 * memory until all local zones are considered.
	 */
	alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);

	/* First allocation attempt: fast path against the low watermark */
	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
	if (likely(page))
		goto out;

	/*
	 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
	 * resp. GFP_NOIO which has to be inherited for all allocation requests
	 * from a particular context which has been marked by
	 * memalloc_no{fs,io}_{save,restore}.
	 */
	alloc_mask = current_gfp_context(gfp_mask);
	ac.spread_dirty_pages = false;

	/*
	 * Restore the original nodemask if it was potentially replaced with
	 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
	 */
	ac.nodemask = nodemask;

	page = __alloc_pages_slowpath(alloc_mask, order, &ac);

out:
	/* Charge __GFP_ACCOUNT pages to the memcg; undo the allocation if the
	 * charge fails.
	 */
	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
	    unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
		__free_pages(page, order);
		page = NULL;
	}

	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

	return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);

/*
 * Common helper functions. Never use with __GFP_HIGHMEM because the returned
 * address cannot represent highmem pages. Use alloc_pages and then kmap if
 * you need to access high mem.
 */
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;

	page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
	if (!page)
		return 0;
	/* Return the kernel virtual address of the first page */
	return (unsigned long) page_address(page);
}
EXPORT_SYMBOL(__get_free_pages);

/* Allocate a single zeroed page and return its kernel virtual address */
unsigned long get_zeroed_page(gfp_t gfp_mask)
{
	return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
}
EXPORT_SYMBOL(get_zeroed_page);

/*
 * Hand a block back to the allocator: order-0 pages may go through the
 * per-cpu page lists, higher orders take the buddy free path.
 */
static inline void free_the_page(struct page *page, unsigned int order)
{
	if (order == 0)		/* Via pcp? */
		free_unref_page(page);
	else
		__free_pages_ok(page, order, FPI_NONE);
}

void __free_pages(struct page *page, unsigned int order)
{
	/* get PageHead before we drop reference */
	int head = PageHead(page);

	if (put_page_testzero(page))
		free_the_page(page, order);
	else if (!head)
		/*
		 * Not the last reference and not a compound page: the
		 * remaining reference can only pin the first order-0 page,
		 * so free the other sub-blocks ourselves.
		 */
		while (order-- > 0)
			free_the_page(page + (1 << order), order);
}
EXPORT_SYMBOL(__free_pages);

/* Free pages addressed by kernel virtual address (0 is a no-op) */
void free_pages(unsigned long addr, unsigned int order)
{
	if (addr != 0) {
		VM_BUG_ON(!virt_addr_valid((void *)addr));
		__free_pages(virt_to_page((void *)addr), order);
	}
}

EXPORT_SYMBOL(free_pages);

/*
 * Page Fragment:
 *  An arbitrary-length arbitrary-offset area of memory which resides
 *  within a 0 or higher order page.  Multiple fragments within that page
 *  are individually refcounted, in the page's reference counter.
 *
 * The page_frag functions below provide a simple allocation framework for
 * page fragments.
 * This is used by the network stack and network device
 * drivers to provide a backing region of memory for use as either an
 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
 */

/*
 * Refill the cache's backing page: prefer one high-order page
 * (PAGE_FRAG_CACHE_MAX_ORDER), falling back to a single order-0 page.
 * Updates nc->size and nc->va; returns the page or NULL.
 */
static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
					     gfp_t gfp_mask)
{
	struct page *page = NULL;
	gfp_t gfp = gfp_mask;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
	/* The high-order attempt must not warn, retry or dip into reserves */
	gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
		    __GFP_NOMEMALLOC;
	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
				PAGE_FRAG_CACHE_MAX_ORDER);
	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
#endif
	if (unlikely(!page))
		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);

	nc->va = page ? page_address(page) : NULL;

#ifdef CONFIG_PAGE_TRACING
	/* Tag each sub-page as an SKB page and account them in vmstat */
	if (likely(page)) {
		int order = get_order(nc->size);
		int i;
		struct page *newpage = page;
		unsigned int deta = 1U << (unsigned int)order;

		for (i = 0; i < (1 << order); i++) {
			if (!newpage)
				break;
			SetPageSKB(newpage);
			newpage++;
		}
		mod_zone_page_state(page_zone(page), NR_SKB_PAGES, (long)deta);
	}
#endif

	return page;
}

/*
 * Drop @count references from the fragment cache's backing @page and free
 * it once the refcount reaches zero.
 */
void __page_frag_cache_drain(struct page *page, unsigned int count)
{
	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);

	if (page_ref_sub_and_test(page, count)) {
#ifdef CONFIG_PAGE_TRACING
		if (likely(page)) {
			unsigned int deta = 1U << compound_order(page);

			mod_zone_page_state(page_zone(page), NR_SKB_PAGES, -(long)deta);
		}
#endif
		free_the_page(page, compound_order(page));
	}
}
EXPORT_SYMBOL(__page_frag_cache_drain);

/*
 * Carve @fragsz bytes out of the cache's current backing page, refilling
 * when exhausted.  Fragments are handed out from the end of the page towards
 * the start (nc->offset decreases); pagecnt_bias lets the page be reused
 * without an atomic refcount operation per fragment.  Returns a kernel
 * virtual address, or NULL on allocation failure or fragsz overflow.
 */
void *page_frag_alloc(struct page_frag_cache *nc,
		      unsigned int fragsz, gfp_t gfp_mask)
{
	unsigned int size = PAGE_SIZE;
	struct page *page;
	int offset;

	if (unlikely(!nc->va)) {
refill:
		page = __page_frag_cache_refill(nc, gfp_mask);
		if (!page)
			return NULL;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
		/* if size can vary use size else just use PAGE_SIZE */
		size = nc->size;
#endif
		/* Even if we own the page, we do not use atomic_set().
		 * This would break get_page_unless_zero() users.
		 */
		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);

		/* reset page count bias and offset to start of new frag */
		nc->pfmemalloc = page_is_pfmemalloc(page);
		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
		nc->offset = size;
	}

	offset = nc->offset - fragsz;
	if (unlikely(offset < 0)) {
		/* Page exhausted: recycle it if we hold all remaining refs */
		page = virt_to_page(nc->va);

		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
			goto refill;

		/* Never recycle a page taken from memory reserves */
		if (unlikely(nc->pfmemalloc)) {
			free_the_page(page, compound_order(page));
			goto refill;
		}

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
		/* if size can vary use size else just use PAGE_SIZE */
		size = nc->size;
#endif
		/* OK, page count is 0, we can safely set it */
		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);

		/* reset page count bias and offset to start of new frag */
		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
		offset = size - fragsz;
		if (unlikely(offset < 0)) {
			/*
			 * The caller is trying to allocate a fragment
			 * with fragsz > PAGE_SIZE but the cache isn't big
			 * enough to satisfy the request, this may
			 * happen in low memory conditions.
			 * We don't release the cache page because
			 * it could make memory pressure worse
			 * so we simply return NULL here.
			 */
			return NULL;
		}
	}

	nc->pagecnt_bias--;
	nc->offset = offset;

	return nc->va + offset;
}
EXPORT_SYMBOL(page_frag_alloc);

/*
 * Frees a page fragment allocated out of either a compound or order 0 page.
52698c2ecf20Sopenharmony_ci */ 52708c2ecf20Sopenharmony_civoid page_frag_free(void *addr) 52718c2ecf20Sopenharmony_ci{ 52728c2ecf20Sopenharmony_ci struct page *page = virt_to_head_page(addr); 52738c2ecf20Sopenharmony_ci 52748c2ecf20Sopenharmony_ci if (unlikely(put_page_testzero(page))) { 52758c2ecf20Sopenharmony_ci#ifdef CONFIG_PAGE_TRACING 52768c2ecf20Sopenharmony_ci if (likely(page)) { 52778c2ecf20Sopenharmony_ci unsigned int deta = 1U << compound_order(page); 52788c2ecf20Sopenharmony_ci 52798c2ecf20Sopenharmony_ci mod_zone_page_state(page_zone(page), NR_SKB_PAGES, -(long)deta); 52808c2ecf20Sopenharmony_ci } 52818c2ecf20Sopenharmony_ci#endif 52828c2ecf20Sopenharmony_ci free_the_page(page, compound_order(page)); 52838c2ecf20Sopenharmony_ci } 52848c2ecf20Sopenharmony_ci} 52858c2ecf20Sopenharmony_ciEXPORT_SYMBOL(page_frag_free); 52868c2ecf20Sopenharmony_ci 52878c2ecf20Sopenharmony_cistatic void *make_alloc_exact(unsigned long addr, unsigned int order, 52888c2ecf20Sopenharmony_ci size_t size) 52898c2ecf20Sopenharmony_ci{ 52908c2ecf20Sopenharmony_ci if (addr) { 52918c2ecf20Sopenharmony_ci unsigned long alloc_end = addr + (PAGE_SIZE << order); 52928c2ecf20Sopenharmony_ci unsigned long used = addr + PAGE_ALIGN(size); 52938c2ecf20Sopenharmony_ci 52948c2ecf20Sopenharmony_ci split_page(virt_to_page((void *)addr), order); 52958c2ecf20Sopenharmony_ci while (used < alloc_end) { 52968c2ecf20Sopenharmony_ci free_page(used); 52978c2ecf20Sopenharmony_ci used += PAGE_SIZE; 52988c2ecf20Sopenharmony_ci } 52998c2ecf20Sopenharmony_ci } 53008c2ecf20Sopenharmony_ci return (void *)addr; 53018c2ecf20Sopenharmony_ci} 53028c2ecf20Sopenharmony_ci 53038c2ecf20Sopenharmony_ci/** 53048c2ecf20Sopenharmony_ci * alloc_pages_exact - allocate an exact number physically-contiguous pages. 
53058c2ecf20Sopenharmony_ci * @size: the number of bytes to allocate 53068c2ecf20Sopenharmony_ci * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP 53078c2ecf20Sopenharmony_ci * 53088c2ecf20Sopenharmony_ci * This function is similar to alloc_pages(), except that it allocates the 53098c2ecf20Sopenharmony_ci * minimum number of pages to satisfy the request. alloc_pages() can only 53108c2ecf20Sopenharmony_ci * allocate memory in power-of-two pages. 53118c2ecf20Sopenharmony_ci * 53128c2ecf20Sopenharmony_ci * This function is also limited by MAX_ORDER. 53138c2ecf20Sopenharmony_ci * 53148c2ecf20Sopenharmony_ci * Memory allocated by this function must be released by free_pages_exact(). 53158c2ecf20Sopenharmony_ci * 53168c2ecf20Sopenharmony_ci * Return: pointer to the allocated area or %NULL in case of error. 53178c2ecf20Sopenharmony_ci */ 53188c2ecf20Sopenharmony_civoid *alloc_pages_exact(size_t size, gfp_t gfp_mask) 53198c2ecf20Sopenharmony_ci{ 53208c2ecf20Sopenharmony_ci unsigned int order = get_order(size); 53218c2ecf20Sopenharmony_ci unsigned long addr; 53228c2ecf20Sopenharmony_ci 53238c2ecf20Sopenharmony_ci if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) 53248c2ecf20Sopenharmony_ci gfp_mask &= ~__GFP_COMP; 53258c2ecf20Sopenharmony_ci 53268c2ecf20Sopenharmony_ci addr = __get_free_pages(gfp_mask, order); 53278c2ecf20Sopenharmony_ci return make_alloc_exact(addr, order, size); 53288c2ecf20Sopenharmony_ci} 53298c2ecf20Sopenharmony_ciEXPORT_SYMBOL(alloc_pages_exact); 53308c2ecf20Sopenharmony_ci 53318c2ecf20Sopenharmony_ci/** 53328c2ecf20Sopenharmony_ci * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 53338c2ecf20Sopenharmony_ci * pages on a node. 
 * @nid: the preferred node ID where memory should be allocated
 * @size: the number of bytes to allocate
 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
 *
 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
 * back.
 *
 * Return: pointer to the allocated area or %NULL in case of error.
 */
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
	unsigned int order = get_order(size);
	struct page *p;

	/* A compound page cannot be split by make_alloc_exact(). */
	if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
		gfp_mask &= ~__GFP_COMP;

	p = alloc_pages_node(nid, gfp_mask, order);
	if (!p)
		return NULL;
	/* Trim the power-of-two allocation down to the exact size. */
	return make_alloc_exact((unsigned long)page_address(p), order, size);
}

/**
 * free_pages_exact - release memory allocated via alloc_pages_exact()
 * @virt: the value returned by alloc_pages_exact.
 * @size: size of allocation, same value as passed to alloc_pages_exact().
 *
 * Release the memory allocated by a previous call to alloc_pages_exact.
 */
void free_pages_exact(void *virt, size_t size)
{
	unsigned long addr = (unsigned long)virt;
	unsigned long end = addr + PAGE_ALIGN(size);

	/*
	 * The allocation was split into order-0 pages by make_alloc_exact(),
	 * so each page is freed individually.
	 */
	while (addr < end) {
		free_page(addr);
		addr += PAGE_SIZE;
	}
}
EXPORT_SYMBOL(free_pages_exact);

/**
 * nr_free_zone_pages - count number of pages beyond high watermark
 * @offset: The zone index of the highest zone
 *
 * nr_free_zone_pages() counts the number of pages which are beyond the
 * high watermark within all zones at or below a given zone index. For each
 * zone, the number of pages is calculated as:
 *
 *     nr_free_zone_pages = managed_pages - high_pages
 *
 * Return: number of pages beyond high watermark.
 */
static unsigned long nr_free_zone_pages(int offset)
{
	struct zoneref *z;
	struct zone *zone;

	/* Just pick one node, since fallback list is circular */
	unsigned long sum = 0;

	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);

	/* Walk every zone at or below @offset in this node's fallback list. */
	for_each_zone_zonelist(zone, z, zonelist, offset) {
		unsigned long size = zone_managed_pages(zone);
		unsigned long high = high_wmark_pages(zone);
		/* Only the surplus above the high watermark counts as "free". */
		if (size > high)
			sum += size - high;
	}

	return sum;
}

/**
 * nr_free_buffer_pages - count number of pages beyond high watermark
 *
 * nr_free_buffer_pages() counts the number of pages which are beyond the high
 * watermark within ZONE_DMA and ZONE_NORMAL.
 *
 * Return: number of pages beyond high watermark within ZONE_DMA and
 * ZONE_NORMAL.
 */
unsigned long nr_free_buffer_pages(void)
{
	return nr_free_zone_pages(gfp_zone(GFP_USER));
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);

/* Prefix diagnostic output with the zone's node id on NUMA builds. */
static inline void show_node(struct zone *zone)
{
	if (IS_ENABLED(CONFIG_NUMA))
		printk("Node %d ", zone_to_nid(zone));
}

/*
 * Heuristic estimate of memory available to userspace without swapping
 * (the /proc/meminfo "MemAvailable" figure).
 */
long si_mem_available(void)
{
	long available;
	unsigned long pagecache;
	unsigned long wmark_low = 0;
	unsigned long pages[NR_LRU_LISTS];
	unsigned long reclaimable;
	struct zone *zone;
	int lru;

	/* Snapshot the global LRU list sizes. */
	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
		pages[lru] = global_node_page_state(NR_LRU_BASE + lru);

	/* Total of all zones' low watermarks: memory we must not dip below. */
	for_each_zone(zone)
		wmark_low += low_wmark_pages(zone);

	/*
	 * Estimate the amount of memory available for userspace allocations,
	 * without causing swapping.
	 */
	available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;

	/*
	 * Not all the page cache can be freed, otherwise the system will
	 * start swapping. Assume at least half of the page cache, or the
	 * low watermark worth of cache, needs to stay.
	 */
	pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
	pagecache -= min(pagecache / 2, wmark_low);
	available += pagecache;

	/*
	 * Part of the reclaimable slab and other kernel memory consists of
	 * items that are in use, and cannot be freed. Cap this estimate at the
	 * low watermark.
	 */
	reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
		global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
	available += reclaimable - min(reclaimable / 2, wmark_low);

	/* The heuristic can go negative on tight systems; clamp to zero. */
	if (available < 0)
		available = 0;
	return available;
}
EXPORT_SYMBOL_GPL(si_mem_available);

/* Fill @val with system-wide memory statistics (sysinfo(2) backend). */
void si_meminfo(struct sysinfo *val)
{
	val->totalram = totalram_pages();
	val->sharedram = global_node_page_state(NR_SHMEM);
	val->freeram = global_zone_page_state(NR_FREE_PAGES);
	val->bufferram = nr_blockdev_pages();
	val->totalhigh = totalhigh_pages();
	val->freehigh = nr_free_highpages();
	val->mem_unit = PAGE_SIZE;
}

EXPORT_SYMBOL(si_meminfo);

#ifdef CONFIG_NUMA
54898c2ecf20Sopenharmony_civoid si_meminfo_node(struct sysinfo *val, int nid) 54908c2ecf20Sopenharmony_ci{ 54918c2ecf20Sopenharmony_ci int zone_type; /* needs to be signed */ 54928c2ecf20Sopenharmony_ci unsigned long managed_pages = 0; 54938c2ecf20Sopenharmony_ci unsigned long managed_highpages = 0; 54948c2ecf20Sopenharmony_ci unsigned long free_highpages = 0; 54958c2ecf20Sopenharmony_ci pg_data_t *pgdat = NODE_DATA(nid); 54968c2ecf20Sopenharmony_ci 54978c2ecf20Sopenharmony_ci for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 54988c2ecf20Sopenharmony_ci managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); 54998c2ecf20Sopenharmony_ci val->totalram = managed_pages; 55008c2ecf20Sopenharmony_ci val->sharedram = node_page_state(pgdat, NR_SHMEM); 55018c2ecf20Sopenharmony_ci val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); 55028c2ecf20Sopenharmony_ci#ifdef CONFIG_HIGHMEM 55038c2ecf20Sopenharmony_ci for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 55048c2ecf20Sopenharmony_ci struct zone *zone = &pgdat->node_zones[zone_type]; 55058c2ecf20Sopenharmony_ci 55068c2ecf20Sopenharmony_ci if (is_highmem(zone)) { 55078c2ecf20Sopenharmony_ci managed_highpages += zone_managed_pages(zone); 55088c2ecf20Sopenharmony_ci free_highpages += zone_page_state(zone, NR_FREE_PAGES); 55098c2ecf20Sopenharmony_ci } 55108c2ecf20Sopenharmony_ci } 55118c2ecf20Sopenharmony_ci val->totalhigh = managed_highpages; 55128c2ecf20Sopenharmony_ci val->freehigh = free_highpages; 55138c2ecf20Sopenharmony_ci#else 55148c2ecf20Sopenharmony_ci val->totalhigh = managed_highpages; 55158c2ecf20Sopenharmony_ci val->freehigh = free_highpages; 55168c2ecf20Sopenharmony_ci#endif 55178c2ecf20Sopenharmony_ci val->mem_unit = PAGE_SIZE; 55188c2ecf20Sopenharmony_ci} 55198c2ecf20Sopenharmony_ci#endif 55208c2ecf20Sopenharmony_ci 55218c2ecf20Sopenharmony_ci/* 55228c2ecf20Sopenharmony_ci * Determine whether the node should be displayed or not, depending on whether 
 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
 */
static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
{
	/* Without the filter flag, every node is shown. */
	if (!(flags & SHOW_MEM_FILTER_NODES))
		return false;

	/*
	 * no node mask - aka implicit memory numa policy. Do not bother with
	 * the synchronization - read_mems_allowed_begin - because we do not
	 * have to be precise here.
	 */
	if (!nodemask)
		nodemask = &cpuset_current_mems_allowed;

	return !node_isset(nid, *nodemask);
}

/* Convert a page count to kilobytes for human-readable output. */
#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * Print one letter per migratetype present in the @type bitmask,
 * e.g. "(UM) " for unmovable + movable.
 */
static void show_migration_types(unsigned char type)
{
	static const char types[MIGRATE_TYPES] = {
		[MIGRATE_UNMOVABLE]	= 'U',
		[MIGRATE_MOVABLE]	= 'M',
		[MIGRATE_RECLAIMABLE]	= 'E',
		[MIGRATE_HIGHATOMIC]	= 'H',
#ifdef CONFIG_CMA
		[MIGRATE_CMA]		= 'C',
#endif
#ifdef CONFIG_MEMORY_ISOLATION
		[MIGRATE_ISOLATE]	= 'I',
#endif
	};
	char tmp[MIGRATE_TYPES + 1];
	char *p = tmp;
	int i;

	for (i = 0; i < MIGRATE_TYPES; i++) {
		if (type & (1 << i))
			*p++ = types[i];
	}

	*p = '\0';
	printk(KERN_CONT "(%s) ", tmp);
}

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 *
 * Bits in @filter:
 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
 *   cpuset.
 */
void show_free_areas(unsigned int filter, nodemask_t *nodemask)
{
	unsigned long free_pcp = 0;
	int cpu;
	struct zone *zone;
	pg_data_t *pgdat;

	/* Sum the per-cpu pagelist counts of every zone we will display. */
	for_each_populated_zone(zone) {
		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
			continue;

		for_each_online_cpu(cpu)
			free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
	}

	/* Global (all-node) counters first. */
	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
		" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
		" unevictable:%lu dirty:%lu writeback:%lu\n"
		" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
		" free:%lu free_pcp:%lu free_cma:%lu\n",
		global_node_page_state(NR_ACTIVE_ANON),
		global_node_page_state(NR_INACTIVE_ANON),
		global_node_page_state(NR_ISOLATED_ANON),
		global_node_page_state(NR_ACTIVE_FILE),
		global_node_page_state(NR_INACTIVE_FILE),
		global_node_page_state(NR_ISOLATED_FILE),
		global_node_page_state(NR_UNEVICTABLE),
		global_node_page_state(NR_FILE_DIRTY),
		global_node_page_state(NR_WRITEBACK),
		global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
		global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
		global_node_page_state(NR_FILE_MAPPED),
		global_node_page_state(NR_SHMEM),
		global_zone_page_state(NR_PAGETABLE),
		global_zone_page_state(NR_BOUNCE),
		global_zone_page_state(NR_FREE_PAGES),
		free_pcp,
		global_zone_page_state(NR_FREE_CMA_PAGES));

	/* Per-node (pgdat) counters. */
	for_each_online_pgdat(pgdat) {
		if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
			continue;

		printk("Node %d"
			" active_anon:%lukB"
			" inactive_anon:%lukB"
			" active_file:%lukB"
			" inactive_file:%lukB"
			" unevictable:%lukB"
			" isolated(anon):%lukB"
			" isolated(file):%lukB"
			" mapped:%lukB"
			" dirty:%lukB"
			" writeback:%lukB"
			" shmem:%lukB"
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			" shmem_thp: %lukB"
			" shmem_pmdmapped: %lukB"
			" anon_thp: %lukB"
#endif
			" writeback_tmp:%lukB"
			" kernel_stack:%lukB"
#ifdef CONFIG_SHADOW_CALL_STACK
			" shadow_call_stack:%lukB"
#endif
			" all_unreclaimable? %s"
			"\n",
			pgdat->node_id,
			K(node_page_state(pgdat, NR_ACTIVE_ANON)),
			K(node_page_state(pgdat, NR_INACTIVE_ANON)),
			K(node_page_state(pgdat, NR_ACTIVE_FILE)),
			K(node_page_state(pgdat, NR_INACTIVE_FILE)),
			K(node_page_state(pgdat, NR_UNEVICTABLE)),
			K(node_page_state(pgdat, NR_ISOLATED_ANON)),
			K(node_page_state(pgdat, NR_ISOLATED_FILE)),
			K(node_page_state(pgdat, NR_FILE_MAPPED)),
			K(node_page_state(pgdat, NR_FILE_DIRTY)),
			K(node_page_state(pgdat, NR_WRITEBACK)),
			K(node_page_state(pgdat, NR_SHMEM)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			/* THP counters are kept in units of huge pages. */
			K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
			K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
					* HPAGE_PMD_NR),
			K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
			K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
			/* Stack counters are already in KB, no K() conversion. */
			node_page_state(pgdat, NR_KERNEL_STACK_KB),
#ifdef CONFIG_SHADOW_CALL_STACK
			node_page_state(pgdat, NR_KERNEL_SCS_KB),
#endif
			pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
				"yes" : "no");
	}

	/* Per-zone counters and watermarks. */
	for_each_populated_zone(zone) {
		int i;

		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
			continue;

		free_pcp = 0;
		for_each_online_cpu(cpu)
			free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;

		show_node(zone);
		printk(KERN_CONT
			"%s"
			" free:%lukB"
			" min:%lukB"
			" low:%lukB"
			" high:%lukB"
			" reserved_highatomic:%luKB"
			" active_anon:%lukB"
			" inactive_anon:%lukB"
			" active_file:%lukB"
			" inactive_file:%lukB"
			" unevictable:%lukB"
			" writepending:%lukB"
			" present:%lukB"
			" managed:%lukB"
			" mlocked:%lukB"
			" pagetables:%lukB"
			" bounce:%lukB"
			" free_pcp:%lukB"
			" local_pcp:%ukB"
			" free_cma:%lukB"
			"\n",
			zone->name,
			K(zone_page_state(zone, NR_FREE_PAGES)),
			K(min_wmark_pages(zone)),
			K(low_wmark_pages(zone)),
			K(high_wmark_pages(zone)),
			K(zone->nr_reserved_highatomic),
			K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
			K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
			K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
			K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
			K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
			K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
			K(zone->present_pages),
			K(zone_managed_pages(zone)),
			K(zone_page_state(zone, NR_MLOCK)),
			K(zone_page_state(zone, NR_PAGETABLE)),
			K(zone_page_state(zone, NR_BOUNCE)),
			K(free_pcp),
			K(this_cpu_read(zone->pageset->pcp.count)),
			K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
		printk("lowmem_reserve[]:");
		for (i = 0; i < MAX_NR_ZONES; i++)
			printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
		printk(KERN_CONT "\n");
	}

	/* Buddy free-list breakdown per order and migratetype. */
	for_each_populated_zone(zone) {
		unsigned int order;
		unsigned long nr[MAX_ORDER], flags, total = 0;
		unsigned char types[MAX_ORDER];

		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
			continue;
		show_node(zone);
		printk(KERN_CONT "%s: ", zone->name);

		/* Snapshot under the zone lock so counts are self-consistent. */
		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			struct free_area *area = &zone->free_area[order];
			int type;

			nr[order] = area->nr_free;
			total += nr[order] << order;

			types[order] = 0;
			for (type = 0; type < MIGRATE_TYPES; type++) {
				if (!free_area_empty(area, type))
					types[order] |= 1 << type;
			}
		}
		spin_unlock_irqrestore(&zone->lock, flags);
		/* Print outside the lock from the snapshot taken above. */
		for (order = 0; order < MAX_ORDER; order++) {
			printk(KERN_CONT "%lu*%lukB ",
			       nr[order], K(1UL) << order);
			if (nr[order])
				show_migration_types(types[order]);
		}
		printk(KERN_CONT "= %lukB\n", K(total));
	}

	hugetlb_show_meminfo();

	printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));

	show_swap_cache_info();
}

/* Record @zone and its cached zone index into one zonelist entry. */
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
	zoneref->zone = zone;
	zoneref->zone_idx = zone_idx(zone);
}

/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
 * Zones are appended highest type first, so allocations fall back
 * downwards (e.g. NORMAL before DMA).  Returns the number of entries added.
 */
static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
{
	struct zone *zone;
	enum zone_type zone_type = MAX_NR_ZONES;
	int nr_zones = 0;

	do {
		zone_type--;
		zone = pgdat->node_zones + zone_type;
		if (populated_zone(zone)) {
			zoneref_set_zone(zone, &zonerefs[nr_zones++]);
			check_highest_zone(zone_type);
		}
	} while (zone_type);

	return nr_zones;
}

#ifdef CONFIG_NUMA

/* Validate a numa_zonelist_order string written via sysctl/cmdline. */
static int __parse_numa_zonelist_order(char *s)
{
	/*
	 * We used to support different zonlists modes but they turned
	 * out to be just not useful. Let's keep the warning in place
	 * if somebody still use the cmd line parameter so that we do
	 * not fail it silently
	 */
	if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
		pr_warn("Ignoring unsupported numa_zonelist_order value:  %s\n", s);
		return -EINVAL;
	}
	return 0;
}

char numa_zonelist_order[] = "Node";

/*
 * sysctl handler for numa_zonelist_order
 */
int numa_zonelist_order_handler(struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	/* Writes only validate the value; reads report the fixed string. */
	if (write)
		return __parse_numa_zonelist_order(buffer);
	return proc_dostring(table, write, buffer, length, ppos);
}


#define MAX_NODE_LOAD (nr_online_nodes)
/* Per-node load penalty accumulated while building fallback lists. */
static int node_load[MAX_NUMNODES];

/**
 * find_next_best_node - find the next node that should appear in a given node's fallback list
 * @node: node whose fallback list we're appending
 * @used_node_mask: nodemask_t of already used nodes
 *
 * We use a number of factors to determine which is the next node that should
 * appear on a given node's fallback list.
The node should not have appeared 58448c2ecf20Sopenharmony_ci * already in @node's fallback list, and it should be the next closest node 58458c2ecf20Sopenharmony_ci * according to the distance array (which contains arbitrary distance values 58468c2ecf20Sopenharmony_ci * from each node to each node in the system), and should also prefer nodes 58478c2ecf20Sopenharmony_ci * with no CPUs, since presumably they'll have very little allocation pressure 58488c2ecf20Sopenharmony_ci * on them otherwise. 58498c2ecf20Sopenharmony_ci * 58508c2ecf20Sopenharmony_ci * Return: node id of the found node or %NUMA_NO_NODE if no node is found. 58518c2ecf20Sopenharmony_ci */ 58528c2ecf20Sopenharmony_cistatic int find_next_best_node(int node, nodemask_t *used_node_mask) 58538c2ecf20Sopenharmony_ci{ 58548c2ecf20Sopenharmony_ci int n, val; 58558c2ecf20Sopenharmony_ci int min_val = INT_MAX; 58568c2ecf20Sopenharmony_ci int best_node = NUMA_NO_NODE; 58578c2ecf20Sopenharmony_ci 58588c2ecf20Sopenharmony_ci /* Use the local node if we haven't already */ 58598c2ecf20Sopenharmony_ci if (!node_isset(node, *used_node_mask)) { 58608c2ecf20Sopenharmony_ci node_set(node, *used_node_mask); 58618c2ecf20Sopenharmony_ci return node; 58628c2ecf20Sopenharmony_ci } 58638c2ecf20Sopenharmony_ci 58648c2ecf20Sopenharmony_ci for_each_node_state(n, N_MEMORY) { 58658c2ecf20Sopenharmony_ci 58668c2ecf20Sopenharmony_ci /* Don't want a node to appear more than once */ 58678c2ecf20Sopenharmony_ci if (node_isset(n, *used_node_mask)) 58688c2ecf20Sopenharmony_ci continue; 58698c2ecf20Sopenharmony_ci 58708c2ecf20Sopenharmony_ci /* Use the distance array to find the distance */ 58718c2ecf20Sopenharmony_ci val = node_distance(node, n); 58728c2ecf20Sopenharmony_ci 58738c2ecf20Sopenharmony_ci /* Penalize nodes under us ("prefer the next node") */ 58748c2ecf20Sopenharmony_ci val += (n < node); 58758c2ecf20Sopenharmony_ci 58768c2ecf20Sopenharmony_ci /* Give preference to headless and unused nodes */ 58778c2ecf20Sopenharmony_ci if 
(!cpumask_empty(cpumask_of_node(n))) 58788c2ecf20Sopenharmony_ci val += PENALTY_FOR_NODE_WITH_CPUS; 58798c2ecf20Sopenharmony_ci 58808c2ecf20Sopenharmony_ci /* Slight preference for less loaded node */ 58818c2ecf20Sopenharmony_ci val *= (MAX_NODE_LOAD*MAX_NUMNODES); 58828c2ecf20Sopenharmony_ci val += node_load[n]; 58838c2ecf20Sopenharmony_ci 58848c2ecf20Sopenharmony_ci if (val < min_val) { 58858c2ecf20Sopenharmony_ci min_val = val; 58868c2ecf20Sopenharmony_ci best_node = n; 58878c2ecf20Sopenharmony_ci } 58888c2ecf20Sopenharmony_ci } 58898c2ecf20Sopenharmony_ci 58908c2ecf20Sopenharmony_ci if (best_node >= 0) 58918c2ecf20Sopenharmony_ci node_set(best_node, *used_node_mask); 58928c2ecf20Sopenharmony_ci 58938c2ecf20Sopenharmony_ci return best_node; 58948c2ecf20Sopenharmony_ci} 58958c2ecf20Sopenharmony_ci 58968c2ecf20Sopenharmony_ci 58978c2ecf20Sopenharmony_ci/* 58988c2ecf20Sopenharmony_ci * Build zonelists ordered by node and zones within node. 58998c2ecf20Sopenharmony_ci * This results in maximum locality--normal zone overflows into local 59008c2ecf20Sopenharmony_ci * DMA zone, if any--but risks exhausting DMA zone. 
59018c2ecf20Sopenharmony_ci */ 59028c2ecf20Sopenharmony_cistatic void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, 59038c2ecf20Sopenharmony_ci unsigned nr_nodes) 59048c2ecf20Sopenharmony_ci{ 59058c2ecf20Sopenharmony_ci struct zoneref *zonerefs; 59068c2ecf20Sopenharmony_ci int i; 59078c2ecf20Sopenharmony_ci 59088c2ecf20Sopenharmony_ci zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 59098c2ecf20Sopenharmony_ci 59108c2ecf20Sopenharmony_ci for (i = 0; i < nr_nodes; i++) { 59118c2ecf20Sopenharmony_ci int nr_zones; 59128c2ecf20Sopenharmony_ci 59138c2ecf20Sopenharmony_ci pg_data_t *node = NODE_DATA(node_order[i]); 59148c2ecf20Sopenharmony_ci 59158c2ecf20Sopenharmony_ci nr_zones = build_zonerefs_node(node, zonerefs); 59168c2ecf20Sopenharmony_ci zonerefs += nr_zones; 59178c2ecf20Sopenharmony_ci } 59188c2ecf20Sopenharmony_ci zonerefs->zone = NULL; 59198c2ecf20Sopenharmony_ci zonerefs->zone_idx = 0; 59208c2ecf20Sopenharmony_ci} 59218c2ecf20Sopenharmony_ci 59228c2ecf20Sopenharmony_ci/* 59238c2ecf20Sopenharmony_ci * Build gfp_thisnode zonelists 59248c2ecf20Sopenharmony_ci */ 59258c2ecf20Sopenharmony_cistatic void build_thisnode_zonelists(pg_data_t *pgdat) 59268c2ecf20Sopenharmony_ci{ 59278c2ecf20Sopenharmony_ci struct zoneref *zonerefs; 59288c2ecf20Sopenharmony_ci int nr_zones; 59298c2ecf20Sopenharmony_ci 59308c2ecf20Sopenharmony_ci zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; 59318c2ecf20Sopenharmony_ci nr_zones = build_zonerefs_node(pgdat, zonerefs); 59328c2ecf20Sopenharmony_ci zonerefs += nr_zones; 59338c2ecf20Sopenharmony_ci zonerefs->zone = NULL; 59348c2ecf20Sopenharmony_ci zonerefs->zone_idx = 0; 59358c2ecf20Sopenharmony_ci} 59368c2ecf20Sopenharmony_ci 59378c2ecf20Sopenharmony_ci/* 59388c2ecf20Sopenharmony_ci * Build zonelists ordered by zone and nodes within zones. 
 * This results in conserving DMA zone[s] until all Normal memory is
 * exhausted, but results in overflowing to remote node while memory
 * may still exist in local DMA zone.
 */

static void build_zonelists(pg_data_t *pgdat)
{
	/*
	 * Node IDs in best-first fallback order for this node. static,
	 * presumably to keep MAX_NUMNODES ints off the stack — callers
	 * serialize via zonelist_update_seq (taken in __build_all_zonelists),
	 * so there is no concurrent use.
	 */
	static int node_order[MAX_NUMNODES];
	int node, load, nr_nodes = 0;
	nodemask_t used_mask = NODE_MASK_NONE;
	int local_node, prev_node;

	/* NUMA-aware ordering of nodes */
	local_node = pgdat->node_id;
	/* Penalty weight; starts at nr_online_nodes and decays per node. */
	load = nr_online_nodes;
	prev_node = local_node;

	memset(node_order, 0, sizeof(node_order));
	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
		/*
		 * We don't want to pressure a particular node.
		 * So adding penalty to the first node in same
		 * distance group to make it round-robin.
		 * (node_load[] is read back by find_next_best_node().)
		 */
		if (node_distance(local_node, node) !=
		    node_distance(local_node, prev_node))
			node_load[node] = load;

		node_order[nr_nodes++] = node;
		prev_node = node;
		load--;
	}

	/* Fallback zonelist: all nodes in the order computed above. */
	build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
	/* No-fallback (__GFP_THISNODE) zonelist: this node only. */
	build_thisnode_zonelists(pgdat);
}

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * Return node id of node used for "local" allocations.
 * I.e., first node id of first zone in arg node's generic zonelist.
 * Used for initializing percpu 'numa_mem', which is used primarily
 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
59828c2ecf20Sopenharmony_ci */ 59838c2ecf20Sopenharmony_ciint local_memory_node(int node) 59848c2ecf20Sopenharmony_ci{ 59858c2ecf20Sopenharmony_ci struct zoneref *z; 59868c2ecf20Sopenharmony_ci 59878c2ecf20Sopenharmony_ci z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 59888c2ecf20Sopenharmony_ci gfp_zone(GFP_KERNEL), 59898c2ecf20Sopenharmony_ci NULL); 59908c2ecf20Sopenharmony_ci return zone_to_nid(z->zone); 59918c2ecf20Sopenharmony_ci} 59928c2ecf20Sopenharmony_ci#endif 59938c2ecf20Sopenharmony_ci 59948c2ecf20Sopenharmony_cistatic void setup_min_unmapped_ratio(void); 59958c2ecf20Sopenharmony_cistatic void setup_min_slab_ratio(void); 59968c2ecf20Sopenharmony_ci#else /* CONFIG_NUMA */ 59978c2ecf20Sopenharmony_ci 59988c2ecf20Sopenharmony_cistatic void build_zonelists(pg_data_t *pgdat) 59998c2ecf20Sopenharmony_ci{ 60008c2ecf20Sopenharmony_ci int node, local_node; 60018c2ecf20Sopenharmony_ci struct zoneref *zonerefs; 60028c2ecf20Sopenharmony_ci int nr_zones; 60038c2ecf20Sopenharmony_ci 60048c2ecf20Sopenharmony_ci local_node = pgdat->node_id; 60058c2ecf20Sopenharmony_ci 60068c2ecf20Sopenharmony_ci zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 60078c2ecf20Sopenharmony_ci nr_zones = build_zonerefs_node(pgdat, zonerefs); 60088c2ecf20Sopenharmony_ci zonerefs += nr_zones; 60098c2ecf20Sopenharmony_ci 60108c2ecf20Sopenharmony_ci /* 60118c2ecf20Sopenharmony_ci * Now we build the zonelist so that it contains the zones 60128c2ecf20Sopenharmony_ci * of all the other nodes. 
60138c2ecf20Sopenharmony_ci * We don't want to pressure a particular node, so when 60148c2ecf20Sopenharmony_ci * building the zones for node N, we make sure that the 60158c2ecf20Sopenharmony_ci * zones coming right after the local ones are those from 60168c2ecf20Sopenharmony_ci * node N+1 (modulo N) 60178c2ecf20Sopenharmony_ci */ 60188c2ecf20Sopenharmony_ci for (node = local_node + 1; node < MAX_NUMNODES; node++) { 60198c2ecf20Sopenharmony_ci if (!node_online(node)) 60208c2ecf20Sopenharmony_ci continue; 60218c2ecf20Sopenharmony_ci nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); 60228c2ecf20Sopenharmony_ci zonerefs += nr_zones; 60238c2ecf20Sopenharmony_ci } 60248c2ecf20Sopenharmony_ci for (node = 0; node < local_node; node++) { 60258c2ecf20Sopenharmony_ci if (!node_online(node)) 60268c2ecf20Sopenharmony_ci continue; 60278c2ecf20Sopenharmony_ci nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); 60288c2ecf20Sopenharmony_ci zonerefs += nr_zones; 60298c2ecf20Sopenharmony_ci } 60308c2ecf20Sopenharmony_ci 60318c2ecf20Sopenharmony_ci zonerefs->zone = NULL; 60328c2ecf20Sopenharmony_ci zonerefs->zone_idx = 0; 60338c2ecf20Sopenharmony_ci} 60348c2ecf20Sopenharmony_ci 60358c2ecf20Sopenharmony_ci#endif /* CONFIG_NUMA */ 60368c2ecf20Sopenharmony_ci 60378c2ecf20Sopenharmony_ci/* 60388c2ecf20Sopenharmony_ci * Boot pageset table. One per cpu which is going to be used for all 60398c2ecf20Sopenharmony_ci * zones and all nodes. The parameters will be set in such a way 60408c2ecf20Sopenharmony_ci * that an item put on a list will immediately be handed over to 60418c2ecf20Sopenharmony_ci * the buddy list. This is safe since pageset manipulation is done 60428c2ecf20Sopenharmony_ci * with interrupts disabled. 60438c2ecf20Sopenharmony_ci * 60448c2ecf20Sopenharmony_ci * The boot_pagesets must be kept even after bootup is complete for 60458c2ecf20Sopenharmony_ci * unused processors and/or zones. 
They do play a role for bootstrapping
 * hotplugged processors.
 *
 * zoneinfo_show() and maybe other functions do
 * not check if the processor is online before following the pageset pointer.
 * Other parts of the kernel may not check if the zone is available.
 */
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);

/*
 * Rebuild the zonelists: for every online node, or — when @data names a
 * hot-added node that is not yet online — for that node alone.  Runs with
 * IRQs off, printk deferred, and zonelist_update_seq write-locked; the
 * comments below explain why each step is required.
 */
static void __build_all_zonelists(void *data)
{
	int nid;
	int __maybe_unused cpu;
	pg_data_t *self = data;	/* non-NULL only for a single hot-added node */
	unsigned long flags;

	/*
	 * Explicitly disable this CPU's interrupts before taking seqlock
	 * to prevent any IRQ handler from calling into the page allocator
	 * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
	 */
	local_irq_save(flags);
	/*
	 * Explicitly disable this CPU's synchronous printk() before taking
	 * seqlock to prevent any printk() from trying to hold port->lock, for
	 * tty_insert_flip_string_and_push_buffer() on other CPU might be
	 * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
	 */
	printk_deferred_enter();
	write_seqlock(&zonelist_update_seq);

#ifdef CONFIG_NUMA
	/* Forget the previous fallback penalties before reordering nodes. */
	memset(node_load, 0, sizeof(node_load));
#endif

	/*
	 * This node is hotadded and no memory is yet present.   So just
	 * building zonelists is fine - no need to touch other nodes.
	 */
	if (self && !node_online(self->node_id)) {
		build_zonelists(self);
	} else {
		for_each_online_node(nid) {
			pg_data_t *pgdat = NODE_DATA(nid);

			build_zonelists(pgdat);
		}

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
		/*
		 * We now know the "local memory node" for each node--
		 * i.e., the node of the first zone in the generic zonelist.
		 * Set up numa_mem percpu variable for on-line cpus.  During
		 * boot, only the boot cpu should be on-line;  we'll init the
		 * secondary cpus' numa_mem as they come on-line.  During
		 * node/memory hotplug, we'll fixup all on-line cpus.
		 */
		for_each_online_cpu(cpu)
			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
#endif
	}

	/* Release in reverse order of acquisition. */
	write_sequnlock(&zonelist_update_seq);
	printk_deferred_exit();
	local_irq_restore(flags);
}

/*
 * Boot-time-only (__init) setup: build all zonelists and initialize the
 * per-cpu boot pagesets used until the real pagesets can be allocated.
 */
static noinline void __init
build_all_zonelists_init(void)
{
	int cpu;

	__build_all_zonelists(NULL);

	/*
	 * Initialize the boot_pagesets that are going to be used
	 * for bootstrapping processors. The real pagesets for
	 * each zone will be allocated later when the per cpu
	 * allocator is available.
	 *
	 * boot_pagesets are used also for bootstrapping offline
	 * cpus if the system is already booted because the pagesets
	 * are needed to initialize allocators on a specific cpu too.
	 * F.e. the percpu allocator needs the page allocator which
	 * needs the percpu allocator in order to allocate its pagesets
	 * (a chicken-egg dilemma).
	 */
	for_each_possible_cpu(cpu)
		setup_pageset(&per_cpu(boot_pageset, cpu), 0);

	mminit_verify_zonelist();
	cpuset_init_current_mems_allowed();
}

/*
 * Build (at boot) or rebuild (afterwards, e.g. on hotplug) all zonelists;
 * the __init path is taken only while system_state == SYSTEM_BOOTING.
 *
 * __ref due to call of __init annotated helper build_all_zonelists_init
 * [protected by SYSTEM_BOOTING].
 */
void __ref build_all_zonelists(pg_data_t *pgdat)
{
	unsigned long vm_total_pages;

	if (system_state == SYSTEM_BOOTING) {
		build_all_zonelists_init();
	} else {
		__build_all_zonelists(pgdat);
		/* cpuset refresh routine should be here */
	}
	/* Get the number of free pages beyond high watermark in all zones. */
	vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
	/*
	 * Disable grouping by mobility if the number of pages in the
	 * system is too low to allow the mechanism to work. It would be
	 * more accurate, but expensive to check per-zone. This check is
	 * made on memory-hotadd so a system can start with mobility
	 * disabled and enable it later
	 */
	if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
		page_group_by_mobility_disabled = 1;
	else
		page_group_by_mobility_disabled = 0;

	pr_info("Built %u zonelists, mobility grouping %s.  Total pages: %ld\n",
		nr_online_nodes,
		page_group_by_mobility_disabled ? "off" : "on",
		vm_total_pages);
#ifdef CONFIG_NUMA
	pr_info("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}

/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
static bool __meminit
overlap_memmap_init(unsigned long zone, unsigned long *pfn)
{
	/*
	 * Region cursor cached across calls; re-walked from the start only
	 * when *pfn has moved past the cached region's end.
	 */
	static struct memblock_region *r;

	if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
		if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
			for_each_mem_region(r) {
				if (*pfn < memblock_region_memory_end_pfn(r))
					break;
			}
		}
		/* Mirrored region: skip *pfn past it and report the overlap. */
		if (*pfn >= memblock_region_memory_base_pfn(r) &&
		    memblock_is_mirror(r)) {
			*pfn = memblock_region_memory_end_pfn(r);
			return true;
		}
	}
	return false;
}

/*
 * Initially all pages are reserved - free ones are freed
 * up by memblock_free_all() once the early boot process is
 * done. Non-atomic initialization, single-pass.
 *
 * All aligned pageblocks are initialized to the specified migratetype
 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
 * zone stats (e.g., nr_isolate_pageblock) are touched.
 */
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
		unsigned long start_pfn, unsigned long zone_end_pfn,
		enum meminit_context context,
		struct vmem_altmap *altmap, int migratetype)
{
	unsigned long pfn, end_pfn = start_pfn + size;
	struct page *page;

	if (highest_memmap_pfn < end_pfn - 1)
		highest_memmap_pfn = end_pfn - 1;

#ifdef CONFIG_ZONE_DEVICE
	/*
	 * Honor reservation requested by the driver for this ZONE_DEVICE
	 * memory. We limit the total number of pages to initialize to just
	 * those that might contain the memory mapping. We will defer the
	 * ZONE_DEVICE page initialization until after we have released
	 * the hotplug lock.
	 */
	if (zone == ZONE_DEVICE) {
		if (!altmap)
			return;

		if (start_pfn == altmap->base_pfn)
			start_pfn += altmap->reserve;
		end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
	}
#endif

	for (pfn = start_pfn; pfn < end_pfn; ) {
		/*
		 * There can be holes in boot-time mem_map[]s handed to this
		 * function.  They do not exist on hotplugged memory.
		 */
		if (context == MEMINIT_EARLY) {
			/* Mirrored-overlap skip advances pfn itself. */
			if (overlap_memmap_init(zone, &pfn))
				continue;
			/* Remaining init may be deferred to a kthread. */
			if (defer_init(nid, pfn, zone_end_pfn))
				break;
		}

		page = pfn_to_page(pfn);
		__init_single_page(page, pfn, zone, nid);
		if (context == MEMINIT_HOTPLUG)
			__SetPageReserved(page);

		/*
		 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
		 * such that unmovable allocations won't be scattered all
		 * over the place during system boot.
		 */
		if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
			set_pageblock_migratetype(page, migratetype);
			cond_resched();
		}
		pfn++;
	}
}

#ifdef CONFIG_ZONE_DEVICE
/*
 * Initialize the memmap for the ZONE_DEVICE range [start_pfn, start_pfn +
 * nr_pages): each page is reserved, linked to @pgmap, and aligned
 * pageblocks are marked MIGRATE_MOVABLE.
 */
void __ref memmap_init_zone_device(struct zone *zone,
				   unsigned long start_pfn,
				   unsigned long nr_pages,
				   struct dev_pagemap *pgmap)
{
	unsigned long pfn, end_pfn = start_pfn + nr_pages;
	struct pglist_data *pgdat = zone->zone_pgdat;
	struct vmem_altmap *altmap = pgmap_altmap(pgmap);
	unsigned long zone_idx = zone_idx(zone);
	unsigned long start = jiffies;
	int nid = pgdat->node_id;

	if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
		return;

	/*
	 * The call to memmap_init should have already taken care
	 * of the pages reserved for the memmap, so we can just jump to
	 * the end of that region and start processing the device pages.
	 */
	if (altmap) {
		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
		nr_pages = end_pfn - start_pfn;
	}

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__init_single_page(page, pfn, zone_idx, nid);

		/*
		 * Mark page reserved as it will need to wait for onlining
		 * phase for it to be fully associated with a zone.
		 *
		 * We can use the non-atomic __set_bit operation for setting
		 * the flag as we are still initializing the pages.
		 */
		__SetPageReserved(page);

		/*
		 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
		 * and zone_device_data.  It is a bug if a ZONE_DEVICE page is
		 * ever freed or placed on a driver-private list.
		 */
		page->pgmap = pgmap;
		page->zone_device_data = NULL;

		/*
		 * Mark the block movable so that blocks are reserved for
		 * movable at startup. This will force kernel allocations
		 * to reserve their blocks rather than leaking throughout
		 * the address space during boot when many long-lived
		 * kernel allocations are made.
		 *
		 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
		 * because this is done early in section_activate()
		 */
		if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
			cond_resched();
		}
	}

	pr_info("%s initialised %lu pages in %ums\n", __func__,
		nr_pages, jiffies_to_msecs(jiffies - start));
}

#endif
/* Empty every buddy free list and zero the free-page counts for @zone. */
static void __meminit zone_init_free_lists(struct zone *zone)
{
	unsigned int order, t;
	for_each_migratetype_order(order, t) {
		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
		zone->free_area[order].nr_free = 0;
	}
}

#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
/*
 * Only struct pages that correspond to ranges defined by memblock.memory
 * are zeroed and initialized by going through __init_single_page() during
 * memmap_init_zone_range().
 *
 * But, there could be struct pages that correspond to holes in
 * memblock.memory.
This can happen because of the following reasons: 63568c2ecf20Sopenharmony_ci * - physical memory bank size is not necessarily the exact multiple of the 63578c2ecf20Sopenharmony_ci * arbitrary section size 63588c2ecf20Sopenharmony_ci * - early reserved memory may not be listed in memblock.memory 63598c2ecf20Sopenharmony_ci * - memory layouts defined with memmap= kernel parameter may not align 63608c2ecf20Sopenharmony_ci * nicely with memmap sections 63618c2ecf20Sopenharmony_ci * 63628c2ecf20Sopenharmony_ci * Explicitly initialize those struct pages so that: 63638c2ecf20Sopenharmony_ci * - PG_Reserved is set 63648c2ecf20Sopenharmony_ci * - zone and node links point to zone and node that span the page if the 63658c2ecf20Sopenharmony_ci * hole is in the middle of a zone 63668c2ecf20Sopenharmony_ci * - zone and node links point to adjacent zone/node if the hole falls on 63678c2ecf20Sopenharmony_ci * the zone boundary; the pages in such holes will be prepended to the 63688c2ecf20Sopenharmony_ci * zone/node above the hole except for the trailing pages in the last 63698c2ecf20Sopenharmony_ci * section that will be appended to the zone/node below. 
 */
/*
 * Initialize (and mark PG_reserved) the struct pages for hole pfns in
 * [spfn, epfn); see the comment above for why such holes exist.
 */
static void __init init_unavailable_range(unsigned long spfn,
					  unsigned long epfn,
					  int zone, int node)
{
	unsigned long pfn;
	u64 pgcnt = 0;

	for (pfn = spfn; pfn < epfn; pfn++) {
		/* Whole pageblock has no memmap: skip it in one step. */
		if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
			pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
				+ pageblock_nr_pages - 1;
			continue;
		}
		__init_single_page(pfn_to_page(pfn), pfn, zone, node);
		__SetPageReserved(pfn_to_page(pfn));
		pgcnt++;
	}

	if (pgcnt)
		pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
			node, zone_names[zone], pgcnt);
}
#else
/* With a flat node memmap there are no such holes: nothing to do. */
static inline void init_unavailable_range(unsigned long spfn,
					  unsigned long epfn,
					  int zone, int node)
{
}
#endif

/*
 * Initialize the part of @zone's memmap intersecting [start_pfn, end_pfn),
 * plus the hole preceding it (pfns in [*hole_pfn, clamped start)).
 * *hole_pfn is advanced to the clamped end for the next call.
 */
static void __init memmap_init_zone_range(struct zone *zone,
					  unsigned long start_pfn,
					  unsigned long end_pfn,
					  unsigned long *hole_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
	int nid = zone_to_nid(zone), zone_id = zone_idx(zone);

	start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
	end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);

	if (start_pfn >= end_pfn)
		return;

	memmap_init_zone(end_pfn - start_pfn, nid, zone_id, start_pfn,
			  zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);

	if (*hole_pfn < start_pfn)
		init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);

	*hole_pfn = end_pfn;
}

/*
 * Initialize the memmap of every populated zone intersecting each
 * memblock.memory range, tracking inter-range holes via hole_pfn so
 * they are initialized as unavailable.
 */
void __init __weak memmap_init(void)
{
	unsigned long start_pfn, end_pfn;
	unsigned long hole_pfn = 0;
	int i, j, zone_id, nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		struct pglist_data *node = NODE_DATA(nid);

		for (j = 0; j < MAX_NR_ZONES; j++) {
			struct zone *zone = node->node_zones + j;

			if (!populated_zone(zone))
				continue;

			memmap_init_zone_range(zone, start_pfn, end_pfn,
					       &hole_pfn);
			/* Remember the last populated zone for the tail hole. */
			zone_id = j;
		}
	}

#ifdef CONFIG_SPARSEMEM
	/*
	 * Initialize the memory map for hole in the range [memory_end,
	 * section_end].
	 * Append the pages in this hole to the highest zone in the last
	 * node.
	 * The call to init_unavailable_range() is outside the ifdef to
	 * silence the compiler warning about zone_id set but not used;
	 * for FLATMEM it is a nop anyway
	 */
	end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
	if (hole_pfn < end_pfn)
#endif
		init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}

/* A stub for backwards compatibility with custom implementation on IA-64 */
void __meminit __weak arch_memmap_init(unsigned long size, int nid,
				       unsigned long zone,
				       unsigned long range_start_pfn)
{
}

/* Compute the per-cpu pageset batch size for @zone (0 on NOMMU). */
static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
	int batch;

	/*
	 * The per-cpu-pages pools are set to around 1000th of the
	 * size of the zone.
	 */
	batch = zone_managed_pages(zone) / 1024;
	/* But no more than a meg. */
	if (batch * PAGE_SIZE > 1024 * 1024)
		batch = (1024 * 1024) / PAGE_SIZE;
	batch /= 4;		/* We effectively *= 4 below */
	if (batch < 1)
		batch = 1;

	/*
	 * Clamp the batch to a 2^n - 1 value. Having a power
	 * of 2 value was found to be more likely to have
	 * suboptimal cache aliasing properties in some cases.
	 *
	 * For example if 2 tasks are alternately allocating
	 * batches of pages, one task can end up with a lot
	 * of pages of one half of the possible page colors
	 * and the other with pages of the other colors.
	 */
	batch = rounddown_pow_of_two(batch + batch/2) - 1;

	return batch;

#else
	/* The deferral and batching of frees should be suppressed under NOMMU
	 * conditions.
	 *
	 * The problem is that NOMMU needs to be able to allocate large chunks
	 * of contiguous memory as there's no hardware page translation to
	 * assemble apparent contiguous memory from discontiguous pages.
	 *
	 * Queueing large contiguous runs of pages for batching, however,
	 * causes the pages to actually be freed in smaller chunks.  As there
	 * can be a significant delay between the individual batches being
	 * recycled, this leads to the once large chunks of space being
	 * fragmented and becoming unavailable for high-order allocations.
	 */
	return 0;
#endif
}

/*
 * pcp->high and pcp->batch values are related and dependent on one another:
 * ->batch must never be higher then ->high.
 * The following function updates them in a safe manner without read side
 * locking.
 *
 * Any new users of pcp->batch and pcp->high should ensure they can cope with
 * those fields changing asynchronously (according to the above rule).
 *
 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
 * outside of boot time (or some other assurance that no concurrent updaters
 * exist).
65308c2ecf20Sopenharmony_ci */ 65318c2ecf20Sopenharmony_cistatic void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 65328c2ecf20Sopenharmony_ci unsigned long batch) 65338c2ecf20Sopenharmony_ci{ 65348c2ecf20Sopenharmony_ci /* start with a fail safe value for batch */ 65358c2ecf20Sopenharmony_ci pcp->batch = 1; 65368c2ecf20Sopenharmony_ci smp_wmb(); 65378c2ecf20Sopenharmony_ci 65388c2ecf20Sopenharmony_ci /* Update high, then batch, in order */ 65398c2ecf20Sopenharmony_ci pcp->high = high; 65408c2ecf20Sopenharmony_ci smp_wmb(); 65418c2ecf20Sopenharmony_ci 65428c2ecf20Sopenharmony_ci pcp->batch = batch; 65438c2ecf20Sopenharmony_ci} 65448c2ecf20Sopenharmony_ci 65458c2ecf20Sopenharmony_ci/* a companion to pageset_set_high() */ 65468c2ecf20Sopenharmony_cistatic void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 65478c2ecf20Sopenharmony_ci{ 65488c2ecf20Sopenharmony_ci pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 65498c2ecf20Sopenharmony_ci} 65508c2ecf20Sopenharmony_ci 65518c2ecf20Sopenharmony_cistatic void pageset_init(struct per_cpu_pageset *p) 65528c2ecf20Sopenharmony_ci{ 65538c2ecf20Sopenharmony_ci struct per_cpu_pages *pcp; 65548c2ecf20Sopenharmony_ci int migratetype; 65558c2ecf20Sopenharmony_ci 65568c2ecf20Sopenharmony_ci memset(p, 0, sizeof(*p)); 65578c2ecf20Sopenharmony_ci 65588c2ecf20Sopenharmony_ci pcp = &p->pcp; 65598c2ecf20Sopenharmony_ci for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 65608c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&pcp->lists[migratetype]); 65618c2ecf20Sopenharmony_ci} 65628c2ecf20Sopenharmony_ci 65638c2ecf20Sopenharmony_cistatic void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 65648c2ecf20Sopenharmony_ci{ 65658c2ecf20Sopenharmony_ci pageset_init(p); 65668c2ecf20Sopenharmony_ci pageset_set_batch(p, batch); 65678c2ecf20Sopenharmony_ci} 65688c2ecf20Sopenharmony_ci 65698c2ecf20Sopenharmony_ci/* 65708c2ecf20Sopenharmony_ci * pageset_set_high() sets the high water 
mark for hot per_cpu_pagelist 65718c2ecf20Sopenharmony_ci * to the value high for the pageset p. 65728c2ecf20Sopenharmony_ci */ 65738c2ecf20Sopenharmony_cistatic void pageset_set_high(struct per_cpu_pageset *p, 65748c2ecf20Sopenharmony_ci unsigned long high) 65758c2ecf20Sopenharmony_ci{ 65768c2ecf20Sopenharmony_ci unsigned long batch = max(1UL, high / 4); 65778c2ecf20Sopenharmony_ci if ((high / 4) > (PAGE_SHIFT * 8)) 65788c2ecf20Sopenharmony_ci batch = PAGE_SHIFT * 8; 65798c2ecf20Sopenharmony_ci 65808c2ecf20Sopenharmony_ci pageset_update(&p->pcp, high, batch); 65818c2ecf20Sopenharmony_ci} 65828c2ecf20Sopenharmony_ci 65838c2ecf20Sopenharmony_cistatic void pageset_set_high_and_batch(struct zone *zone, 65848c2ecf20Sopenharmony_ci struct per_cpu_pageset *pcp) 65858c2ecf20Sopenharmony_ci{ 65868c2ecf20Sopenharmony_ci if (percpu_pagelist_fraction) 65878c2ecf20Sopenharmony_ci pageset_set_high(pcp, 65888c2ecf20Sopenharmony_ci (zone_managed_pages(zone) / 65898c2ecf20Sopenharmony_ci percpu_pagelist_fraction)); 65908c2ecf20Sopenharmony_ci else 65918c2ecf20Sopenharmony_ci pageset_set_batch(pcp, zone_batchsize(zone)); 65928c2ecf20Sopenharmony_ci} 65938c2ecf20Sopenharmony_ci 65948c2ecf20Sopenharmony_cistatic void __meminit zone_pageset_init(struct zone *zone, int cpu) 65958c2ecf20Sopenharmony_ci{ 65968c2ecf20Sopenharmony_ci struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 65978c2ecf20Sopenharmony_ci 65988c2ecf20Sopenharmony_ci pageset_init(pcp); 65998c2ecf20Sopenharmony_ci pageset_set_high_and_batch(zone, pcp); 66008c2ecf20Sopenharmony_ci} 66018c2ecf20Sopenharmony_ci 66028c2ecf20Sopenharmony_civoid __meminit setup_zone_pageset(struct zone *zone) 66038c2ecf20Sopenharmony_ci{ 66048c2ecf20Sopenharmony_ci int cpu; 66058c2ecf20Sopenharmony_ci zone->pageset = alloc_percpu(struct per_cpu_pageset); 66068c2ecf20Sopenharmony_ci for_each_possible_cpu(cpu) 66078c2ecf20Sopenharmony_ci zone_pageset_init(zone, cpu); 66088c2ecf20Sopenharmony_ci} 66098c2ecf20Sopenharmony_ci 
66108c2ecf20Sopenharmony_ci/* 66118c2ecf20Sopenharmony_ci * Allocate per cpu pagesets and initialize them. 66128c2ecf20Sopenharmony_ci * Before this call only boot pagesets were available. 66138c2ecf20Sopenharmony_ci */ 66148c2ecf20Sopenharmony_civoid __init setup_per_cpu_pageset(void) 66158c2ecf20Sopenharmony_ci{ 66168c2ecf20Sopenharmony_ci struct pglist_data *pgdat; 66178c2ecf20Sopenharmony_ci struct zone *zone; 66188c2ecf20Sopenharmony_ci int __maybe_unused cpu; 66198c2ecf20Sopenharmony_ci 66208c2ecf20Sopenharmony_ci for_each_populated_zone(zone) 66218c2ecf20Sopenharmony_ci setup_zone_pageset(zone); 66228c2ecf20Sopenharmony_ci 66238c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA 66248c2ecf20Sopenharmony_ci /* 66258c2ecf20Sopenharmony_ci * Unpopulated zones continue using the boot pagesets. 66268c2ecf20Sopenharmony_ci * The numa stats for these pagesets need to be reset. 66278c2ecf20Sopenharmony_ci * Otherwise, they will end up skewing the stats of 66288c2ecf20Sopenharmony_ci * the nodes these zones are associated with. 66298c2ecf20Sopenharmony_ci */ 66308c2ecf20Sopenharmony_ci for_each_possible_cpu(cpu) { 66318c2ecf20Sopenharmony_ci struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu); 66328c2ecf20Sopenharmony_ci memset(pcp->vm_numa_stat_diff, 0, 66338c2ecf20Sopenharmony_ci sizeof(pcp->vm_numa_stat_diff)); 66348c2ecf20Sopenharmony_ci } 66358c2ecf20Sopenharmony_ci#endif 66368c2ecf20Sopenharmony_ci 66378c2ecf20Sopenharmony_ci for_each_online_pgdat(pgdat) 66388c2ecf20Sopenharmony_ci pgdat->per_cpu_nodestats = 66398c2ecf20Sopenharmony_ci alloc_percpu(struct per_cpu_nodestat); 66408c2ecf20Sopenharmony_ci} 66418c2ecf20Sopenharmony_ci 66428c2ecf20Sopenharmony_cistatic __meminit void zone_pcp_init(struct zone *zone) 66438c2ecf20Sopenharmony_ci{ 66448c2ecf20Sopenharmony_ci /* 66458c2ecf20Sopenharmony_ci * per cpu subsystem is not up at this point. 
The following code 66468c2ecf20Sopenharmony_ci * relies on the ability of the linker to provide the 66478c2ecf20Sopenharmony_ci * offset of a (static) per cpu variable into the per cpu area. 66488c2ecf20Sopenharmony_ci */ 66498c2ecf20Sopenharmony_ci zone->pageset = &boot_pageset; 66508c2ecf20Sopenharmony_ci 66518c2ecf20Sopenharmony_ci if (populated_zone(zone)) 66528c2ecf20Sopenharmony_ci printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 66538c2ecf20Sopenharmony_ci zone->name, zone->present_pages, 66548c2ecf20Sopenharmony_ci zone_batchsize(zone)); 66558c2ecf20Sopenharmony_ci} 66568c2ecf20Sopenharmony_ci 66578c2ecf20Sopenharmony_civoid __meminit init_currently_empty_zone(struct zone *zone, 66588c2ecf20Sopenharmony_ci unsigned long zone_start_pfn, 66598c2ecf20Sopenharmony_ci unsigned long size) 66608c2ecf20Sopenharmony_ci{ 66618c2ecf20Sopenharmony_ci struct pglist_data *pgdat = zone->zone_pgdat; 66628c2ecf20Sopenharmony_ci int zone_idx = zone_idx(zone) + 1; 66638c2ecf20Sopenharmony_ci 66648c2ecf20Sopenharmony_ci if (zone_idx > pgdat->nr_zones) 66658c2ecf20Sopenharmony_ci pgdat->nr_zones = zone_idx; 66668c2ecf20Sopenharmony_ci 66678c2ecf20Sopenharmony_ci zone->zone_start_pfn = zone_start_pfn; 66688c2ecf20Sopenharmony_ci 66698c2ecf20Sopenharmony_ci mminit_dprintk(MMINIT_TRACE, "memmap_init", 66708c2ecf20Sopenharmony_ci "Initialising map node %d zone %lu pfns %lu -> %lu\n", 66718c2ecf20Sopenharmony_ci pgdat->node_id, 66728c2ecf20Sopenharmony_ci (unsigned long)zone_idx(zone), 66738c2ecf20Sopenharmony_ci zone_start_pfn, (zone_start_pfn + size)); 66748c2ecf20Sopenharmony_ci 66758c2ecf20Sopenharmony_ci zone_init_free_lists(zone); 66768c2ecf20Sopenharmony_ci zone->initialized = 1; 66778c2ecf20Sopenharmony_ci} 66788c2ecf20Sopenharmony_ci 66798c2ecf20Sopenharmony_ci/** 66808c2ecf20Sopenharmony_ci * get_pfn_range_for_nid - Return the start and end page frames for a node 66818c2ecf20Sopenharmony_ci * @nid: The nid to return the range for. 
If MAX_NUMNODES, the min and max PFN are returned. 66828c2ecf20Sopenharmony_ci * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 66838c2ecf20Sopenharmony_ci * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 66848c2ecf20Sopenharmony_ci * 66858c2ecf20Sopenharmony_ci * It returns the start and end page frame of a node based on information 66868c2ecf20Sopenharmony_ci * provided by memblock_set_node(). If called for a node 66878c2ecf20Sopenharmony_ci * with no available memory, a warning is printed and the start and end 66888c2ecf20Sopenharmony_ci * PFNs will be 0. 66898c2ecf20Sopenharmony_ci */ 66908c2ecf20Sopenharmony_civoid __init get_pfn_range_for_nid(unsigned int nid, 66918c2ecf20Sopenharmony_ci unsigned long *start_pfn, unsigned long *end_pfn) 66928c2ecf20Sopenharmony_ci{ 66938c2ecf20Sopenharmony_ci unsigned long this_start_pfn, this_end_pfn; 66948c2ecf20Sopenharmony_ci int i; 66958c2ecf20Sopenharmony_ci 66968c2ecf20Sopenharmony_ci *start_pfn = -1UL; 66978c2ecf20Sopenharmony_ci *end_pfn = 0; 66988c2ecf20Sopenharmony_ci 66998c2ecf20Sopenharmony_ci for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 67008c2ecf20Sopenharmony_ci *start_pfn = min(*start_pfn, this_start_pfn); 67018c2ecf20Sopenharmony_ci *end_pfn = max(*end_pfn, this_end_pfn); 67028c2ecf20Sopenharmony_ci } 67038c2ecf20Sopenharmony_ci 67048c2ecf20Sopenharmony_ci if (*start_pfn == -1UL) 67058c2ecf20Sopenharmony_ci *start_pfn = 0; 67068c2ecf20Sopenharmony_ci} 67078c2ecf20Sopenharmony_ci 67088c2ecf20Sopenharmony_ci/* 67098c2ecf20Sopenharmony_ci * This finds a zone that can be used for ZONE_MOVABLE pages. 
The 67108c2ecf20Sopenharmony_ci * assumption is made that zones within a node are ordered in monotonic 67118c2ecf20Sopenharmony_ci * increasing memory addresses so that the "highest" populated zone is used 67128c2ecf20Sopenharmony_ci */ 67138c2ecf20Sopenharmony_cistatic void __init find_usable_zone_for_movable(void) 67148c2ecf20Sopenharmony_ci{ 67158c2ecf20Sopenharmony_ci int zone_index; 67168c2ecf20Sopenharmony_ci for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 67178c2ecf20Sopenharmony_ci if (zone_index == ZONE_MOVABLE) 67188c2ecf20Sopenharmony_ci continue; 67198c2ecf20Sopenharmony_ci 67208c2ecf20Sopenharmony_ci if (arch_zone_highest_possible_pfn[zone_index] > 67218c2ecf20Sopenharmony_ci arch_zone_lowest_possible_pfn[zone_index]) 67228c2ecf20Sopenharmony_ci break; 67238c2ecf20Sopenharmony_ci } 67248c2ecf20Sopenharmony_ci 67258c2ecf20Sopenharmony_ci VM_BUG_ON(zone_index == -1); 67268c2ecf20Sopenharmony_ci movable_zone = zone_index; 67278c2ecf20Sopenharmony_ci} 67288c2ecf20Sopenharmony_ci 67298c2ecf20Sopenharmony_ci/* 67308c2ecf20Sopenharmony_ci * The zone ranges provided by the architecture do not include ZONE_MOVABLE 67318c2ecf20Sopenharmony_ci * because it is sized independent of architecture. Unlike the other zones, 67328c2ecf20Sopenharmony_ci * the starting point for ZONE_MOVABLE is not fixed. It may be different 67338c2ecf20Sopenharmony_ci * in each node depending on the size of each node and how evenly kernelcore 67348c2ecf20Sopenharmony_ci * is distributed. This helper function adjusts the zone ranges 67358c2ecf20Sopenharmony_ci * provided by the architecture for a given node by using the end of the 67368c2ecf20Sopenharmony_ci * highest usable zone for ZONE_MOVABLE. 
This preserves the assumption that 67378c2ecf20Sopenharmony_ci * zones within a node are in order of monotonic increases memory addresses 67388c2ecf20Sopenharmony_ci */ 67398c2ecf20Sopenharmony_cistatic void __init adjust_zone_range_for_zone_movable(int nid, 67408c2ecf20Sopenharmony_ci unsigned long zone_type, 67418c2ecf20Sopenharmony_ci unsigned long node_start_pfn, 67428c2ecf20Sopenharmony_ci unsigned long node_end_pfn, 67438c2ecf20Sopenharmony_ci unsigned long *zone_start_pfn, 67448c2ecf20Sopenharmony_ci unsigned long *zone_end_pfn) 67458c2ecf20Sopenharmony_ci{ 67468c2ecf20Sopenharmony_ci /* Only adjust if ZONE_MOVABLE is on this node */ 67478c2ecf20Sopenharmony_ci if (zone_movable_pfn[nid]) { 67488c2ecf20Sopenharmony_ci /* Size ZONE_MOVABLE */ 67498c2ecf20Sopenharmony_ci if (zone_type == ZONE_MOVABLE) { 67508c2ecf20Sopenharmony_ci *zone_start_pfn = zone_movable_pfn[nid]; 67518c2ecf20Sopenharmony_ci *zone_end_pfn = min(node_end_pfn, 67528c2ecf20Sopenharmony_ci arch_zone_highest_possible_pfn[movable_zone]); 67538c2ecf20Sopenharmony_ci 67548c2ecf20Sopenharmony_ci /* Adjust for ZONE_MOVABLE starting within this range */ 67558c2ecf20Sopenharmony_ci } else if (!mirrored_kernelcore && 67568c2ecf20Sopenharmony_ci *zone_start_pfn < zone_movable_pfn[nid] && 67578c2ecf20Sopenharmony_ci *zone_end_pfn > zone_movable_pfn[nid]) { 67588c2ecf20Sopenharmony_ci *zone_end_pfn = zone_movable_pfn[nid]; 67598c2ecf20Sopenharmony_ci 67608c2ecf20Sopenharmony_ci /* Check if this whole range is within ZONE_MOVABLE */ 67618c2ecf20Sopenharmony_ci } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 67628c2ecf20Sopenharmony_ci *zone_start_pfn = *zone_end_pfn; 67638c2ecf20Sopenharmony_ci } 67648c2ecf20Sopenharmony_ci} 67658c2ecf20Sopenharmony_ci 67668c2ecf20Sopenharmony_ci/* 67678c2ecf20Sopenharmony_ci * Return the number of pages a zone spans in a node, including holes 67688c2ecf20Sopenharmony_ci * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 
67698c2ecf20Sopenharmony_ci */ 67708c2ecf20Sopenharmony_cistatic unsigned long __init zone_spanned_pages_in_node(int nid, 67718c2ecf20Sopenharmony_ci unsigned long zone_type, 67728c2ecf20Sopenharmony_ci unsigned long node_start_pfn, 67738c2ecf20Sopenharmony_ci unsigned long node_end_pfn, 67748c2ecf20Sopenharmony_ci unsigned long *zone_start_pfn, 67758c2ecf20Sopenharmony_ci unsigned long *zone_end_pfn) 67768c2ecf20Sopenharmony_ci{ 67778c2ecf20Sopenharmony_ci unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 67788c2ecf20Sopenharmony_ci unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 67798c2ecf20Sopenharmony_ci /* When hotadd a new node from cpu_up(), the node should be empty */ 67808c2ecf20Sopenharmony_ci if (!node_start_pfn && !node_end_pfn) 67818c2ecf20Sopenharmony_ci return 0; 67828c2ecf20Sopenharmony_ci 67838c2ecf20Sopenharmony_ci /* Get the start and end of the zone */ 67848c2ecf20Sopenharmony_ci *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 67858c2ecf20Sopenharmony_ci *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 67868c2ecf20Sopenharmony_ci adjust_zone_range_for_zone_movable(nid, zone_type, 67878c2ecf20Sopenharmony_ci node_start_pfn, node_end_pfn, 67888c2ecf20Sopenharmony_ci zone_start_pfn, zone_end_pfn); 67898c2ecf20Sopenharmony_ci 67908c2ecf20Sopenharmony_ci /* Check that this node has pages within the zone's required range */ 67918c2ecf20Sopenharmony_ci if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) 67928c2ecf20Sopenharmony_ci return 0; 67938c2ecf20Sopenharmony_ci 67948c2ecf20Sopenharmony_ci /* Move the zone boundaries inside the node if necessary */ 67958c2ecf20Sopenharmony_ci *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); 67968c2ecf20Sopenharmony_ci *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); 67978c2ecf20Sopenharmony_ci 67988c2ecf20Sopenharmony_ci /* Return the spanned pages */ 67998c2ecf20Sopenharmony_ci return *zone_end_pfn - *zone_start_pfn; 
68008c2ecf20Sopenharmony_ci} 68018c2ecf20Sopenharmony_ci 68028c2ecf20Sopenharmony_ci/* 68038c2ecf20Sopenharmony_ci * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 68048c2ecf20Sopenharmony_ci * then all holes in the requested range will be accounted for. 68058c2ecf20Sopenharmony_ci */ 68068c2ecf20Sopenharmony_ciunsigned long __init __absent_pages_in_range(int nid, 68078c2ecf20Sopenharmony_ci unsigned long range_start_pfn, 68088c2ecf20Sopenharmony_ci unsigned long range_end_pfn) 68098c2ecf20Sopenharmony_ci{ 68108c2ecf20Sopenharmony_ci unsigned long nr_absent = range_end_pfn - range_start_pfn; 68118c2ecf20Sopenharmony_ci unsigned long start_pfn, end_pfn; 68128c2ecf20Sopenharmony_ci int i; 68138c2ecf20Sopenharmony_ci 68148c2ecf20Sopenharmony_ci for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 68158c2ecf20Sopenharmony_ci start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 68168c2ecf20Sopenharmony_ci end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 68178c2ecf20Sopenharmony_ci nr_absent -= end_pfn - start_pfn; 68188c2ecf20Sopenharmony_ci } 68198c2ecf20Sopenharmony_ci return nr_absent; 68208c2ecf20Sopenharmony_ci} 68218c2ecf20Sopenharmony_ci 68228c2ecf20Sopenharmony_ci/** 68238c2ecf20Sopenharmony_ci * absent_pages_in_range - Return number of page frames in holes within a range 68248c2ecf20Sopenharmony_ci * @start_pfn: The start PFN to start searching for holes 68258c2ecf20Sopenharmony_ci * @end_pfn: The end PFN to stop searching for holes 68268c2ecf20Sopenharmony_ci * 68278c2ecf20Sopenharmony_ci * Return: the number of pages frames in memory holes within a range. 
68288c2ecf20Sopenharmony_ci */ 68298c2ecf20Sopenharmony_ciunsigned long __init absent_pages_in_range(unsigned long start_pfn, 68308c2ecf20Sopenharmony_ci unsigned long end_pfn) 68318c2ecf20Sopenharmony_ci{ 68328c2ecf20Sopenharmony_ci return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 68338c2ecf20Sopenharmony_ci} 68348c2ecf20Sopenharmony_ci 68358c2ecf20Sopenharmony_ci/* Return the number of page frames in holes in a zone on a node */ 68368c2ecf20Sopenharmony_cistatic unsigned long __init zone_absent_pages_in_node(int nid, 68378c2ecf20Sopenharmony_ci unsigned long zone_type, 68388c2ecf20Sopenharmony_ci unsigned long node_start_pfn, 68398c2ecf20Sopenharmony_ci unsigned long node_end_pfn) 68408c2ecf20Sopenharmony_ci{ 68418c2ecf20Sopenharmony_ci unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 68428c2ecf20Sopenharmony_ci unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 68438c2ecf20Sopenharmony_ci unsigned long zone_start_pfn, zone_end_pfn; 68448c2ecf20Sopenharmony_ci unsigned long nr_absent; 68458c2ecf20Sopenharmony_ci 68468c2ecf20Sopenharmony_ci /* When hotadd a new node from cpu_up(), the node should be empty */ 68478c2ecf20Sopenharmony_ci if (!node_start_pfn && !node_end_pfn) 68488c2ecf20Sopenharmony_ci return 0; 68498c2ecf20Sopenharmony_ci 68508c2ecf20Sopenharmony_ci zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 68518c2ecf20Sopenharmony_ci zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 68528c2ecf20Sopenharmony_ci 68538c2ecf20Sopenharmony_ci adjust_zone_range_for_zone_movable(nid, zone_type, 68548c2ecf20Sopenharmony_ci node_start_pfn, node_end_pfn, 68558c2ecf20Sopenharmony_ci &zone_start_pfn, &zone_end_pfn); 68568c2ecf20Sopenharmony_ci nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 68578c2ecf20Sopenharmony_ci 68588c2ecf20Sopenharmony_ci /* 68598c2ecf20Sopenharmony_ci * ZONE_MOVABLE handling. 
68608c2ecf20Sopenharmony_ci * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages 68618c2ecf20Sopenharmony_ci * and vice versa. 68628c2ecf20Sopenharmony_ci */ 68638c2ecf20Sopenharmony_ci if (mirrored_kernelcore && zone_movable_pfn[nid]) { 68648c2ecf20Sopenharmony_ci unsigned long start_pfn, end_pfn; 68658c2ecf20Sopenharmony_ci struct memblock_region *r; 68668c2ecf20Sopenharmony_ci 68678c2ecf20Sopenharmony_ci for_each_mem_region(r) { 68688c2ecf20Sopenharmony_ci start_pfn = clamp(memblock_region_memory_base_pfn(r), 68698c2ecf20Sopenharmony_ci zone_start_pfn, zone_end_pfn); 68708c2ecf20Sopenharmony_ci end_pfn = clamp(memblock_region_memory_end_pfn(r), 68718c2ecf20Sopenharmony_ci zone_start_pfn, zone_end_pfn); 68728c2ecf20Sopenharmony_ci 68738c2ecf20Sopenharmony_ci if (zone_type == ZONE_MOVABLE && 68748c2ecf20Sopenharmony_ci memblock_is_mirror(r)) 68758c2ecf20Sopenharmony_ci nr_absent += end_pfn - start_pfn; 68768c2ecf20Sopenharmony_ci 68778c2ecf20Sopenharmony_ci if (zone_type == ZONE_NORMAL && 68788c2ecf20Sopenharmony_ci !memblock_is_mirror(r)) 68798c2ecf20Sopenharmony_ci nr_absent += end_pfn - start_pfn; 68808c2ecf20Sopenharmony_ci } 68818c2ecf20Sopenharmony_ci } 68828c2ecf20Sopenharmony_ci 68838c2ecf20Sopenharmony_ci return nr_absent; 68848c2ecf20Sopenharmony_ci} 68858c2ecf20Sopenharmony_ci 68868c2ecf20Sopenharmony_cistatic void __init calculate_node_totalpages(struct pglist_data *pgdat, 68878c2ecf20Sopenharmony_ci unsigned long node_start_pfn, 68888c2ecf20Sopenharmony_ci unsigned long node_end_pfn) 68898c2ecf20Sopenharmony_ci{ 68908c2ecf20Sopenharmony_ci unsigned long realtotalpages = 0, totalpages = 0; 68918c2ecf20Sopenharmony_ci enum zone_type i; 68928c2ecf20Sopenharmony_ci 68938c2ecf20Sopenharmony_ci for (i = 0; i < MAX_NR_ZONES; i++) { 68948c2ecf20Sopenharmony_ci struct zone *zone = pgdat->node_zones + i; 68958c2ecf20Sopenharmony_ci unsigned long zone_start_pfn, zone_end_pfn; 68968c2ecf20Sopenharmony_ci unsigned long spanned, absent; 
68978c2ecf20Sopenharmony_ci unsigned long size, real_size; 68988c2ecf20Sopenharmony_ci 68998c2ecf20Sopenharmony_ci spanned = zone_spanned_pages_in_node(pgdat->node_id, i, 69008c2ecf20Sopenharmony_ci node_start_pfn, 69018c2ecf20Sopenharmony_ci node_end_pfn, 69028c2ecf20Sopenharmony_ci &zone_start_pfn, 69038c2ecf20Sopenharmony_ci &zone_end_pfn); 69048c2ecf20Sopenharmony_ci absent = zone_absent_pages_in_node(pgdat->node_id, i, 69058c2ecf20Sopenharmony_ci node_start_pfn, 69068c2ecf20Sopenharmony_ci node_end_pfn); 69078c2ecf20Sopenharmony_ci 69088c2ecf20Sopenharmony_ci size = spanned; 69098c2ecf20Sopenharmony_ci real_size = size - absent; 69108c2ecf20Sopenharmony_ci 69118c2ecf20Sopenharmony_ci if (size) 69128c2ecf20Sopenharmony_ci zone->zone_start_pfn = zone_start_pfn; 69138c2ecf20Sopenharmony_ci else 69148c2ecf20Sopenharmony_ci zone->zone_start_pfn = 0; 69158c2ecf20Sopenharmony_ci zone->spanned_pages = size; 69168c2ecf20Sopenharmony_ci zone->present_pages = real_size; 69178c2ecf20Sopenharmony_ci 69188c2ecf20Sopenharmony_ci totalpages += size; 69198c2ecf20Sopenharmony_ci realtotalpages += real_size; 69208c2ecf20Sopenharmony_ci } 69218c2ecf20Sopenharmony_ci 69228c2ecf20Sopenharmony_ci pgdat->node_spanned_pages = totalpages; 69238c2ecf20Sopenharmony_ci pgdat->node_present_pages = realtotalpages; 69248c2ecf20Sopenharmony_ci printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 69258c2ecf20Sopenharmony_ci realtotalpages); 69268c2ecf20Sopenharmony_ci} 69278c2ecf20Sopenharmony_ci 69288c2ecf20Sopenharmony_ci#ifndef CONFIG_SPARSEMEM 69298c2ecf20Sopenharmony_ci/* 69308c2ecf20Sopenharmony_ci * Calculate the size of the zone->blockflags rounded to an unsigned long 69318c2ecf20Sopenharmony_ci * Start by making sure zonesize is a multiple of pageblock_order by rounding 69328c2ecf20Sopenharmony_ci * up. 
Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 69338c2ecf20Sopenharmony_ci * round what is now in bits to nearest long in bits, then return it in 69348c2ecf20Sopenharmony_ci * bytes. 69358c2ecf20Sopenharmony_ci */ 69368c2ecf20Sopenharmony_cistatic unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 69378c2ecf20Sopenharmony_ci{ 69388c2ecf20Sopenharmony_ci unsigned long usemapsize; 69398c2ecf20Sopenharmony_ci 69408c2ecf20Sopenharmony_ci zonesize += zone_start_pfn & (pageblock_nr_pages-1); 69418c2ecf20Sopenharmony_ci usemapsize = roundup(zonesize, pageblock_nr_pages); 69428c2ecf20Sopenharmony_ci usemapsize = usemapsize >> pageblock_order; 69438c2ecf20Sopenharmony_ci usemapsize *= NR_PAGEBLOCK_BITS; 69448c2ecf20Sopenharmony_ci usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 69458c2ecf20Sopenharmony_ci 69468c2ecf20Sopenharmony_ci return usemapsize / 8; 69478c2ecf20Sopenharmony_ci} 69488c2ecf20Sopenharmony_ci 69498c2ecf20Sopenharmony_cistatic void __ref setup_usemap(struct pglist_data *pgdat, 69508c2ecf20Sopenharmony_ci struct zone *zone, 69518c2ecf20Sopenharmony_ci unsigned long zone_start_pfn, 69528c2ecf20Sopenharmony_ci unsigned long zonesize) 69538c2ecf20Sopenharmony_ci{ 69548c2ecf20Sopenharmony_ci unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 69558c2ecf20Sopenharmony_ci zone->pageblock_flags = NULL; 69568c2ecf20Sopenharmony_ci if (usemapsize) { 69578c2ecf20Sopenharmony_ci zone->pageblock_flags = 69588c2ecf20Sopenharmony_ci memblock_alloc_node(usemapsize, SMP_CACHE_BYTES, 69598c2ecf20Sopenharmony_ci pgdat->node_id); 69608c2ecf20Sopenharmony_ci if (!zone->pageblock_flags) 69618c2ecf20Sopenharmony_ci panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", 69628c2ecf20Sopenharmony_ci usemapsize, zone->name, pgdat->node_id); 69638c2ecf20Sopenharmony_ci } 69648c2ecf20Sopenharmony_ci} 69658c2ecf20Sopenharmony_ci#else 69668c2ecf20Sopenharmony_cistatic inline void 
setup_usemap(struct pglist_data *pgdat, struct zone *zone, 69678c2ecf20Sopenharmony_ci unsigned long zone_start_pfn, unsigned long zonesize) {} 69688c2ecf20Sopenharmony_ci#endif /* CONFIG_SPARSEMEM */ 69698c2ecf20Sopenharmony_ci 69708c2ecf20Sopenharmony_ci#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 69718c2ecf20Sopenharmony_ci 69728c2ecf20Sopenharmony_ci/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 69738c2ecf20Sopenharmony_civoid __init set_pageblock_order(void) 69748c2ecf20Sopenharmony_ci{ 69758c2ecf20Sopenharmony_ci unsigned int order; 69768c2ecf20Sopenharmony_ci 69778c2ecf20Sopenharmony_ci /* Check that pageblock_nr_pages has not already been setup */ 69788c2ecf20Sopenharmony_ci if (pageblock_order) 69798c2ecf20Sopenharmony_ci return; 69808c2ecf20Sopenharmony_ci 69818c2ecf20Sopenharmony_ci if (HPAGE_SHIFT > PAGE_SHIFT) 69828c2ecf20Sopenharmony_ci order = HUGETLB_PAGE_ORDER; 69838c2ecf20Sopenharmony_ci else 69848c2ecf20Sopenharmony_ci order = MAX_ORDER - 1; 69858c2ecf20Sopenharmony_ci 69868c2ecf20Sopenharmony_ci /* 69878c2ecf20Sopenharmony_ci * Assume the largest contiguous order of interest is a huge page. 69888c2ecf20Sopenharmony_ci * This value may be variable depending on boot parameters on IA64 and 69898c2ecf20Sopenharmony_ci * powerpc. 69908c2ecf20Sopenharmony_ci */ 69918c2ecf20Sopenharmony_ci pageblock_order = order; 69928c2ecf20Sopenharmony_ci} 69938c2ecf20Sopenharmony_ci#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 69948c2ecf20Sopenharmony_ci 69958c2ecf20Sopenharmony_ci/* 69968c2ecf20Sopenharmony_ci * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 69978c2ecf20Sopenharmony_ci * is unused as pageblock_order is set at compile-time. 
See 69988c2ecf20Sopenharmony_ci * include/linux/pageblock-flags.h for the values of pageblock_order based on 69998c2ecf20Sopenharmony_ci * the kernel config 70008c2ecf20Sopenharmony_ci */ 70018c2ecf20Sopenharmony_civoid __init set_pageblock_order(void) 70028c2ecf20Sopenharmony_ci{ 70038c2ecf20Sopenharmony_ci} 70048c2ecf20Sopenharmony_ci 70058c2ecf20Sopenharmony_ci#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 70068c2ecf20Sopenharmony_ci 70078c2ecf20Sopenharmony_cistatic unsigned long __init calc_memmap_size(unsigned long spanned_pages, 70088c2ecf20Sopenharmony_ci unsigned long present_pages) 70098c2ecf20Sopenharmony_ci{ 70108c2ecf20Sopenharmony_ci unsigned long pages = spanned_pages; 70118c2ecf20Sopenharmony_ci 70128c2ecf20Sopenharmony_ci /* 70138c2ecf20Sopenharmony_ci * Provide a more accurate estimation if there are holes within 70148c2ecf20Sopenharmony_ci * the zone and SPARSEMEM is in use. If there are holes within the 70158c2ecf20Sopenharmony_ci * zone, each populated memory region may cost us one or two extra 70168c2ecf20Sopenharmony_ci * memmap pages due to alignment because memmap pages for each 70178c2ecf20Sopenharmony_ci * populated regions may not be naturally aligned on page boundary. 70188c2ecf20Sopenharmony_ci * So the (present_pages >> 4) heuristic is a tradeoff for that. 
70198c2ecf20Sopenharmony_ci */ 70208c2ecf20Sopenharmony_ci if (spanned_pages > present_pages + (present_pages >> 4) && 70218c2ecf20Sopenharmony_ci IS_ENABLED(CONFIG_SPARSEMEM)) 70228c2ecf20Sopenharmony_ci pages = present_pages; 70238c2ecf20Sopenharmony_ci 70248c2ecf20Sopenharmony_ci return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 70258c2ecf20Sopenharmony_ci} 70268c2ecf20Sopenharmony_ci 70278c2ecf20Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 70288c2ecf20Sopenharmony_cistatic void pgdat_init_split_queue(struct pglist_data *pgdat) 70298c2ecf20Sopenharmony_ci{ 70308c2ecf20Sopenharmony_ci struct deferred_split *ds_queue = &pgdat->deferred_split_queue; 70318c2ecf20Sopenharmony_ci 70328c2ecf20Sopenharmony_ci spin_lock_init(&ds_queue->split_queue_lock); 70338c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&ds_queue->split_queue); 70348c2ecf20Sopenharmony_ci ds_queue->split_queue_len = 0; 70358c2ecf20Sopenharmony_ci} 70368c2ecf20Sopenharmony_ci#else 70378c2ecf20Sopenharmony_cistatic void pgdat_init_split_queue(struct pglist_data *pgdat) {} 70388c2ecf20Sopenharmony_ci#endif 70398c2ecf20Sopenharmony_ci 70408c2ecf20Sopenharmony_ci#ifdef CONFIG_COMPACTION 70418c2ecf20Sopenharmony_cistatic void pgdat_init_kcompactd(struct pglist_data *pgdat) 70428c2ecf20Sopenharmony_ci{ 70438c2ecf20Sopenharmony_ci init_waitqueue_head(&pgdat->kcompactd_wait); 70448c2ecf20Sopenharmony_ci} 70458c2ecf20Sopenharmony_ci#else 70468c2ecf20Sopenharmony_cistatic void pgdat_init_kcompactd(struct pglist_data *pgdat) {} 70478c2ecf20Sopenharmony_ci#endif 70488c2ecf20Sopenharmony_ci 70498c2ecf20Sopenharmony_cistatic void __meminit pgdat_init_internals(struct pglist_data *pgdat) 70508c2ecf20Sopenharmony_ci{ 70518c2ecf20Sopenharmony_ci pgdat_resize_init(pgdat); 70528c2ecf20Sopenharmony_ci 70538c2ecf20Sopenharmony_ci pgdat_init_split_queue(pgdat); 70548c2ecf20Sopenharmony_ci pgdat_init_kcompactd(pgdat); 70558c2ecf20Sopenharmony_ci 70568c2ecf20Sopenharmony_ci init_waitqueue_head(&pgdat->kswapd_wait); 
70578c2ecf20Sopenharmony_ci init_waitqueue_head(&pgdat->pfmemalloc_wait); 70588c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_ZSWAPD 70598c2ecf20Sopenharmony_ci init_waitqueue_head(&pgdat->zswapd_wait); 70608c2ecf20Sopenharmony_ci#endif 70618c2ecf20Sopenharmony_ci 70628c2ecf20Sopenharmony_ci pgdat_page_ext_init(pgdat); 70638c2ecf20Sopenharmony_ci spin_lock_init(&pgdat->lru_lock); 70648c2ecf20Sopenharmony_ci lruvec_init(&pgdat->__lruvec); 70658c2ecf20Sopenharmony_ci#if defined(CONFIG_HYPERHOLD_FILE_LRU) && defined(CONFIG_MEMCG) 70668c2ecf20Sopenharmony_ci pgdat->__lruvec.pgdat = pgdat; 70678c2ecf20Sopenharmony_ci#endif 70688c2ecf20Sopenharmony_ci} 70698c2ecf20Sopenharmony_ci 70708c2ecf20Sopenharmony_cistatic void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, 70718c2ecf20Sopenharmony_ci unsigned long remaining_pages) 70728c2ecf20Sopenharmony_ci{ 70738c2ecf20Sopenharmony_ci atomic_long_set(&zone->managed_pages, remaining_pages); 70748c2ecf20Sopenharmony_ci zone_set_nid(zone, nid); 70758c2ecf20Sopenharmony_ci zone->name = zone_names[idx]; 70768c2ecf20Sopenharmony_ci zone->zone_pgdat = NODE_DATA(nid); 70778c2ecf20Sopenharmony_ci spin_lock_init(&zone->lock); 70788c2ecf20Sopenharmony_ci zone_seqlock_init(zone); 70798c2ecf20Sopenharmony_ci zone_pcp_init(zone); 70808c2ecf20Sopenharmony_ci} 70818c2ecf20Sopenharmony_ci 70828c2ecf20Sopenharmony_ci/* 70838c2ecf20Sopenharmony_ci * Set up the zone data structures 70848c2ecf20Sopenharmony_ci * - init pgdat internals 70858c2ecf20Sopenharmony_ci * - init all zones belonging to this node 70868c2ecf20Sopenharmony_ci * 70878c2ecf20Sopenharmony_ci * NOTE: this function is only called during memory hotplug 70888c2ecf20Sopenharmony_ci */ 70898c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMORY_HOTPLUG 70908c2ecf20Sopenharmony_civoid __ref free_area_init_core_hotplug(int nid) 70918c2ecf20Sopenharmony_ci{ 70928c2ecf20Sopenharmony_ci enum zone_type z; 70938c2ecf20Sopenharmony_ci pg_data_t *pgdat = NODE_DATA(nid); 
	pgdat_init_internals(pgdat);
	/* Hotplugged node: all zones start with zero managed pages */
	for (z = 0; z < MAX_NR_ZONES; z++)
		zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
}
#endif

/*
 * Set up the zone data structures:
 * - mark all pages reserved
 * - mark all memory queues empty
 * - clear the memory bitmaps
 *
 * NOTE: pgdat should get zeroed by caller.
 * NOTE: this function is only called during early init.
 */
static void __init free_area_init_core(struct pglist_data *pgdat)
{
	enum zone_type j;
	int nid = pgdat->node_id;

	pgdat_init_internals(pgdat);
	pgdat->per_cpu_nodestats = &boot_nodestats;

	for (j = 0; j < MAX_NR_ZONES; j++) {
		struct zone *zone = pgdat->node_zones + j;
		unsigned long size, freesize, memmap_pages;
		unsigned long zone_start_pfn = zone->zone_start_pfn;

		/* spanned/present were filled in by calculate_node_totalpages() */
		size = zone->spanned_pages;
		freesize = zone->present_pages;

		/*
		 * Adjust freesize so that it accounts for how much memory
		 * is used by this zone for memmap. This affects the watermark
		 * and per-cpu initialisations
		 */
		memmap_pages = calc_memmap_size(size, freesize);
		if (!is_highmem_idx(j)) {
			if (freesize >= memmap_pages) {
				freesize -= memmap_pages;
				if (memmap_pages)
					printk(KERN_DEBUG
					       " %s zone: %lu pages used for memmap\n",
					       zone_names[j], memmap_pages);
			} else
				pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
					zone_names[j], memmap_pages, freesize);
		}

		/* Account for reserved pages (dma_reserve only charges zone 0) */
		if (j == 0 && freesize > dma_reserve) {
			freesize -= dma_reserve;
			printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
					zone_names[0], dma_reserve);
		}

		if (!is_highmem_idx(j))
			nr_kernel_pages += freesize;
		/* Charge for highmem memmap if there are enough kernel pages */
		else if (nr_kernel_pages > memmap_pages * 2)
			nr_kernel_pages -= memmap_pages;
		nr_all_pages += freesize;

		/*
		 * Set an approximate value for lowmem here, it will be adjusted
		 * when the bootmem allocator frees pages into the buddy system.
		 * And all highmem pages will be managed by the buddy system.
		 */
		zone_init_internals(zone, j, nid, freesize);

		/* Empty zones get their internals set up but nothing more */
		if (!size)
			continue;

		set_pageblock_order();
		setup_usemap(pgdat, zone, zone_start_pfn, size);
		init_currently_empty_zone(zone, zone_start_pfn, size);
		arch_memmap_init(size, nid, j, zone_start_pfn);
	}
}

#ifdef CONFIG_FLAT_NODE_MEM_MAP
/*
 * Allocate the flat node_mem_map (the struct page array) for a node
 * from memblock, MAX_ORDER-aligning its start so buddy merging works.
 */
static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
{
	unsigned long __maybe_unused start = 0;
	unsigned long __maybe_unused offset = 0;

	/* Skip empty nodes */
	if (!pgdat->node_spanned_pages)
		return;

	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
	offset = pgdat->node_start_pfn - start;
	/* ia64 gets its own node_mem_map, before this, without bootmem */
	if (!pgdat->node_mem_map) {
		unsigned long size, end;
		struct page *map;

		/*
		 * The zone's endpoints aren't required to be MAX_ORDER
		 * aligned but the node_mem_map endpoints must be in order
		 * for the buddy allocator to function correctly.
		 */
		end = pgdat_end_pfn(pgdat);
		end = ALIGN(end, MAX_ORDER_NR_PAGES);
		size = (end - start) * sizeof(struct page);
		map = memblock_alloc_node(size, SMP_CACHE_BYTES,
					  pgdat->node_id);
		if (!map)
			panic("Failed to allocate %ld bytes for node %d memory map\n",
			      size, pgdat->node_id);
		/* offset compensates for the MAX_ORDER round-down of start */
		pgdat->node_mem_map = map + offset;
	}
	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
				__func__, pgdat->node_id, (unsigned long)pgdat,
				(unsigned long)pgdat->node_mem_map);
#ifndef CONFIG_NEED_MULTIPLE_NODES
	/*
	 * With no DISCONTIG, the global mem_map is just set as node 0's
	 */
	if (pgdat == NODE_DATA(0)) {
		mem_map = NODE_DATA(0)->node_mem_map;
		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
			mem_map -= offset;
	}
#endif
}
#else
static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
#endif /* CONFIG_FLAT_NODE_MEM_MAP */

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
 * ULONG_MAX means "no deferred range decided yet"; the deferred struct
 * page init code lowers this to the first pfn it leaves uninitialised.
 */
static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
{
	pgdat->first_deferred_pfn = ULONG_MAX;
}
#else
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
#endif

/*
 * Early-boot initialisation of one node: determine its pfn span,
 * compute per-zone totals, allocate the flat memmap (if configured)
 * and initialise all pgdat/zone internals.
 */
static void __init free_area_init_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = 0;
	unsigned long end_pfn = 0;

	/* pg_data_t should be reset to zero when it's allocated */
	WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);

	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

	pgdat->node_id = nid;
	pgdat->node_start_pfn = start_pfn;
	pgdat->per_cpu_nodestats = NULL;

	pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
		(u64)start_pfn << PAGE_SHIFT,
		end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
	calculate_node_totalpages(pgdat, start_pfn, end_pfn);

	alloc_node_mem_map(pgdat);
	pgdat_set_deferred_range(pgdat);

	free_area_init_core(pgdat);
}

/* A node with no memory still needs its pgdat/zone structures set up. */
void __init free_area_init_memoryless_node(int nid)
{
	free_area_init_node(nid);
}

#if MAX_NUMNODES > 1
/*
 * Figure out the number of possible node ids.
72688c2ecf20Sopenharmony_ci */ 72698c2ecf20Sopenharmony_civoid __init setup_nr_node_ids(void) 72708c2ecf20Sopenharmony_ci{ 72718c2ecf20Sopenharmony_ci unsigned int highest; 72728c2ecf20Sopenharmony_ci 72738c2ecf20Sopenharmony_ci highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES); 72748c2ecf20Sopenharmony_ci nr_node_ids = highest + 1; 72758c2ecf20Sopenharmony_ci} 72768c2ecf20Sopenharmony_ci#endif 72778c2ecf20Sopenharmony_ci 72788c2ecf20Sopenharmony_ci/** 72798c2ecf20Sopenharmony_ci * node_map_pfn_alignment - determine the maximum internode alignment 72808c2ecf20Sopenharmony_ci * 72818c2ecf20Sopenharmony_ci * This function should be called after node map is populated and sorted. 72828c2ecf20Sopenharmony_ci * It calculates the maximum power of two alignment which can distinguish 72838c2ecf20Sopenharmony_ci * all the nodes. 72848c2ecf20Sopenharmony_ci * 72858c2ecf20Sopenharmony_ci * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 72868c2ecf20Sopenharmony_ci * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 72878c2ecf20Sopenharmony_ci * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 72888c2ecf20Sopenharmony_ci * shifted, 1GiB is enough and this function will indicate so. 72898c2ecf20Sopenharmony_ci * 72908c2ecf20Sopenharmony_ci * This is used to test whether pfn -> nid mapping of the chosen memory 72918c2ecf20Sopenharmony_ci * model has fine enough granularity to avoid incorrect mapping for the 72928c2ecf20Sopenharmony_ci * populated node map. 72938c2ecf20Sopenharmony_ci * 72948c2ecf20Sopenharmony_ci * Return: the determined alignment in pfn's. 0 if there is no alignment 72958c2ecf20Sopenharmony_ci * requirement (single node). 
72968c2ecf20Sopenharmony_ci */ 72978c2ecf20Sopenharmony_ciunsigned long __init node_map_pfn_alignment(void) 72988c2ecf20Sopenharmony_ci{ 72998c2ecf20Sopenharmony_ci unsigned long accl_mask = 0, last_end = 0; 73008c2ecf20Sopenharmony_ci unsigned long start, end, mask; 73018c2ecf20Sopenharmony_ci int last_nid = NUMA_NO_NODE; 73028c2ecf20Sopenharmony_ci int i, nid; 73038c2ecf20Sopenharmony_ci 73048c2ecf20Sopenharmony_ci for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 73058c2ecf20Sopenharmony_ci if (!start || last_nid < 0 || last_nid == nid) { 73068c2ecf20Sopenharmony_ci last_nid = nid; 73078c2ecf20Sopenharmony_ci last_end = end; 73088c2ecf20Sopenharmony_ci continue; 73098c2ecf20Sopenharmony_ci } 73108c2ecf20Sopenharmony_ci 73118c2ecf20Sopenharmony_ci /* 73128c2ecf20Sopenharmony_ci * Start with a mask granular enough to pin-point to the 73138c2ecf20Sopenharmony_ci * start pfn and tick off bits one-by-one until it becomes 73148c2ecf20Sopenharmony_ci * too coarse to separate the current node from the last. 73158c2ecf20Sopenharmony_ci */ 73168c2ecf20Sopenharmony_ci mask = ~((1 << __ffs(start)) - 1); 73178c2ecf20Sopenharmony_ci while (mask && last_end <= (start & (mask << 1))) 73188c2ecf20Sopenharmony_ci mask <<= 1; 73198c2ecf20Sopenharmony_ci 73208c2ecf20Sopenharmony_ci /* accumulate all internode masks */ 73218c2ecf20Sopenharmony_ci accl_mask |= mask; 73228c2ecf20Sopenharmony_ci } 73238c2ecf20Sopenharmony_ci 73248c2ecf20Sopenharmony_ci /* convert mask to number of pages */ 73258c2ecf20Sopenharmony_ci return ~accl_mask + 1; 73268c2ecf20Sopenharmony_ci} 73278c2ecf20Sopenharmony_ci 73288c2ecf20Sopenharmony_ci/** 73298c2ecf20Sopenharmony_ci * find_min_pfn_with_active_regions - Find the minimum PFN registered 73308c2ecf20Sopenharmony_ci * 73318c2ecf20Sopenharmony_ci * Return: the minimum PFN based on information provided via 73328c2ecf20Sopenharmony_ci * memblock_set_node(). 
73338c2ecf20Sopenharmony_ci */ 73348c2ecf20Sopenharmony_ciunsigned long __init find_min_pfn_with_active_regions(void) 73358c2ecf20Sopenharmony_ci{ 73368c2ecf20Sopenharmony_ci return PHYS_PFN(memblock_start_of_DRAM()); 73378c2ecf20Sopenharmony_ci} 73388c2ecf20Sopenharmony_ci 73398c2ecf20Sopenharmony_ci/* 73408c2ecf20Sopenharmony_ci * early_calculate_totalpages() 73418c2ecf20Sopenharmony_ci * Sum pages in active regions for movable zone. 73428c2ecf20Sopenharmony_ci * Populate N_MEMORY for calculating usable_nodes. 73438c2ecf20Sopenharmony_ci */ 73448c2ecf20Sopenharmony_cistatic unsigned long __init early_calculate_totalpages(void) 73458c2ecf20Sopenharmony_ci{ 73468c2ecf20Sopenharmony_ci unsigned long totalpages = 0; 73478c2ecf20Sopenharmony_ci unsigned long start_pfn, end_pfn; 73488c2ecf20Sopenharmony_ci int i, nid; 73498c2ecf20Sopenharmony_ci 73508c2ecf20Sopenharmony_ci for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 73518c2ecf20Sopenharmony_ci unsigned long pages = end_pfn - start_pfn; 73528c2ecf20Sopenharmony_ci 73538c2ecf20Sopenharmony_ci totalpages += pages; 73548c2ecf20Sopenharmony_ci if (pages) 73558c2ecf20Sopenharmony_ci node_set_state(nid, N_MEMORY); 73568c2ecf20Sopenharmony_ci } 73578c2ecf20Sopenharmony_ci return totalpages; 73588c2ecf20Sopenharmony_ci} 73598c2ecf20Sopenharmony_ci 73608c2ecf20Sopenharmony_ci/* 73618c2ecf20Sopenharmony_ci * Find the PFN the Movable zone begins in each node. Kernel memory 73628c2ecf20Sopenharmony_ci * is spread evenly between nodes as long as the nodes have enough 73638c2ecf20Sopenharmony_ci * memory. 
When they don't, some nodes will have more kernelcore than
 * others
 */
static void __init find_zone_movable_pfns_for_nodes(void)
{
	int i, nid;
	unsigned long usable_startpfn;
	unsigned long kernelcore_node, kernelcore_remaining;
	/* save the state before borrow the nodemask */
	nodemask_t saved_node_state = node_states[N_MEMORY];
	unsigned long totalpages = early_calculate_totalpages();
	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
	struct memblock_region *r;

	/* Need to find movable_zone earlier when movable_node is specified. */
	find_usable_zone_for_movable();

	/*
	 * If movable_node is specified, ignore kernelcore and movablecore
	 * options: every hotpluggable region becomes ZONE_MOVABLE from its
	 * lowest pfn on that node.
	 */
	if (movable_node_is_enabled()) {
		for_each_mem_region(r) {
			if (!memblock_is_hotpluggable(r))
				continue;

			nid = memblock_get_region_node(r);

			usable_startpfn = PFN_DOWN(r->base);
			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
				min(usable_startpfn, zone_movable_pfn[nid]) :
				usable_startpfn;
		}

		goto out2;
	}

	/*
	 * If kernelcore=mirror is specified, ignore movablecore option
	 */
	if (mirrored_kernelcore) {
		bool mem_below_4gb_not_mirrored = false;

		for_each_mem_region(r) {
			if (memblock_is_mirror(r))
				continue;

			nid = memblock_get_region_node(r);

			usable_startpfn = memblock_region_memory_base_pfn(r);

			/* 0x100000 pfns == 4GiB with 4K pages; kernel must stay mirrored below it */
			if (usable_startpfn < 0x100000) {
				mem_below_4gb_not_mirrored = true;
				continue;
			}

			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
				min(usable_startpfn, zone_movable_pfn[nid]) :
				usable_startpfn;
		}

		if (mem_below_4gb_not_mirrored)
			pr_warn("This configuration results in unmirrored kernel memory.\n");

		goto out2;
	}

	/*
	 * If kernelcore=nn% or movablecore=nn% was specified, calculate the
	 * amount of necessary memory.
	 */
	if (required_kernelcore_percent)
		required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
				       10000UL;
	if (required_movablecore_percent)
		required_movablecore = (totalpages * 100 * required_movablecore_percent) /
					10000UL;

	/*
	 * If movablecore= was specified, calculate what size of
	 * kernelcore that corresponds so that memory usable for
	 * any allocation type is evenly spread. If both kernelcore
	 * and movablecore are specified, then the value of kernelcore
	 * will be used for required_kernelcore if it's greater than
	 * what movablecore would have allowed.
	 */
	if (required_movablecore) {
		unsigned long corepages;

		/*
		 * Round-up so that ZONE_MOVABLE is at least as large as what
		 * was requested by the user
		 */
		required_movablecore =
			roundup(required_movablecore, MAX_ORDER_NR_PAGES);
		required_movablecore = min(totalpages, required_movablecore);
		corepages = totalpages - required_movablecore;

		required_kernelcore = max(required_kernelcore, corepages);
	}

	/*
	 * If kernelcore was not specified or kernelcore size is larger
	 * than totalpages, there is no ZONE_MOVABLE.
	 */
	if (!required_kernelcore || required_kernelcore >= totalpages)
		goto out;

	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];

restart:
	/* Spread kernelcore memory as evenly as possible throughout nodes */
	kernelcore_node = required_kernelcore / usable_nodes;
	for_each_node_state(nid, N_MEMORY) {
		unsigned long start_pfn, end_pfn;

		/*
		 * Recalculate kernelcore_node if the division per node
		 * now exceeds what is necessary to satisfy the requested
		 * amount of memory for the kernel
		 */
		if (required_kernelcore < kernelcore_node)
			kernelcore_node = required_kernelcore / usable_nodes;

		/*
		 * As the map is walked, we track how much memory is usable
		 * by the kernel using kernelcore_remaining. When it is
		 * 0, the rest of the node is usable by ZONE_MOVABLE
		 */
		kernelcore_remaining = kernelcore_node;

		/* Go through each range of PFNs within this node */
		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			unsigned long size_pages;

			/* Skip ranges already handed to ZONE_MOVABLE on a prior pass */
			start_pfn = max(start_pfn, zone_movable_pfn[nid]);
			if (start_pfn >= end_pfn)
				continue;

			/* Account for what is only usable for kernelcore */
			if (start_pfn < usable_startpfn) {
				unsigned long kernel_pages;
				kernel_pages = min(end_pfn, usable_startpfn)
								- start_pfn;

				kernelcore_remaining -= min(kernel_pages,
							kernelcore_remaining);
				required_kernelcore -= min(kernel_pages,
							required_kernelcore);

				/* Continue if range is now fully accounted */
				if (end_pfn <= usable_startpfn) {

					/*
					 * Push zone_movable_pfn to the end so
					 * that if we have to rebalance
					 * kernelcore across nodes, we will
					 * not double account here
					 */
					zone_movable_pfn[nid] = end_pfn;
					continue;
				}
				start_pfn = usable_startpfn;
			}

			/*
			 * The usable PFN range for ZONE_MOVABLE is from
			 * start_pfn->end_pfn. Calculate size_pages as the
			 * number of pages used as kernelcore
			 */
			size_pages = end_pfn - start_pfn;
			if (size_pages > kernelcore_remaining)
				size_pages = kernelcore_remaining;
			zone_movable_pfn[nid] = start_pfn + size_pages;

			/*
			 * Some kernelcore has been met, update counts and
			 * break if the kernelcore for this node has been
			 * satisfied
			 */
			required_kernelcore -= min(required_kernelcore,
								size_pages);
			kernelcore_remaining -= size_pages;
			if (!kernelcore_remaining)
				break;
		}
	}

	/*
	 * If there is still required_kernelcore, we do another pass with one
	 * less node in the count. This will push zone_movable_pfn[nid] further
	 * along on the nodes that still have memory until kernelcore is
	 * satisfied
	 */
	usable_nodes--;
	if (usable_nodes && required_kernelcore > usable_nodes)
		goto restart;

out2:
	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
	for (nid = 0; nid < MAX_NUMNODES; nid++) {
		unsigned long start_pfn, end_pfn;

		zone_movable_pfn[nid] =
			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		if (zone_movable_pfn[nid] >= end_pfn)
			zone_movable_pfn[nid] = 0;
	}

out:
	/* restore the node_state */
	node_states[N_MEMORY] = saved_node_state;
}

/* Any regular or high memory on that node ?
 */
static void check_for_memory(pg_data_t *pgdat, int nid)
{
	enum zone_type zone_type;

	/* Walk zones below ZONE_MOVABLE; first populated one decides the node states */
	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
		struct zone *zone = &pgdat->node_zones[zone_type];
		if (populated_zone(zone)) {
			if (IS_ENABLED(CONFIG_HIGHMEM))
				node_set_state(nid, N_HIGH_MEMORY);
			if (zone_type <= ZONE_NORMAL)
				node_set_state(nid, N_NORMAL_MEMORY);
			break;
		}
	}
}

/*
 * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
 * such cases we allow max_zone_pfn sorted in the descending order
 */
bool __weak arch_has_descending_max_zone_pfns(void)
{
	return false;
}

/**
 * free_area_init - Initialise all pg_data_t and zone data
 * @max_zone_pfn: an array of max PFNs for each zone
 *
 * This will call free_area_init_node() for each active node in the system.
 * Using the page ranges provided by memblock_set_node(), the size of each
 * zone in each node and their holes is calculated. If the maximum PFN
 * between two adjacent zones match, it is assumed that the zone is empty.
 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
 * starts where the previous one ended. For example, ZONE_DMA32 starts
 * at arch_max_dma_pfn.
 */
void __init free_area_init(unsigned long *max_zone_pfn)
{
	unsigned long start_pfn, end_pfn;
	int i, nid, zone;
	bool descending;

	/* Record where the zone boundaries are */
	memset(arch_zone_lowest_possible_pfn, 0,
				sizeof(arch_zone_lowest_possible_pfn));
	memset(arch_zone_highest_possible_pfn, 0,
				sizeof(arch_zone_highest_possible_pfn));

	start_pfn = find_min_pfn_with_active_regions();
	descending = arch_has_descending_max_zone_pfns();

	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (descending)
			zone = MAX_NR_ZONES - i - 1;
		else
			zone = i;

		/* ZONE_MOVABLE boundaries are computed separately below */
		if (zone == ZONE_MOVABLE)
			continue;

		/* max() clamps so an empty zone never goes below its predecessor's end */
		end_pfn = max(max_zone_pfn[zone], start_pfn);
		arch_zone_lowest_possible_pfn[zone] = start_pfn;
		arch_zone_highest_possible_pfn[zone] = end_pfn;

		start_pfn = end_pfn;
	}

	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
	find_zone_movable_pfns_for_nodes();

	/* Print out the zone ranges */
	pr_info("Zone ranges:\n");
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (i == ZONE_MOVABLE)
			continue;
		pr_info(" %-8s ", zone_names[i]);
		if (arch_zone_lowest_possible_pfn[i] ==
				arch_zone_highest_possible_pfn[i])
			pr_cont("empty\n");
		else
			pr_cont("[mem %#018Lx-%#018Lx]\n",
				(u64)arch_zone_lowest_possible_pfn[i]
					<< PAGE_SHIFT,
				((u64)arch_zone_highest_possible_pfn[i]
					<< PAGE_SHIFT) - 1);
	}

	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
	pr_info("Movable zone start for each node\n");
	for (i = 0; i < MAX_NUMNODES; i++) {
		if (zone_movable_pfn[i])
			pr_info(" Node %d: %#018Lx\n", i,
			       (u64)zone_movable_pfn[i] << PAGE_SHIFT);
	}

	/*
	 * Print out the early node map, and initialize the
	 * subsection-map relative to active online memory ranges to
	 * enable future "sub-section" extensions of the memory map.
	 */
	pr_info("Early memory node ranges\n");
	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
			(u64)start_pfn << PAGE_SHIFT,
			((u64)end_pfn << PAGE_SHIFT) - 1);
		subsection_map_init(start_pfn, end_pfn - start_pfn);
	}

	/* Initialise every node */
	mminit_verify_pageflags_layout();
	setup_nr_node_ids();
	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);
		free_area_init_node(nid);

		/* Any memory on that node */
		if (pgdat->node_present_pages)
			node_set_state(nid, N_MEMORY);
		check_for_memory(pgdat, nid);
	}

	memmap_init();
}

static int __init cmdline_parse_core(char *p, unsigned long *core,
				     unsigned long *percent)
{
	unsigned long long coremem;
	char *endptr;

	if (!p)
		return -EINVAL;

	/* Value may be a percentage of total memory, otherwise bytes */
	coremem = simple_strtoull(p, &endptr, 0);
	if (*endptr == '%') {
		/* Paranoid check for percent values greater than 100
*/ 77218c2ecf20Sopenharmony_ci WARN_ON(coremem > 100); 77228c2ecf20Sopenharmony_ci 77238c2ecf20Sopenharmony_ci *percent = coremem; 77248c2ecf20Sopenharmony_ci } else { 77258c2ecf20Sopenharmony_ci coremem = memparse(p, &p); 77268c2ecf20Sopenharmony_ci /* Paranoid check that UL is enough for the coremem value */ 77278c2ecf20Sopenharmony_ci WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 77288c2ecf20Sopenharmony_ci 77298c2ecf20Sopenharmony_ci *core = coremem >> PAGE_SHIFT; 77308c2ecf20Sopenharmony_ci *percent = 0UL; 77318c2ecf20Sopenharmony_ci } 77328c2ecf20Sopenharmony_ci return 0; 77338c2ecf20Sopenharmony_ci} 77348c2ecf20Sopenharmony_ci 77358c2ecf20Sopenharmony_ci/* 77368c2ecf20Sopenharmony_ci * kernelcore=size sets the amount of memory for use for allocations that 77378c2ecf20Sopenharmony_ci * cannot be reclaimed or migrated. 77388c2ecf20Sopenharmony_ci */ 77398c2ecf20Sopenharmony_cistatic int __init cmdline_parse_kernelcore(char *p) 77408c2ecf20Sopenharmony_ci{ 77418c2ecf20Sopenharmony_ci /* parse kernelcore=mirror */ 77428c2ecf20Sopenharmony_ci if (parse_option_str(p, "mirror")) { 77438c2ecf20Sopenharmony_ci mirrored_kernelcore = true; 77448c2ecf20Sopenharmony_ci return 0; 77458c2ecf20Sopenharmony_ci } 77468c2ecf20Sopenharmony_ci 77478c2ecf20Sopenharmony_ci return cmdline_parse_core(p, &required_kernelcore, 77488c2ecf20Sopenharmony_ci &required_kernelcore_percent); 77498c2ecf20Sopenharmony_ci} 77508c2ecf20Sopenharmony_ci 77518c2ecf20Sopenharmony_ci/* 77528c2ecf20Sopenharmony_ci * movablecore=size sets the amount of memory for use for allocations that 77538c2ecf20Sopenharmony_ci * can be reclaimed or migrated. 
 */
static int __init cmdline_parse_movablecore(char *p)
{
	return cmdline_parse_core(p, &required_movablecore,
				  &required_movablecore_percent);
}

early_param("kernelcore", cmdline_parse_kernelcore);
early_param("movablecore", cmdline_parse_movablecore);

/*
 * Adjust the owning zone's managed-page count and the global totalram
 * accounting (plus totalhigh for highmem pages) by @count pages.
 */
void adjust_managed_page_count(struct page *page, long count)
{
	atomic_long_add(count, &page_zone(page)->managed_pages);
	totalram_pages_add(count);
#ifdef CONFIG_HIGHMEM
	if (PageHighMem(page))
		totalhigh_pages_add(count);
#endif
}
EXPORT_SYMBOL(adjust_managed_page_count);

/*
 * Release a reserved virtual address range [start, end) back to the buddy
 * allocator, page by page.  Each page is optionally filled with @poison
 * first (any value outside 0x00-0xFF skips the memset).  If @s is
 * non-NULL a "Freeing <s> memory" summary is logged.  Returns the number
 * of pages freed.
 */
unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
{
	void *pos;
	unsigned long pages = 0;

	/* Only whole pages fully inside the range can be freed */
	start = (void *)PAGE_ALIGN((unsigned long)start);
	end = (void *)((unsigned long)end & PAGE_MASK);
	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
		struct page *page = virt_to_page(pos);
		void *direct_map_addr;

		/*
		 * 'direct_map_addr' might be different from 'pos'
		 * because some architectures' virt_to_page()
		 * work with aliases.  Getting the direct map
		 * address ensures that we get a _writeable_
		 * alias for the memset().
		 */
		direct_map_addr = page_address(page);
		if ((unsigned int)poison <= 0xFF)
			memset(direct_map_addr, poison, PAGE_SIZE);

		free_reserved_page(page);
	}

	if (pages && s)
		pr_info("Freeing %s memory: %ldK\n",
			s, pages << (PAGE_SHIFT - 10));

	return pages;
}

#ifdef CONFIG_HIGHMEM
/* Hand one reserved highmem page to the buddy allocator and account it. */
void free_highmem_page(struct page *page)
{
	__free_reserved_page(page);
	totalram_pages_inc();
	atomic_long_inc(&page_zone(page)->managed_pages);
	totalhigh_pages_inc();
}
#endif


/*
 * Print the boot-time "Memory: ..." summary line: free vs. physical
 * pages plus the sizes of the kernel's text/rwdata/rodata/init/bss
 * sections, adjusted for sections embedded in one another.  @str, if
 * non-NULL, is appended to the message.
 */
void __init mem_init_print_info(const char *str)
{
	unsigned long physpages, codesize, datasize, rosize, bss_size;
	unsigned long init_code_size, init_data_size;

	physpages = get_num_physpages();
	codesize = _etext - _stext;
	datasize = _edata - _sdata;
	rosize = __end_rodata - __start_rodata;
	bss_size = __bss_stop - __bss_start;
	init_data_size = __init_end - __init_begin;
	init_code_size = _einittext - _sinittext;

	/*
	 * Detect special cases and adjust section sizes accordingly:
	 * 1) .init.* may be embedded into .data sections
	 * 2) .init.text.* may be out of [__init_begin, __init_end],
	 *    please refer to arch/tile/kernel/vmlinux.lds.S.
	 * 3) .rodata.* may be embedded into .text or .data sections.
	 */
#define adj_init_size(start, end, size, pos, adj) \
	do { \
		if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
			size -= adj; \
	} while (0)

	adj_init_size(__init_begin, __init_end, init_data_size,
		     _sinittext, init_code_size);
	adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
	adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
	adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
	adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);

#undef	adj_init_size

	pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
#ifdef	CONFIG_HIGHMEM
		", %luK highmem"
#endif
		"%s%s)\n",
		nr_free_pages() << (PAGE_SHIFT - 10),
		physpages << (PAGE_SHIFT - 10),
		codesize >> 10, datasize >> 10, rosize >> 10,
		(init_data_size + init_code_size) >> 10, bss_size >> 10,
		(physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
		totalcma_pages << (PAGE_SHIFT - 10),
#ifdef	CONFIG_HIGHMEM
		totalhigh_pages() << (PAGE_SHIFT - 10),
#endif
		str ? ", " : "", str ? str : "");
}

/**
 * set_dma_reserve - set the specified number of pages reserved in the first zone
 * @new_dma_reserve: The number of pages to mark reserved
 *
 * The per-cpu batchsize and zone watermarks are determined by managed_pages.
 * In the DMA zone, a significant percentage may be consumed by kernel image
 * and other unfreeable allocations which can skew the watermarks badly. This
 * function may optionally be used to account for unfreeable pages in the
 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
 * smaller per-cpu batchsize.
 */
void __init set_dma_reserve(unsigned long new_dma_reserve)
{
	dma_reserve = new_dma_reserve;
}

/*
 * CPU-hotplug "dead" callback: drain the departed CPU's LRU pagevecs and
 * per-cpu page lists, then fold its VM event and stat counters into the
 * running totals so system-wide statistics stay consistent.
 */
static int page_alloc_cpu_dead(unsigned int cpu)
{

	lru_add_drain_cpu(cpu);
	drain_pages(cpu);

	/*
	 * Spill the event counters of the dead processor
	 * into the current processors event counters.
	 * This artificially elevates the count of the current
	 * processor.
	 */
	vm_events_fold_cpu(cpu);

	/*
	 * Zero the differential counters of the dead processor
	 * so that the vm statistics are consistent.
	 *
	 * This is only okay since the processor is dead and cannot
	 * race with what we are doing.
	 */
	cpu_vm_stats_fold(cpu);
	return 0;
}

#ifdef CONFIG_NUMA
int hashdist = HASHDIST_DEFAULT;

/* Parse the "hashdist=" boot option (numeric; overrides the default). */
static int __init set_hashdist(char *str)
{
	if (!str)
		return 0;
	hashdist = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("hashdist=", set_hashdist);
#endif

/*
 * Boot-time setup for the page allocator: disable hash distribution when
 * only one node has memory, and register the CPU-hotplug "dead" callback
 * defined above.
 */
void __init page_alloc_init(void)
{
	int ret;

#ifdef CONFIG_NUMA
	if (num_node_state(N_MEMORY) == 1)
		hashdist = 0;
#endif

	ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
					"mm/page_alloc:dead", NULL,
					page_alloc_cpu_dead);
	WARN_ON(ret < 0);
}

/*
 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
 *	or min_free_kbytes changes.
 *
 * Recomputes each node's totalreserve_pages and the global
 * totalreserve_pages: for every zone, the largest lowmem_reserve entry
 * plus the high watermark, capped at the zone's managed pages.
 */
static void calculate_totalreserve_pages(void)
{
	struct pglist_data *pgdat;
	unsigned long reserve_pages = 0;
	enum zone_type i, j;

	for_each_online_pgdat(pgdat) {

		pgdat->totalreserve_pages = 0;

		for (i = 0; i < MAX_NR_ZONES; i++) {
			struct zone *zone = pgdat->node_zones + i;
			long max = 0;
			unsigned long managed_pages = zone_managed_pages(zone);

			/* Find valid and maximum lowmem_reserve in the zone */
			for (j = i; j < MAX_NR_ZONES; j++) {
				if (zone->lowmem_reserve[j] > max)
					max = zone->lowmem_reserve[j];
			}

			/* we treat the high watermark as reserved pages. */
			max += high_wmark_pages(zone);

			/* Never reserve more than the zone actually manages */
			if (max > managed_pages)
				max = managed_pages;

			pgdat->totalreserve_pages += max;

			reserve_pages += max;
		}
	}
	totalreserve_pages = reserve_pages;
}

/*
 * setup_per_zone_lowmem_reserve - called whenever
 *	sysctl_lowmem_reserve_ratio changes.
 *	Ensures that each zone
 *	has a correct pages reserved value, so an adequate number of
 *	pages are left in the zone after a successful __alloc_pages().
 */
static void setup_per_zone_lowmem_reserve(void)
{
	struct pglist_data *pgdat;
	enum zone_type i, j;

	for_each_online_pgdat(pgdat) {
		for (i = 0; i < MAX_NR_ZONES - 1; i++) {
			struct zone *zone = &pgdat->node_zones[i];
			int ratio = sysctl_lowmem_reserve_ratio[i];
			bool clear = !ratio || !zone_managed_pages(zone);
			unsigned long managed_pages = 0;

			/*
			 * Each lower zone reserves, against allocations that
			 * could have used a higher zone, a slice of the
			 * cumulative managed pages of all zones above it,
			 * scaled by 1/ratio.  A zero ratio or an empty zone
			 * clears the reservation.
			 */
			for (j = i + 1; j < MAX_NR_ZONES; j++) {
				struct zone *upper_zone = &pgdat->node_zones[j];

				managed_pages += zone_managed_pages(upper_zone);

				if (clear)
					zone->lowmem_reserve[j] = 0;
				else
					zone->lowmem_reserve[j] = managed_pages / ratio;
			}
		}
	}

	/* update totalreserve_pages */
	calculate_totalreserve_pages();
}

/*
 * Recompute WMARK_MIN/LOW/HIGH for every zone from min_free_kbytes and
 * watermark_scale_factor.  min_free_kbytes is distributed across lowmem
 * zones in proportion to their managed pages; highmem zones get only a
 * small capped minimum.  Callers serialize via setup_per_zone_wmarks();
 * the zone lock is taken per zone here.
 */
static void __setup_per_zone_wmarks(void)
{
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long lowmem_pages = 0;
	struct zone *zone;
	unsigned long flags;

	/* Calculate total number of !ZONE_HIGHMEM pages */
	for_each_zone(zone) {
		if (!is_highmem(zone))
			lowmem_pages += zone_managed_pages(zone);
	}

	for_each_zone(zone) {
		u64 tmp;

		spin_lock_irqsave(&zone->lock, flags);
		/* This zone's proportional share of pages_min */
		tmp = (u64)pages_min * zone_managed_pages(zone);
		do_div(tmp, lowmem_pages);
		if (is_highmem(zone)) {
			/*
			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
			 * need highmem pages, so cap pages_min to a small
			 * value here.
			 *
			 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
			 * deltas control async page reclaim, and so should
			 * not be capped for highmem.
			 */
			unsigned long min_pages;

			min_pages = zone_managed_pages(zone) / 1024;
			min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
			zone->_watermark[WMARK_MIN] = min_pages;
		} else {
			/*
			 * If it's a lowmem zone, reserve a number of pages
			 * proportionate to the zone's size.
			 */
			zone->_watermark[WMARK_MIN] = tmp;
		}

		/*
		 * Set the kswapd watermarks distance according to the
		 * scale factor in proportion to available memory, but
		 * ensure a minimum size on small systems.
		 */
		tmp = max_t(u64, tmp >> 2,
			    mult_frac(zone_managed_pages(zone),
				      watermark_scale_factor, 10000));

		zone->watermark_boost = 0;
		zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
		zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;

		spin_unlock_irqrestore(&zone->lock, flags);
	}

	/* update totalreserve_pages */
	calculate_totalreserve_pages();
}

/**
 * setup_per_zone_wmarks - called when min_free_kbytes changes
 * or when memory is hot-{added|removed}
 *
 * Ensures that the watermark[min,low,high] values for each zone are set
 * correctly with respect to min_free_kbytes.
 */
void setup_per_zone_wmarks(void)
{
	static DEFINE_SPINLOCK(lock);

	/* Serialize concurrent watermark updates */
	spin_lock(&lock);
	__setup_per_zone_wmarks();
	spin_unlock(&lock);
}

/*
 * Initialise min_free_kbytes.
 *
 * For small machines we want it small (128k min). For large machines
 * we want it large (256MB max). But it is not linear, because network
 * bandwidth does not increase linearly with machine size. We use
 *
 *	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
 *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
 *
 * which yields
 *
 * 16MB:	512k
 * 32MB:	724k
 * 64MB:	1024k
 * 128MB:	1448k
 * 256MB:	2048k
 * 512MB:	2896k
 * 1024MB:	4096k
 * 2048MB:	5792k
 * 4096MB:	8192k
 * 8192MB:	11584k
 * 16384MB:	16384k
 */
int __meminit init_per_zone_wmark_min(void)
{
	unsigned long lowmem_kbytes;
	int new_min_free_kbytes;

	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
	new_min_free_kbytes
		= int_sqrt(lowmem_kbytes * 16);

	if (new_min_free_kbytes > user_min_free_kbytes) {
		min_free_kbytes = new_min_free_kbytes;
		/* Clamp the computed value to the [128, 262144] kB range */
		if (min_free_kbytes < 128)
			min_free_kbytes = 128;
		if (min_free_kbytes > 262144)
			min_free_kbytes = 262144;
	} else {
		/* A user-supplied value that is at least as large wins */
		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
				new_min_free_kbytes, user_min_free_kbytes);
	}
	/* Propagate the new minimum into watermarks and reserves */
	setup_per_zone_wmarks();
	refresh_zone_stat_thresholds();
	setup_per_zone_lowmem_reserve();

#ifdef CONFIG_NUMA
	setup_min_unmapped_ratio();
	setup_min_slab_ratio();
#endif

	khugepaged_min_free_kbytes_update();

	return 0;
}
postcore_initcall(init_per_zone_wmark_min)

/*
 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
 *	that we can call two helper functions whenever min_free_kbytes
 *	changes.
 */
int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int rc;

	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (rc)
		return rc;

	if (write) {
		/* Remember the user's choice and recompute the watermarks */
		user_min_free_kbytes = min_free_kbytes;
		setup_per_zone_wmarks();
	}
	return 0;
}

/*
 * Sysctl handler for watermark_scale_factor: on a successful write,
 * recompute all zone watermarks with the new scale factor.
 */
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int rc;

	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (rc)
		return rc;

	if (write)
		setup_per_zone_wmarks();

	return 0;
}

#ifdef CONFIG_NUMA
/*
 * Recompute each node's min_unmapped_pages as
 * sysctl_min_unmapped_ratio percent of the node's managed pages,
 * accumulated zone by zone.
 */
static void setup_min_unmapped_ratio(void)
{
	pg_data_t *pgdat;
	struct zone *zone;

	for_each_online_pgdat(pgdat)
		pgdat->min_unmapped_pages = 0;

	for_each_zone(zone)
		zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
				sysctl_min_unmapped_ratio) / 100;
}


/* Sysctl handler: re-derive min_unmapped_pages after a ratio change. */
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int rc;

	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (rc)
		return rc;

	setup_min_unmapped_ratio();

	return 0;
}

/*
 * Recompute each node's min_slab_pages as sysctl_min_slab_ratio
 * percent of the node's managed pages, accumulated zone by zone.
 */
static void setup_min_slab_ratio(void)
{
	pg_data_t *pgdat;
	struct zone *zone;

	for_each_online_pgdat(pgdat)
		pgdat->min_slab_pages = 0;

	for_each_zone(zone)
		zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
				sysctl_min_slab_ratio) / 100;
}

/* Sysctl handler: re-derive min_slab_pages after a ratio change. */
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int rc;

	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (rc)
		return rc;

	setup_min_slab_ratio();

	return 0;
}
#endif

/*
 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
 *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
 *	whenever sysctl_lowmem_reserve_ratio changes.
 *
 * The reserve ratio obviously has absolutely no relation with the
 * minimum watermarks. The lowmem reserve ratio can only make sense
 * if in function of the boot time zone sizes.
 */
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int i;

	proc_dointvec_minmax(table, write, buffer, length, ppos);

	/* A ratio below 1 means "no reserve"; normalize it to 0 */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (sysctl_lowmem_reserve_ratio[i] < 1)
			sysctl_lowmem_reserve_ratio[i] = 0;
	}

	setup_per_zone_lowmem_reserve();
	return 0;
}

/*
 * Refresh the pcp high/batch values of @zone's per-cpu pagesets on
 * every possible CPU.
 */
static void __zone_pcp_update(struct zone *zone)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		pageset_set_high_and_batch(zone,
				per_cpu_ptr(zone->pageset, cpu));
}

/*
 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
 * cpu.  It is the fraction of total pages in each zone that a hot per cpu
 * pagelist can have before it gets flushed back to buddy allocator.
 */
int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	struct zone *zone;
	int old_percpu_pagelist_fraction;
	int ret;

	/* Serialize against other pcp high/batch updates */
	mutex_lock(&pcp_batch_high_lock);
	old_percpu_pagelist_fraction = percpu_pagelist_fraction;

	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (!write || ret < 0)
		goto out;

	/* Sanity checking to avoid pcp imbalance */
	if (percpu_pagelist_fraction &&
	    percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
		/* Reject the write and restore the previous value */
		percpu_pagelist_fraction = old_percpu_pagelist_fraction;
		ret = -EINVAL;
		goto out;
	}

	/* No change? */
	if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
		goto out;

	for_each_populated_zone(zone)
		__zone_pcp_update(zone);
out:
	mutex_unlock(&pcp_batch_high_lock);
	return ret;
}

#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
/*
 * Returns the number of pages that arch has reserved but
 * is not known to alloc_large_system_hash().
 */
static unsigned long __init arch_reserved_kernel_pages(void)
{
	return 0;
}
#endif

/*
 * Adaptive scale is meant to reduce sizes of hash tables on large memory
 * machines. As memory size is increased the scale is also increased but at
 * slower pace.  Starting from ADAPT_SCALE_BASE (64G), every time memory
 * quadruples the scale is increased by one, which means the size of hash table
 * only doubles, instead of quadrupling as well.
 * Because 32-bit systems cannot have large physical memory, where this scaling
 * makes sense, it is disabled on such platforms.
 */
#if __BITS_PER_LONG > 32
#define ADAPT_SCALE_BASE	(64ul << 30)
#define ADAPT_SCALE_SHIFT	2
#define ADAPT_SCALE_NPAGES	(ADAPT_SCALE_BASE >> PAGE_SHIFT)
#endif

/*
 * allocate a large system hash table from bootmem
 * - it is assumed that the hash table must contain an exact power-of-2
 *   quantity of entries
 * - limit is the number of hash buckets, not the total allocation size
 *
 * On success returns the table and, if non-NULL, fills *_hash_shift with
 * log2 of the entry count and *_hash_mask with (entries - 1).  Panics on
 * allocation failure.
 */
void *__init alloc_large_system_hash(const char *tablename,
				     unsigned long bucketsize,
				     unsigned long numentries,
				     int scale,
				     int flags,
				     unsigned int *_hash_shift,
				     unsigned int *_hash_mask,
				     unsigned long low_limit,
				     unsigned long high_limit)
{
	unsigned long long max = high_limit;
	unsigned long log2qty, size;
	void *table = NULL;
	gfp_t gfp_flags;
	bool virt;

	/* allow the kernel cmdline to have a say */
	if (!numentries) {
		/* round applicable memory size up to nearest megabyte */
		numentries = nr_kernel_pages;
		numentries -= arch_reserved_kernel_pages();

		/* It isn't necessary when PAGE_SIZE >= 1MB */
		if (PAGE_SHIFT < 20)
			numentries = round_up(numentries, (1<<20)/PAGE_SIZE);

#if __BITS_PER_LONG > 32
		/*
		 * Adaptive scaling (see comment above the ADAPT_SCALE_*
		 * macros): bump scale once per quadrupling of memory past
		 * 64G so huge machines don't get absurdly large tables.
		 * Only applied when the caller gave no explicit high limit.
		 */
		if (!high_limit) {
			unsigned long adapt;

			for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
			     adapt <<= ADAPT_SCALE_SHIFT)
				scale++;
		}
#endif

		/* limit to 1 bucket per 2^scale bytes of low memory */
		if (scale > PAGE_SHIFT)
			numentries >>= (scale - PAGE_SHIFT);
		else
			numentries <<= (PAGE_SHIFT - scale);

		/* Make sure we've got at least a 0-order allocation.. */
		if (unlikely(flags & HASH_SMALL)) {
			/* Makes no sense without HASH_EARLY */
			WARN_ON(!(flags & HASH_EARLY));
			if (!(numentries >> *_hash_shift)) {
				numentries = 1UL << *_hash_shift;
				BUG_ON(!numentries);
			}
		} else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
			numentries = PAGE_SIZE / bucketsize;
	}
	numentries = roundup_pow_of_two(numentries);

	/* limit allocation size to 1/16 total memory by default */
	if (max == 0) {
		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
		do_div(max, bucketsize);
	}
	max = min(max, 0x80000000ULL);

	/* Clamp the entry count into the caller-supplied [low, max] window. */
	if (numentries < low_limit)
		numentries = low_limit;
	if (numentries > max)
		numentries = max;

	log2qty = ilog2(numentries);

	gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
	/*
	 * Try to allocate the table; on failure, halve the entry count
	 * (--log2qty) and retry until we are down to a single page.
	 */
	do {
		virt = false;
		size = bucketsize << log2qty;
		if (flags & HASH_EARLY) {
			/* Boot-time path: buddy allocator not up yet. */
			if (flags & HASH_ZERO)
				table = memblock_alloc(size, SMP_CACHE_BYTES);
			else
				table = memblock_alloc_raw(size,
							   SMP_CACHE_BYTES);
		} else if (get_order(size) >= MAX_ORDER || hashdist) {
			/* Too big for the buddy allocator (or hashdist=1). */
			table = __vmalloc(size, gfp_flags);
			virt = true;
		} else {
			/*
			 * If bucketsize is not a power-of-two, we may free
			 * some pages at the end of hash table which
			 * alloc_pages_exact() automatically does
			 */
			table = alloc_pages_exact(size, gfp_flags);
			kmemleak_alloc(table, size, 1, gfp_flags);
		}
	} while (!table && size > PAGE_SIZE && --log2qty);

	if (!table)
		panic("Failed to allocate %s hash table\n", tablename);

	pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
		tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
		virt ? "vmalloc" : "linear");

	if (_hash_shift)
		*_hash_shift = log2qty;
	if (_hash_mask)
		*_hash_mask = (1 << log2qty) - 1;

	return table;
}

/*
 * This function checks whether pageblock includes unmovable pages or not.
 *
 * PageLRU check without isolation or lru_lock could race so that
 * MIGRATE_MOVABLE block might include unmovable pages.  And __PageMovable
 * check without lock_page also may miss some movable non-lru pages at
 * race condition.  So you can't expect this function should be exact.
 *
 * Returns a page without holding a reference.  If the caller wants to
 * dereference that page (e.g., dumping), it has to make sure that it
 * cannot get removed (e.g., via memory unplug) concurrently.
 *
 */
struct page *has_unmovable_pages(struct zone *zone, struct page *page,
				 int migratetype, int flags)
{
	unsigned long iter = 0;
	unsigned long pfn = page_to_pfn(page);
	/* Offset of @page within its pageblock; we scan to the block end. */
	unsigned long offset = pfn % pageblock_nr_pages;

	if (is_migrate_cma_page(page)) {
		/*
		 * CMA allocations (alloc_contig_range) really need to mark
		 * isolate CMA pageblocks even when they are not movable in fact
		 * so consider them movable here.
		 */
		if (is_migrate_cma(migratetype))
			return NULL;

		return page;
	}

	for (; iter < pageblock_nr_pages - offset; iter++) {
		if (!pfn_valid_within(pfn + iter))
			continue;

		page = pfn_to_page(pfn + iter);

		/*
		 * Both, bootmem allocations and memory holes are marked
		 * PG_reserved and are unmovable. We can even have unmovable
		 * allocations inside ZONE_MOVABLE, for example when
		 * specifying "movablecore".
		 */
		if (PageReserved(page))
			return page;

		/*
		 * If the zone is movable and we have ruled out all reserved
		 * pages then it should be reasonably safe to assume the rest
		 * is movable.
		 */
		if (zone_idx(zone) == ZONE_MOVABLE)
			continue;

		/*
		 * Hugepages are not in LRU lists, but they're movable.
		 * THPs are on the LRU, but need to be counted as #small pages.
		 * We need not scan over tail pages because we don't
		 * handle each tail page individually in migration.
		 */
		if (PageHuge(page) || PageTransCompound(page)) {
			struct page *head = compound_head(page);
			unsigned int skip_pages;

			if (PageHuge(page)) {
				if (!hugepage_migration_supported(page_hstate(head)))
					return page;
			} else if (!PageLRU(head) && !__PageMovable(head)) {
				return page;
			}

			/* Jump over the rest of the compound page in one step. */
			skip_pages = compound_nr(head) - (page - head);
			iter += skip_pages - 1;
			continue;
		}

		/*
		 * We can't use page_count without pin a page
		 * because another CPU can free compound page.
		 * This check already skips compound tails of THP
		 * because their page->_refcount is zero at all time.
		 */
		if (!page_ref_count(page)) {
			if (PageBuddy(page))
				iter += (1 << buddy_order(page)) - 1;
			continue;
		}

		/*
		 * The HWPoisoned page may be not in buddy system, and
		 * page_count() is not 0.
		 */
		if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
			continue;

		/*
		 * We treat all PageOffline() pages as movable when offlining
		 * to give drivers a chance to decrement their reference count
		 * in MEM_GOING_OFFLINE in order to indicate that these pages
		 * can be offlined as there are no direct references anymore.
		 * For actually unmovable PageOffline() where the driver does
		 * not support this, we will fail later when trying to actually
		 * move these pages that still have a reference count > 0.
		 * (false negatives in this function only)
		 */
		if ((flags & MEMORY_OFFLINE) && PageOffline(page))
			continue;

		if (__PageMovable(page) || PageLRU(page))
			continue;

		/*
		 * If there are RECLAIMABLE pages, we need to check
		 * them.  But for now, memory offline itself doesn't call
		 * shrink_node_slabs() and this still needs to be fixed.
		 */
		return page;
	}
	return NULL;
}

#ifdef CONFIG_CONTIG_ALLOC
/* Round @pfn down to the larger of MAX_ORDER_NR_PAGES / pageblock alignment. */
static unsigned long pfn_max_align_down(unsigned long pfn)
{
	return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
			     pageblock_nr_pages) - 1);
}

/* Round @pfn up to the larger of MAX_ORDER_NR_PAGES / pageblock alignment. */
static unsigned long pfn_max_align_up(unsigned long pfn)
{
	return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
				pageblock_nr_pages));
}

/* [start, end) must belong to a single zone. */
static int __alloc_contig_migrate_range(struct compact_control *cc,
					unsigned long start, unsigned long end)
{
	/* This function is based on compact_zone() from compaction.c. */
	unsigned int nr_reclaimed;
	unsigned long pfn = start;
	unsigned int tries = 0;
	int ret = 0;
	struct migration_target_control mtc = {
		.nid = zone_to_nid(cc->zone),
		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
	};

	migrate_prep();

	/* Loop until the whole range is isolated and the list drained. */
	while (pfn < end || !list_empty(&cc->migratepages)) {
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		if (list_empty(&cc->migratepages)) {
			cc->nr_migratepages = 0;
			pfn = isolate_migratepages_range(cc, pfn, end);
			if (!pfn) {
				ret = -EINTR;
				break;
			}
			tries = 0;
		} else if (++tries == 5) {
			/* Give up after 5 failed passes over the same list. */
			ret = ret < 0 ? ret : -EBUSY;
			break;
		}

		/* Clean pagecache pages can be reclaimed instead of migrated. */
		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
							&cc->migratepages);
		cc->nr_migratepages -= nr_reclaimed;

		ret = migrate_pages(&cc->migratepages, alloc_migration_target,
				NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
	}
	if (ret < 0) {
		putback_movable_pages(&cc->migratepages);
		return ret;
	}
	return 0;
}

/**
 * alloc_contig_range() -- tries to allocate given range of pages
 * @start:	start PFN to allocate
 * @end:	one-past-the-last PFN to allocate
 * @migratetype:	migratetype of the underlaying pageblocks (either
 *			#MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
 *			in range must have the same migratetype and it must
 *			be either of the two.
 * @gfp_mask:	GFP mask to use during compaction
 *
 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
 * aligned.  The PFN range must belong to a single zone.
 *
 * The first thing this routine does is attempt to MIGRATE_ISOLATE all
 * pageblocks in the range.  Once isolated, the pageblocks should not
 * be modified by others.
86538c2ecf20Sopenharmony_ci * 86548c2ecf20Sopenharmony_ci * Return: zero on success or negative error code. On success all 86558c2ecf20Sopenharmony_ci * pages which PFN is in [start, end) are allocated for the caller and 86568c2ecf20Sopenharmony_ci * need to be freed with free_contig_range(). 86578c2ecf20Sopenharmony_ci */ 86588c2ecf20Sopenharmony_ciint alloc_contig_range(unsigned long start, unsigned long end, 86598c2ecf20Sopenharmony_ci unsigned migratetype, gfp_t gfp_mask) 86608c2ecf20Sopenharmony_ci{ 86618c2ecf20Sopenharmony_ci unsigned long outer_start, outer_end; 86628c2ecf20Sopenharmony_ci unsigned int order; 86638c2ecf20Sopenharmony_ci int ret = 0; 86648c2ecf20Sopenharmony_ci 86658c2ecf20Sopenharmony_ci struct compact_control cc = { 86668c2ecf20Sopenharmony_ci .nr_migratepages = 0, 86678c2ecf20Sopenharmony_ci .order = -1, 86688c2ecf20Sopenharmony_ci .zone = page_zone(pfn_to_page(start)), 86698c2ecf20Sopenharmony_ci .mode = MIGRATE_SYNC, 86708c2ecf20Sopenharmony_ci .ignore_skip_hint = true, 86718c2ecf20Sopenharmony_ci .no_set_skip_hint = true, 86728c2ecf20Sopenharmony_ci .gfp_mask = current_gfp_context(gfp_mask), 86738c2ecf20Sopenharmony_ci .alloc_contig = true, 86748c2ecf20Sopenharmony_ci }; 86758c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&cc.migratepages); 86768c2ecf20Sopenharmony_ci 86778c2ecf20Sopenharmony_ci /* 86788c2ecf20Sopenharmony_ci * What we do here is we mark all pageblocks in range as 86798c2ecf20Sopenharmony_ci * MIGRATE_ISOLATE. Because pageblock and max order pages may 86808c2ecf20Sopenharmony_ci * have different sizes, and due to the way page allocator 86818c2ecf20Sopenharmony_ci * work, we align the range to biggest of the two pages so 86828c2ecf20Sopenharmony_ci * that page allocator won't try to merge buddies from 86838c2ecf20Sopenharmony_ci * different pageblocks and change MIGRATE_ISOLATE to some 86848c2ecf20Sopenharmony_ci * other migration type. 
86858c2ecf20Sopenharmony_ci * 86868c2ecf20Sopenharmony_ci * Once the pageblocks are marked as MIGRATE_ISOLATE, we 86878c2ecf20Sopenharmony_ci * migrate the pages from an unaligned range (ie. pages that 86888c2ecf20Sopenharmony_ci * we are interested in). This will put all the pages in 86898c2ecf20Sopenharmony_ci * range back to page allocator as MIGRATE_ISOLATE. 86908c2ecf20Sopenharmony_ci * 86918c2ecf20Sopenharmony_ci * When this is done, we take the pages in range from page 86928c2ecf20Sopenharmony_ci * allocator removing them from the buddy system. This way 86938c2ecf20Sopenharmony_ci * page allocator will never consider using them. 86948c2ecf20Sopenharmony_ci * 86958c2ecf20Sopenharmony_ci * This lets us mark the pageblocks back as 86968c2ecf20Sopenharmony_ci * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 86978c2ecf20Sopenharmony_ci * aligned range but not in the unaligned, original range are 86988c2ecf20Sopenharmony_ci * put back to page allocator so that buddy can use them. 86998c2ecf20Sopenharmony_ci */ 87008c2ecf20Sopenharmony_ci 87018c2ecf20Sopenharmony_ci ret = start_isolate_page_range(pfn_max_align_down(start), 87028c2ecf20Sopenharmony_ci pfn_max_align_up(end), migratetype, 0); 87038c2ecf20Sopenharmony_ci if (ret) 87048c2ecf20Sopenharmony_ci return ret; 87058c2ecf20Sopenharmony_ci 87068c2ecf20Sopenharmony_ci /* 87078c2ecf20Sopenharmony_ci * In case of -EBUSY, we'd like to know which page causes problem. 87088c2ecf20Sopenharmony_ci * So, just fall through. test_pages_isolated() has a tracepoint 87098c2ecf20Sopenharmony_ci * which will report the busy page. 87108c2ecf20Sopenharmony_ci * 87118c2ecf20Sopenharmony_ci * It is possible that busy pages could become available before 87128c2ecf20Sopenharmony_ci * the call to test_pages_isolated, and the range will actually be 87138c2ecf20Sopenharmony_ci * allocated. So, if we fall through be sure to clear ret so that 87148c2ecf20Sopenharmony_ci * -EBUSY is not accidentally used or returned to caller. 
87158c2ecf20Sopenharmony_ci */ 87168c2ecf20Sopenharmony_ci ret = __alloc_contig_migrate_range(&cc, start, end); 87178c2ecf20Sopenharmony_ci if (ret && ret != -EBUSY) 87188c2ecf20Sopenharmony_ci goto done; 87198c2ecf20Sopenharmony_ci ret =0; 87208c2ecf20Sopenharmony_ci 87218c2ecf20Sopenharmony_ci /* 87228c2ecf20Sopenharmony_ci * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 87238c2ecf20Sopenharmony_ci * aligned blocks that are marked as MIGRATE_ISOLATE. What's 87248c2ecf20Sopenharmony_ci * more, all pages in [start, end) are free in page allocator. 87258c2ecf20Sopenharmony_ci * What we are going to do is to allocate all pages from 87268c2ecf20Sopenharmony_ci * [start, end) (that is remove them from page allocator). 87278c2ecf20Sopenharmony_ci * 87288c2ecf20Sopenharmony_ci * The only problem is that pages at the beginning and at the 87298c2ecf20Sopenharmony_ci * end of interesting range may be not aligned with pages that 87308c2ecf20Sopenharmony_ci * page allocator holds, ie. they can be part of higher order 87318c2ecf20Sopenharmony_ci * pages. Because of this, we reserve the bigger range and 87328c2ecf20Sopenharmony_ci * once this is done free the pages we are not interested in. 87338c2ecf20Sopenharmony_ci * 87348c2ecf20Sopenharmony_ci * We don't have to hold zone->lock here because the pages are 87358c2ecf20Sopenharmony_ci * isolated thus they won't get removed from buddy. 
87368c2ecf20Sopenharmony_ci */ 87378c2ecf20Sopenharmony_ci 87388c2ecf20Sopenharmony_ci lru_add_drain_all(); 87398c2ecf20Sopenharmony_ci 87408c2ecf20Sopenharmony_ci order = 0; 87418c2ecf20Sopenharmony_ci outer_start = start; 87428c2ecf20Sopenharmony_ci while (!PageBuddy(pfn_to_page(outer_start))) { 87438c2ecf20Sopenharmony_ci if (++order >= MAX_ORDER) { 87448c2ecf20Sopenharmony_ci outer_start = start; 87458c2ecf20Sopenharmony_ci break; 87468c2ecf20Sopenharmony_ci } 87478c2ecf20Sopenharmony_ci outer_start &= ~0UL << order; 87488c2ecf20Sopenharmony_ci } 87498c2ecf20Sopenharmony_ci 87508c2ecf20Sopenharmony_ci if (outer_start != start) { 87518c2ecf20Sopenharmony_ci order = buddy_order(pfn_to_page(outer_start)); 87528c2ecf20Sopenharmony_ci 87538c2ecf20Sopenharmony_ci /* 87548c2ecf20Sopenharmony_ci * outer_start page could be small order buddy page and 87558c2ecf20Sopenharmony_ci * it doesn't include start page. Adjust outer_start 87568c2ecf20Sopenharmony_ci * in this case to report failed page properly 87578c2ecf20Sopenharmony_ci * on tracepoint in test_pages_isolated() 87588c2ecf20Sopenharmony_ci */ 87598c2ecf20Sopenharmony_ci if (outer_start + (1UL << order) <= start) 87608c2ecf20Sopenharmony_ci outer_start = start; 87618c2ecf20Sopenharmony_ci } 87628c2ecf20Sopenharmony_ci 87638c2ecf20Sopenharmony_ci /* Make sure the range is really isolated. */ 87648c2ecf20Sopenharmony_ci if (test_pages_isolated(outer_start, end, 0)) { 87658c2ecf20Sopenharmony_ci pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", 87668c2ecf20Sopenharmony_ci __func__, outer_start, end); 87678c2ecf20Sopenharmony_ci ret = -EBUSY; 87688c2ecf20Sopenharmony_ci goto done; 87698c2ecf20Sopenharmony_ci } 87708c2ecf20Sopenharmony_ci 87718c2ecf20Sopenharmony_ci /* Grab isolated pages from freelists. 
*/ 87728c2ecf20Sopenharmony_ci outer_end = isolate_freepages_range(&cc, outer_start, end); 87738c2ecf20Sopenharmony_ci if (!outer_end) { 87748c2ecf20Sopenharmony_ci ret = -EBUSY; 87758c2ecf20Sopenharmony_ci goto done; 87768c2ecf20Sopenharmony_ci } 87778c2ecf20Sopenharmony_ci 87788c2ecf20Sopenharmony_ci /* Free head and tail (if any) */ 87798c2ecf20Sopenharmony_ci if (start != outer_start) 87808c2ecf20Sopenharmony_ci free_contig_range(outer_start, start - outer_start); 87818c2ecf20Sopenharmony_ci if (end != outer_end) 87828c2ecf20Sopenharmony_ci free_contig_range(end, outer_end - end); 87838c2ecf20Sopenharmony_ci 87848c2ecf20Sopenharmony_cidone: 87858c2ecf20Sopenharmony_ci undo_isolate_page_range(pfn_max_align_down(start), 87868c2ecf20Sopenharmony_ci pfn_max_align_up(end), migratetype); 87878c2ecf20Sopenharmony_ci return ret; 87888c2ecf20Sopenharmony_ci} 87898c2ecf20Sopenharmony_ciEXPORT_SYMBOL(alloc_contig_range); 87908c2ecf20Sopenharmony_ci 87918c2ecf20Sopenharmony_cistatic int __alloc_contig_pages(unsigned long start_pfn, 87928c2ecf20Sopenharmony_ci unsigned long nr_pages, gfp_t gfp_mask) 87938c2ecf20Sopenharmony_ci{ 87948c2ecf20Sopenharmony_ci unsigned long end_pfn = start_pfn + nr_pages; 87958c2ecf20Sopenharmony_ci 87968c2ecf20Sopenharmony_ci return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, 87978c2ecf20Sopenharmony_ci gfp_mask); 87988c2ecf20Sopenharmony_ci} 87998c2ecf20Sopenharmony_ci 88008c2ecf20Sopenharmony_cistatic bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, 88018c2ecf20Sopenharmony_ci unsigned long nr_pages) 88028c2ecf20Sopenharmony_ci{ 88038c2ecf20Sopenharmony_ci unsigned long i, end_pfn = start_pfn + nr_pages; 88048c2ecf20Sopenharmony_ci struct page *page; 88058c2ecf20Sopenharmony_ci 88068c2ecf20Sopenharmony_ci for (i = start_pfn; i < end_pfn; i++) { 88078c2ecf20Sopenharmony_ci page = pfn_to_online_page(i); 88088c2ecf20Sopenharmony_ci if (!page) 88098c2ecf20Sopenharmony_ci return false; 88108c2ecf20Sopenharmony_ci 
88118c2ecf20Sopenharmony_ci if (page_zone(page) != z) 88128c2ecf20Sopenharmony_ci return false; 88138c2ecf20Sopenharmony_ci 88148c2ecf20Sopenharmony_ci if (PageReserved(page)) 88158c2ecf20Sopenharmony_ci return false; 88168c2ecf20Sopenharmony_ci 88178c2ecf20Sopenharmony_ci if (page_count(page) > 0) 88188c2ecf20Sopenharmony_ci return false; 88198c2ecf20Sopenharmony_ci 88208c2ecf20Sopenharmony_ci if (PageHuge(page)) 88218c2ecf20Sopenharmony_ci return false; 88228c2ecf20Sopenharmony_ci } 88238c2ecf20Sopenharmony_ci return true; 88248c2ecf20Sopenharmony_ci} 88258c2ecf20Sopenharmony_ci 88268c2ecf20Sopenharmony_cistatic bool zone_spans_last_pfn(const struct zone *zone, 88278c2ecf20Sopenharmony_ci unsigned long start_pfn, unsigned long nr_pages) 88288c2ecf20Sopenharmony_ci{ 88298c2ecf20Sopenharmony_ci unsigned long last_pfn = start_pfn + nr_pages - 1; 88308c2ecf20Sopenharmony_ci 88318c2ecf20Sopenharmony_ci return zone_spans_pfn(zone, last_pfn); 88328c2ecf20Sopenharmony_ci} 88338c2ecf20Sopenharmony_ci 88348c2ecf20Sopenharmony_ci/** 88358c2ecf20Sopenharmony_ci * alloc_contig_pages() -- tries to find and allocate contiguous range of pages 88368c2ecf20Sopenharmony_ci * @nr_pages: Number of contiguous pages to allocate 88378c2ecf20Sopenharmony_ci * @gfp_mask: GFP mask to limit search and used during compaction 88388c2ecf20Sopenharmony_ci * @nid: Target node 88398c2ecf20Sopenharmony_ci * @nodemask: Mask for other possible nodes 88408c2ecf20Sopenharmony_ci * 88418c2ecf20Sopenharmony_ci * This routine is a wrapper around alloc_contig_range(). It scans over zones 88428c2ecf20Sopenharmony_ci * on an applicable zonelist to find a contiguous pfn range which can then be 88438c2ecf20Sopenharmony_ci * tried for allocation with alloc_contig_range(). This routine is intended 88448c2ecf20Sopenharmony_ci * for allocation requests which can not be fulfilled with the buddy allocator. 
 *
 * The allocated memory is always aligned to a page boundary.  If nr_pages is a
 * power of two then the alignment is guaranteed to be to the given nr_pages
 * (e.g. 1GB request would be aligned to 1GB).
 *
 * Allocated pages can be freed with free_contig_range() or by manually calling
 * __free_page() on each allocated page.
 *
 * Return: pointer to contiguous pages on success, or NULL if not successful.
 */
struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
				int nid, nodemask_t *nodemask)
{
	unsigned long ret, pfn, flags;
	struct zonelist *zonelist;
	struct zone *zone;
	struct zoneref *z;

	zonelist = node_zonelist(nid, gfp_mask);
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(gfp_mask), nodemask) {
		spin_lock_irqsave(&zone->lock, flags);

		/* Scan the zone in nr_pages-aligned, nr_pages-sized steps. */
		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
			if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
				/*
				 * We release the zone lock here because
				 * alloc_contig_range() will also lock the zone
				 * at some point.  If there's an allocation
				 * spinning on this lock, it may win the race
				 * and cause alloc_contig_range() to fail...
				 */
				spin_unlock_irqrestore(&zone->lock, flags);
				ret = __alloc_contig_pages(pfn, nr_pages,
							   gfp_mask);
				if (!ret)
					return pfn_to_page(pfn);
				/* Lost the race; retake the lock and keep scanning. */
				spin_lock_irqsave(&zone->lock, flags);
			}
			pfn += nr_pages;
		}
		spin_unlock_irqrestore(&zone->lock, flags);
	}
	return NULL;
}
#endif /* CONFIG_CONTIG_ALLOC */

/*
 * Free a pfn range previously handed out by alloc_contig_range() /
 * alloc_contig_pages().  Warns if any page in the range is still held by
 * someone else (refcount != 1) at the time it is freed.
 */
void free_contig_range(unsigned long pfn, unsigned int nr_pages)
{
	unsigned int count = 0;

	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		/* Refcount other than 1 means an extra reference is leaked. */
		count += page_count(page) != 1;
		__free_page(page);
	}
	WARN(count != 0, "%d pages are still in use!\n", count);
}
EXPORT_SYMBOL(free_contig_range);

/*
 * The zone indicated has a new number of managed_pages; batch sizes and percpu
 * page high values need to be recalculated.
 */
void __meminit zone_pcp_update(struct zone *zone)
{
	mutex_lock(&pcp_batch_high_lock);
	__zone_pcp_update(zone);
	mutex_unlock(&pcp_batch_high_lock);
}

/*
 * Drop @zone's dynamically allocated percpu pagesets and fall back to the
 * static boot_pageset.  Presumably used when a zone is torn down (e.g.
 * memory hot-remove) — confirm against callers outside this file.
 */
void zone_pcp_reset(struct zone *zone)
{
	unsigned long flags;
	int cpu;
	struct per_cpu_pageset *pset;

	/* avoid races with drain_pages() */
	local_irq_save(flags);
	if (zone->pageset != &boot_pageset) {
		for_each_online_cpu(cpu) {
			pset = per_cpu_ptr(zone->pageset, cpu);
			drain_zonestat(zone, pset);
		}
		free_percpu(zone->pageset);
		zone->pageset = &boot_pageset;
	}
	local_irq_restore(flags);
}

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * All pages in the range must be in a single zone, must not contain holes,
 * must span full sections, and must be isolated before calling this function.
89418c2ecf20Sopenharmony_ci */ 89428c2ecf20Sopenharmony_civoid __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 89438c2ecf20Sopenharmony_ci{ 89448c2ecf20Sopenharmony_ci unsigned long pfn = start_pfn; 89458c2ecf20Sopenharmony_ci struct page *page; 89468c2ecf20Sopenharmony_ci struct zone *zone; 89478c2ecf20Sopenharmony_ci unsigned int order; 89488c2ecf20Sopenharmony_ci unsigned long flags; 89498c2ecf20Sopenharmony_ci 89508c2ecf20Sopenharmony_ci offline_mem_sections(pfn, end_pfn); 89518c2ecf20Sopenharmony_ci zone = page_zone(pfn_to_page(pfn)); 89528c2ecf20Sopenharmony_ci spin_lock_irqsave(&zone->lock, flags); 89538c2ecf20Sopenharmony_ci while (pfn < end_pfn) { 89548c2ecf20Sopenharmony_ci page = pfn_to_page(pfn); 89558c2ecf20Sopenharmony_ci /* 89568c2ecf20Sopenharmony_ci * The HWPoisoned page may be not in buddy system, and 89578c2ecf20Sopenharmony_ci * page_count() is not 0. 89588c2ecf20Sopenharmony_ci */ 89598c2ecf20Sopenharmony_ci if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 89608c2ecf20Sopenharmony_ci pfn++; 89618c2ecf20Sopenharmony_ci continue; 89628c2ecf20Sopenharmony_ci } 89638c2ecf20Sopenharmony_ci /* 89648c2ecf20Sopenharmony_ci * At this point all remaining PageOffline() pages have a 89658c2ecf20Sopenharmony_ci * reference count of 0 and can simply be skipped. 
89668c2ecf20Sopenharmony_ci */ 89678c2ecf20Sopenharmony_ci if (PageOffline(page)) { 89688c2ecf20Sopenharmony_ci BUG_ON(page_count(page)); 89698c2ecf20Sopenharmony_ci BUG_ON(PageBuddy(page)); 89708c2ecf20Sopenharmony_ci pfn++; 89718c2ecf20Sopenharmony_ci continue; 89728c2ecf20Sopenharmony_ci } 89738c2ecf20Sopenharmony_ci 89748c2ecf20Sopenharmony_ci BUG_ON(page_count(page)); 89758c2ecf20Sopenharmony_ci BUG_ON(!PageBuddy(page)); 89768c2ecf20Sopenharmony_ci order = buddy_order(page); 89778c2ecf20Sopenharmony_ci del_page_from_free_list(page, zone, order); 89788c2ecf20Sopenharmony_ci pfn += (1 << order); 89798c2ecf20Sopenharmony_ci } 89808c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&zone->lock, flags); 89818c2ecf20Sopenharmony_ci} 89828c2ecf20Sopenharmony_ci#endif 89838c2ecf20Sopenharmony_ci 89848c2ecf20Sopenharmony_cibool is_free_buddy_page(struct page *page) 89858c2ecf20Sopenharmony_ci{ 89868c2ecf20Sopenharmony_ci struct zone *zone = page_zone(page); 89878c2ecf20Sopenharmony_ci unsigned long pfn = page_to_pfn(page); 89888c2ecf20Sopenharmony_ci unsigned long flags; 89898c2ecf20Sopenharmony_ci unsigned int order; 89908c2ecf20Sopenharmony_ci 89918c2ecf20Sopenharmony_ci spin_lock_irqsave(&zone->lock, flags); 89928c2ecf20Sopenharmony_ci for (order = 0; order < MAX_ORDER; order++) { 89938c2ecf20Sopenharmony_ci struct page *page_head = page - (pfn & ((1 << order) - 1)); 89948c2ecf20Sopenharmony_ci 89958c2ecf20Sopenharmony_ci if (PageBuddy(page_head) && buddy_order(page_head) >= order) 89968c2ecf20Sopenharmony_ci break; 89978c2ecf20Sopenharmony_ci } 89988c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&zone->lock, flags); 89998c2ecf20Sopenharmony_ci 90008c2ecf20Sopenharmony_ci return order < MAX_ORDER; 90018c2ecf20Sopenharmony_ci} 90028c2ecf20Sopenharmony_ci 90038c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMORY_FAILURE 90048c2ecf20Sopenharmony_ci/* 90058c2ecf20Sopenharmony_ci * Break down a higher-order page in sub-pages, and keep our target out of 90068c2ecf20Sopenharmony_ci * 
buddy allocator. 90078c2ecf20Sopenharmony_ci */ 90088c2ecf20Sopenharmony_cistatic void break_down_buddy_pages(struct zone *zone, struct page *page, 90098c2ecf20Sopenharmony_ci struct page *target, int low, int high, 90108c2ecf20Sopenharmony_ci int migratetype) 90118c2ecf20Sopenharmony_ci{ 90128c2ecf20Sopenharmony_ci unsigned long size = 1 << high; 90138c2ecf20Sopenharmony_ci struct page *current_buddy, *next_page; 90148c2ecf20Sopenharmony_ci 90158c2ecf20Sopenharmony_ci while (high > low) { 90168c2ecf20Sopenharmony_ci high--; 90178c2ecf20Sopenharmony_ci size >>= 1; 90188c2ecf20Sopenharmony_ci 90198c2ecf20Sopenharmony_ci if (target >= &page[size]) { 90208c2ecf20Sopenharmony_ci next_page = page + size; 90218c2ecf20Sopenharmony_ci current_buddy = page; 90228c2ecf20Sopenharmony_ci } else { 90238c2ecf20Sopenharmony_ci next_page = page; 90248c2ecf20Sopenharmony_ci current_buddy = page + size; 90258c2ecf20Sopenharmony_ci } 90268c2ecf20Sopenharmony_ci page = next_page; 90278c2ecf20Sopenharmony_ci 90288c2ecf20Sopenharmony_ci if (set_page_guard(zone, current_buddy, high, migratetype)) 90298c2ecf20Sopenharmony_ci continue; 90308c2ecf20Sopenharmony_ci 90318c2ecf20Sopenharmony_ci if (current_buddy != target) { 90328c2ecf20Sopenharmony_ci add_to_free_list(current_buddy, zone, high, migratetype); 90338c2ecf20Sopenharmony_ci set_buddy_order(current_buddy, high); 90348c2ecf20Sopenharmony_ci } 90358c2ecf20Sopenharmony_ci } 90368c2ecf20Sopenharmony_ci} 90378c2ecf20Sopenharmony_ci 90388c2ecf20Sopenharmony_ci/* 90398c2ecf20Sopenharmony_ci * Take a page that will be marked as poisoned off the buddy allocator. 
90408c2ecf20Sopenharmony_ci */ 90418c2ecf20Sopenharmony_cibool take_page_off_buddy(struct page *page) 90428c2ecf20Sopenharmony_ci{ 90438c2ecf20Sopenharmony_ci struct zone *zone = page_zone(page); 90448c2ecf20Sopenharmony_ci unsigned long pfn = page_to_pfn(page); 90458c2ecf20Sopenharmony_ci unsigned long flags; 90468c2ecf20Sopenharmony_ci unsigned int order; 90478c2ecf20Sopenharmony_ci bool ret = false; 90488c2ecf20Sopenharmony_ci 90498c2ecf20Sopenharmony_ci spin_lock_irqsave(&zone->lock, flags); 90508c2ecf20Sopenharmony_ci for (order = 0; order < MAX_ORDER; order++) { 90518c2ecf20Sopenharmony_ci struct page *page_head = page - (pfn & ((1 << order) - 1)); 90528c2ecf20Sopenharmony_ci int page_order = buddy_order(page_head); 90538c2ecf20Sopenharmony_ci 90548c2ecf20Sopenharmony_ci if (PageBuddy(page_head) && page_order >= order) { 90558c2ecf20Sopenharmony_ci unsigned long pfn_head = page_to_pfn(page_head); 90568c2ecf20Sopenharmony_ci int migratetype = get_pfnblock_migratetype(page_head, 90578c2ecf20Sopenharmony_ci pfn_head); 90588c2ecf20Sopenharmony_ci 90598c2ecf20Sopenharmony_ci del_page_from_free_list(page_head, zone, page_order); 90608c2ecf20Sopenharmony_ci break_down_buddy_pages(zone, page_head, page, 0, 90618c2ecf20Sopenharmony_ci page_order, migratetype); 90628c2ecf20Sopenharmony_ci if (!is_migrate_isolate(migratetype)) 90638c2ecf20Sopenharmony_ci __mod_zone_freepage_state(zone, -1, migratetype); 90648c2ecf20Sopenharmony_ci ret = true; 90658c2ecf20Sopenharmony_ci break; 90668c2ecf20Sopenharmony_ci } 90678c2ecf20Sopenharmony_ci if (page_count(page_head) > 0) 90688c2ecf20Sopenharmony_ci break; 90698c2ecf20Sopenharmony_ci } 90708c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&zone->lock, flags); 90718c2ecf20Sopenharmony_ci return ret; 90728c2ecf20Sopenharmony_ci} 90738c2ecf20Sopenharmony_ci#endif 90748c2ecf20Sopenharmony_ci 90758c2ecf20Sopenharmony_ci#ifdef CONFIG_ZONE_DMA 90768c2ecf20Sopenharmony_cibool has_managed_dma(void) 90778c2ecf20Sopenharmony_ci{ 
90788c2ecf20Sopenharmony_ci struct pglist_data *pgdat; 90798c2ecf20Sopenharmony_ci 90808c2ecf20Sopenharmony_ci for_each_online_pgdat(pgdat) { 90818c2ecf20Sopenharmony_ci struct zone *zone = &pgdat->node_zones[ZONE_DMA]; 90828c2ecf20Sopenharmony_ci 90838c2ecf20Sopenharmony_ci if (managed_zone(zone)) 90848c2ecf20Sopenharmony_ci return true; 90858c2ecf20Sopenharmony_ci } 90868c2ecf20Sopenharmony_ci return false; 90878c2ecf20Sopenharmony_ci} 90888c2ecf20Sopenharmony_ci#endif /* CONFIG_ZONE_DMA */ 9089