162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * linux/mm/memory_hotplug.c 462306a36Sopenharmony_ci * 562306a36Sopenharmony_ci * Copyright (C) 662306a36Sopenharmony_ci */ 762306a36Sopenharmony_ci 862306a36Sopenharmony_ci#include <linux/stddef.h> 962306a36Sopenharmony_ci#include <linux/mm.h> 1062306a36Sopenharmony_ci#include <linux/sched/signal.h> 1162306a36Sopenharmony_ci#include <linux/swap.h> 1262306a36Sopenharmony_ci#include <linux/interrupt.h> 1362306a36Sopenharmony_ci#include <linux/pagemap.h> 1462306a36Sopenharmony_ci#include <linux/compiler.h> 1562306a36Sopenharmony_ci#include <linux/export.h> 1662306a36Sopenharmony_ci#include <linux/writeback.h> 1762306a36Sopenharmony_ci#include <linux/slab.h> 1862306a36Sopenharmony_ci#include <linux/sysctl.h> 1962306a36Sopenharmony_ci#include <linux/cpu.h> 2062306a36Sopenharmony_ci#include <linux/memory.h> 2162306a36Sopenharmony_ci#include <linux/memremap.h> 2262306a36Sopenharmony_ci#include <linux/memory_hotplug.h> 2362306a36Sopenharmony_ci#include <linux/vmalloc.h> 2462306a36Sopenharmony_ci#include <linux/ioport.h> 2562306a36Sopenharmony_ci#include <linux/delay.h> 2662306a36Sopenharmony_ci#include <linux/migrate.h> 2762306a36Sopenharmony_ci#include <linux/page-isolation.h> 2862306a36Sopenharmony_ci#include <linux/pfn.h> 2962306a36Sopenharmony_ci#include <linux/suspend.h> 3062306a36Sopenharmony_ci#include <linux/mm_inline.h> 3162306a36Sopenharmony_ci#include <linux/firmware-map.h> 3262306a36Sopenharmony_ci#include <linux/stop_machine.h> 3362306a36Sopenharmony_ci#include <linux/hugetlb.h> 3462306a36Sopenharmony_ci#include <linux/memblock.h> 3562306a36Sopenharmony_ci#include <linux/compaction.h> 3662306a36Sopenharmony_ci#include <linux/rmap.h> 3762306a36Sopenharmony_ci#include <linux/module.h> 3862306a36Sopenharmony_ci#include <linux/zswapd.h> 3962306a36Sopenharmony_ci 4062306a36Sopenharmony_ci#include <asm/tlbflush.h> 4162306a36Sopenharmony_ci 
4262306a36Sopenharmony_ci#include "internal.h" 4362306a36Sopenharmony_ci#include "shuffle.h" 4462306a36Sopenharmony_ci 4562306a36Sopenharmony_cienum { 4662306a36Sopenharmony_ci MEMMAP_ON_MEMORY_DISABLE = 0, 4762306a36Sopenharmony_ci MEMMAP_ON_MEMORY_ENABLE, 4862306a36Sopenharmony_ci MEMMAP_ON_MEMORY_FORCE, 4962306a36Sopenharmony_ci}; 5062306a36Sopenharmony_ci 5162306a36Sopenharmony_cistatic int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE; 5262306a36Sopenharmony_ci 5362306a36Sopenharmony_cistatic inline unsigned long memory_block_memmap_size(void) 5462306a36Sopenharmony_ci{ 5562306a36Sopenharmony_ci return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page); 5662306a36Sopenharmony_ci} 5762306a36Sopenharmony_ci 5862306a36Sopenharmony_cistatic inline unsigned long memory_block_memmap_on_memory_pages(void) 5962306a36Sopenharmony_ci{ 6062306a36Sopenharmony_ci unsigned long nr_pages = PFN_UP(memory_block_memmap_size()); 6162306a36Sopenharmony_ci 6262306a36Sopenharmony_ci /* 6362306a36Sopenharmony_ci * In "forced" memmap_on_memory mode, we add extra pages to align the 6462306a36Sopenharmony_ci * vmemmap size to cover full pageblocks. That way, we can add memory 6562306a36Sopenharmony_ci * even if the vmemmap size is not properly aligned, however, we might waste 6662306a36Sopenharmony_ci * memory. 
6762306a36Sopenharmony_ci */ 6862306a36Sopenharmony_ci if (memmap_mode == MEMMAP_ON_MEMORY_FORCE) 6962306a36Sopenharmony_ci return pageblock_align(nr_pages); 7062306a36Sopenharmony_ci return nr_pages; 7162306a36Sopenharmony_ci} 7262306a36Sopenharmony_ci 7362306a36Sopenharmony_ci#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY 7462306a36Sopenharmony_ci/* 7562306a36Sopenharmony_ci * memory_hotplug.memmap_on_memory parameter 7662306a36Sopenharmony_ci */ 7762306a36Sopenharmony_cistatic int set_memmap_mode(const char *val, const struct kernel_param *kp) 7862306a36Sopenharmony_ci{ 7962306a36Sopenharmony_ci int ret, mode; 8062306a36Sopenharmony_ci bool enabled; 8162306a36Sopenharmony_ci 8262306a36Sopenharmony_ci if (sysfs_streq(val, "force") || sysfs_streq(val, "FORCE")) { 8362306a36Sopenharmony_ci mode = MEMMAP_ON_MEMORY_FORCE; 8462306a36Sopenharmony_ci } else { 8562306a36Sopenharmony_ci ret = kstrtobool(val, &enabled); 8662306a36Sopenharmony_ci if (ret < 0) 8762306a36Sopenharmony_ci return ret; 8862306a36Sopenharmony_ci if (enabled) 8962306a36Sopenharmony_ci mode = MEMMAP_ON_MEMORY_ENABLE; 9062306a36Sopenharmony_ci else 9162306a36Sopenharmony_ci mode = MEMMAP_ON_MEMORY_DISABLE; 9262306a36Sopenharmony_ci } 9362306a36Sopenharmony_ci *((int *)kp->arg) = mode; 9462306a36Sopenharmony_ci if (mode == MEMMAP_ON_MEMORY_FORCE) { 9562306a36Sopenharmony_ci unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); 9662306a36Sopenharmony_ci 9762306a36Sopenharmony_ci pr_info_once("Memory hotplug will waste %ld pages in each memory block\n", 9862306a36Sopenharmony_ci memmap_pages - PFN_UP(memory_block_memmap_size())); 9962306a36Sopenharmony_ci } 10062306a36Sopenharmony_ci return 0; 10162306a36Sopenharmony_ci} 10262306a36Sopenharmony_ci 10362306a36Sopenharmony_cistatic int get_memmap_mode(char *buffer, const struct kernel_param *kp) 10462306a36Sopenharmony_ci{ 10562306a36Sopenharmony_ci int mode = *((int *)kp->arg); 10662306a36Sopenharmony_ci 10762306a36Sopenharmony_ci if (mode == 
MEMMAP_ON_MEMORY_FORCE) 10862306a36Sopenharmony_ci return sprintf(buffer, "force\n"); 10962306a36Sopenharmony_ci return sprintf(buffer, "%c\n", mode ? 'Y' : 'N'); 11062306a36Sopenharmony_ci} 11162306a36Sopenharmony_ci 11262306a36Sopenharmony_cistatic const struct kernel_param_ops memmap_mode_ops = { 11362306a36Sopenharmony_ci .set = set_memmap_mode, 11462306a36Sopenharmony_ci .get = get_memmap_mode, 11562306a36Sopenharmony_ci}; 11662306a36Sopenharmony_cimodule_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444); 11762306a36Sopenharmony_ciMODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n" 11862306a36Sopenharmony_ci "With value \"force\" it could result in memory wastage due " 11962306a36Sopenharmony_ci "to memmap size limitations (Y/N/force)"); 12062306a36Sopenharmony_ci 12162306a36Sopenharmony_cistatic inline bool mhp_memmap_on_memory(void) 12262306a36Sopenharmony_ci{ 12362306a36Sopenharmony_ci return memmap_mode != MEMMAP_ON_MEMORY_DISABLE; 12462306a36Sopenharmony_ci} 12562306a36Sopenharmony_ci#else 12662306a36Sopenharmony_cistatic inline bool mhp_memmap_on_memory(void) 12762306a36Sopenharmony_ci{ 12862306a36Sopenharmony_ci return false; 12962306a36Sopenharmony_ci} 13062306a36Sopenharmony_ci#endif 13162306a36Sopenharmony_ci 13262306a36Sopenharmony_cienum { 13362306a36Sopenharmony_ci ONLINE_POLICY_CONTIG_ZONES = 0, 13462306a36Sopenharmony_ci ONLINE_POLICY_AUTO_MOVABLE, 13562306a36Sopenharmony_ci}; 13662306a36Sopenharmony_ci 13762306a36Sopenharmony_cistatic const char * const online_policy_to_str[] = { 13862306a36Sopenharmony_ci [ONLINE_POLICY_CONTIG_ZONES] = "contig-zones", 13962306a36Sopenharmony_ci [ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable", 14062306a36Sopenharmony_ci}; 14162306a36Sopenharmony_ci 14262306a36Sopenharmony_cistatic int set_online_policy(const char *val, const struct kernel_param *kp) 14362306a36Sopenharmony_ci{ 14462306a36Sopenharmony_ci int ret = sysfs_match_string(online_policy_to_str, val); 
14562306a36Sopenharmony_ci 14662306a36Sopenharmony_ci if (ret < 0) 14762306a36Sopenharmony_ci return ret; 14862306a36Sopenharmony_ci *((int *)kp->arg) = ret; 14962306a36Sopenharmony_ci return 0; 15062306a36Sopenharmony_ci} 15162306a36Sopenharmony_ci 15262306a36Sopenharmony_cistatic int get_online_policy(char *buffer, const struct kernel_param *kp) 15362306a36Sopenharmony_ci{ 15462306a36Sopenharmony_ci return sprintf(buffer, "%s\n", online_policy_to_str[*((int *)kp->arg)]); 15562306a36Sopenharmony_ci} 15662306a36Sopenharmony_ci 15762306a36Sopenharmony_ci/* 15862306a36Sopenharmony_ci * memory_hotplug.online_policy: configure online behavior when onlining without 15962306a36Sopenharmony_ci * specifying a zone (MMOP_ONLINE) 16062306a36Sopenharmony_ci * 16162306a36Sopenharmony_ci * "contig-zones": keep zone contiguous 16262306a36Sopenharmony_ci * "auto-movable": online memory to ZONE_MOVABLE if the configuration 16362306a36Sopenharmony_ci * (auto_movable_ratio, auto_movable_numa_aware) allows for it 16462306a36Sopenharmony_ci */ 16562306a36Sopenharmony_cistatic int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES; 16662306a36Sopenharmony_cistatic const struct kernel_param_ops online_policy_ops = { 16762306a36Sopenharmony_ci .set = set_online_policy, 16862306a36Sopenharmony_ci .get = get_online_policy, 16962306a36Sopenharmony_ci}; 17062306a36Sopenharmony_cimodule_param_cb(online_policy, &online_policy_ops, &online_policy, 0644); 17162306a36Sopenharmony_ciMODULE_PARM_DESC(online_policy, 17262306a36Sopenharmony_ci "Set the online policy (\"contig-zones\", \"auto-movable\") " 17362306a36Sopenharmony_ci "Default: \"contig-zones\""); 17462306a36Sopenharmony_ci 17562306a36Sopenharmony_ci/* 17662306a36Sopenharmony_ci * memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio 17762306a36Sopenharmony_ci * 17862306a36Sopenharmony_ci * The ratio represent an upper limit and the kernel might decide to not 17962306a36Sopenharmony_ci * online some memory to 
ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory 18062306a36Sopenharmony_ci * doesn't allow for more MOVABLE memory. 18162306a36Sopenharmony_ci */ 18262306a36Sopenharmony_cistatic unsigned int auto_movable_ratio __read_mostly = 301; 18362306a36Sopenharmony_cimodule_param(auto_movable_ratio, uint, 0644); 18462306a36Sopenharmony_ciMODULE_PARM_DESC(auto_movable_ratio, 18562306a36Sopenharmony_ci "Set the maximum ratio of MOVABLE:KERNEL memory in the system " 18662306a36Sopenharmony_ci "in percent for \"auto-movable\" online policy. Default: 301"); 18762306a36Sopenharmony_ci 18862306a36Sopenharmony_ci/* 18962306a36Sopenharmony_ci * memory_hotplug.auto_movable_numa_aware: consider numa node stats 19062306a36Sopenharmony_ci */ 19162306a36Sopenharmony_ci#ifdef CONFIG_NUMA 19262306a36Sopenharmony_cistatic bool auto_movable_numa_aware __read_mostly = true; 19362306a36Sopenharmony_cimodule_param(auto_movable_numa_aware, bool, 0644); 19462306a36Sopenharmony_ciMODULE_PARM_DESC(auto_movable_numa_aware, 19562306a36Sopenharmony_ci "Consider numa node stats in addition to global stats in " 19662306a36Sopenharmony_ci "\"auto-movable\" online policy. Default: true"); 19762306a36Sopenharmony_ci#endif /* CONFIG_NUMA */ 19862306a36Sopenharmony_ci 19962306a36Sopenharmony_ci/* 20062306a36Sopenharmony_ci * online_page_callback contains pointer to current page onlining function. 20162306a36Sopenharmony_ci * Initially it is generic_online_page(). If it is required it could be 20262306a36Sopenharmony_ci * changed by calling set_online_page_callback() for callback registration 20362306a36Sopenharmony_ci * and restore_online_page_callback() for generic callback restore. 
20462306a36Sopenharmony_ci */ 20562306a36Sopenharmony_ci 20662306a36Sopenharmony_cistatic online_page_callback_t online_page_callback = generic_online_page; 20762306a36Sopenharmony_cistatic DEFINE_MUTEX(online_page_callback_lock); 20862306a36Sopenharmony_ci 20962306a36Sopenharmony_ciDEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock); 21062306a36Sopenharmony_ci 21162306a36Sopenharmony_civoid get_online_mems(void) 21262306a36Sopenharmony_ci{ 21362306a36Sopenharmony_ci percpu_down_read(&mem_hotplug_lock); 21462306a36Sopenharmony_ci} 21562306a36Sopenharmony_ci 21662306a36Sopenharmony_civoid put_online_mems(void) 21762306a36Sopenharmony_ci{ 21862306a36Sopenharmony_ci percpu_up_read(&mem_hotplug_lock); 21962306a36Sopenharmony_ci} 22062306a36Sopenharmony_ci 22162306a36Sopenharmony_cibool movable_node_enabled = false; 22262306a36Sopenharmony_ci 22362306a36Sopenharmony_ci#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE 22462306a36Sopenharmony_ciint mhp_default_online_type = MMOP_OFFLINE; 22562306a36Sopenharmony_ci#else 22662306a36Sopenharmony_ciint mhp_default_online_type = MMOP_ONLINE; 22762306a36Sopenharmony_ci#endif 22862306a36Sopenharmony_ci 22962306a36Sopenharmony_cistatic int __init setup_memhp_default_state(char *str) 23062306a36Sopenharmony_ci{ 23162306a36Sopenharmony_ci const int online_type = mhp_online_type_from_str(str); 23262306a36Sopenharmony_ci 23362306a36Sopenharmony_ci if (online_type >= 0) 23462306a36Sopenharmony_ci mhp_default_online_type = online_type; 23562306a36Sopenharmony_ci 23662306a36Sopenharmony_ci return 1; 23762306a36Sopenharmony_ci} 23862306a36Sopenharmony_ci__setup("memhp_default_state=", setup_memhp_default_state); 23962306a36Sopenharmony_ci 24062306a36Sopenharmony_civoid mem_hotplug_begin(void) 24162306a36Sopenharmony_ci{ 24262306a36Sopenharmony_ci cpus_read_lock(); 24362306a36Sopenharmony_ci percpu_down_write(&mem_hotplug_lock); 24462306a36Sopenharmony_ci} 24562306a36Sopenharmony_ci 24662306a36Sopenharmony_civoid mem_hotplug_done(void) 
24762306a36Sopenharmony_ci{ 24862306a36Sopenharmony_ci percpu_up_write(&mem_hotplug_lock); 24962306a36Sopenharmony_ci cpus_read_unlock(); 25062306a36Sopenharmony_ci} 25162306a36Sopenharmony_ci 25262306a36Sopenharmony_ciu64 max_mem_size = U64_MAX; 25362306a36Sopenharmony_ci 25462306a36Sopenharmony_ci/* add this memory to iomem resource */ 25562306a36Sopenharmony_cistatic struct resource *register_memory_resource(u64 start, u64 size, 25662306a36Sopenharmony_ci const char *resource_name) 25762306a36Sopenharmony_ci{ 25862306a36Sopenharmony_ci struct resource *res; 25962306a36Sopenharmony_ci unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; 26062306a36Sopenharmony_ci 26162306a36Sopenharmony_ci if (strcmp(resource_name, "System RAM")) 26262306a36Sopenharmony_ci flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED; 26362306a36Sopenharmony_ci 26462306a36Sopenharmony_ci if (!mhp_range_allowed(start, size, true)) 26562306a36Sopenharmony_ci return ERR_PTR(-E2BIG); 26662306a36Sopenharmony_ci 26762306a36Sopenharmony_ci /* 26862306a36Sopenharmony_ci * Make sure value parsed from 'mem=' only restricts memory adding 26962306a36Sopenharmony_ci * while booting, so that memory hotplug won't be impacted. Please 27062306a36Sopenharmony_ci * refer to document of 'mem=' in kernel-parameters.txt for more 27162306a36Sopenharmony_ci * details. 27262306a36Sopenharmony_ci */ 27362306a36Sopenharmony_ci if (start + size > max_mem_size && system_state < SYSTEM_RUNNING) 27462306a36Sopenharmony_ci return ERR_PTR(-E2BIG); 27562306a36Sopenharmony_ci 27662306a36Sopenharmony_ci /* 27762306a36Sopenharmony_ci * Request ownership of the new memory range. This might be 27862306a36Sopenharmony_ci * a child of an existing resource that was present but 27962306a36Sopenharmony_ci * not marked as busy. 
28062306a36Sopenharmony_ci */ 28162306a36Sopenharmony_ci res = __request_region(&iomem_resource, start, size, 28262306a36Sopenharmony_ci resource_name, flags); 28362306a36Sopenharmony_ci 28462306a36Sopenharmony_ci if (!res) { 28562306a36Sopenharmony_ci pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n", 28662306a36Sopenharmony_ci start, start + size); 28762306a36Sopenharmony_ci return ERR_PTR(-EEXIST); 28862306a36Sopenharmony_ci } 28962306a36Sopenharmony_ci return res; 29062306a36Sopenharmony_ci} 29162306a36Sopenharmony_ci 29262306a36Sopenharmony_cistatic void release_memory_resource(struct resource *res) 29362306a36Sopenharmony_ci{ 29462306a36Sopenharmony_ci if (!res) 29562306a36Sopenharmony_ci return; 29662306a36Sopenharmony_ci release_resource(res); 29762306a36Sopenharmony_ci kfree(res); 29862306a36Sopenharmony_ci} 29962306a36Sopenharmony_ci 30062306a36Sopenharmony_cistatic int check_pfn_span(unsigned long pfn, unsigned long nr_pages) 30162306a36Sopenharmony_ci{ 30262306a36Sopenharmony_ci /* 30362306a36Sopenharmony_ci * Disallow all operations smaller than a sub-section and only 30462306a36Sopenharmony_ci * allow operations smaller than a section for 30562306a36Sopenharmony_ci * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range() 30662306a36Sopenharmony_ci * enforces a larger memory_block_size_bytes() granularity for 30762306a36Sopenharmony_ci * memory that will be marked online, so this check should only 30862306a36Sopenharmony_ci * fire for direct arch_{add,remove}_memory() users outside of 30962306a36Sopenharmony_ci * add_memory_resource(). 
31062306a36Sopenharmony_ci */ 31162306a36Sopenharmony_ci unsigned long min_align; 31262306a36Sopenharmony_ci 31362306a36Sopenharmony_ci if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) 31462306a36Sopenharmony_ci min_align = PAGES_PER_SUBSECTION; 31562306a36Sopenharmony_ci else 31662306a36Sopenharmony_ci min_align = PAGES_PER_SECTION; 31762306a36Sopenharmony_ci if (!IS_ALIGNED(pfn | nr_pages, min_align)) 31862306a36Sopenharmony_ci return -EINVAL; 31962306a36Sopenharmony_ci return 0; 32062306a36Sopenharmony_ci} 32162306a36Sopenharmony_ci 32262306a36Sopenharmony_ci/* 32362306a36Sopenharmony_ci * Return page for the valid pfn only if the page is online. All pfn 32462306a36Sopenharmony_ci * walkers which rely on the fully initialized page->flags and others 32562306a36Sopenharmony_ci * should use this rather than pfn_valid && pfn_to_page 32662306a36Sopenharmony_ci */ 32762306a36Sopenharmony_cistruct page *pfn_to_online_page(unsigned long pfn) 32862306a36Sopenharmony_ci{ 32962306a36Sopenharmony_ci unsigned long nr = pfn_to_section_nr(pfn); 33062306a36Sopenharmony_ci struct dev_pagemap *pgmap; 33162306a36Sopenharmony_ci struct mem_section *ms; 33262306a36Sopenharmony_ci 33362306a36Sopenharmony_ci if (nr >= NR_MEM_SECTIONS) 33462306a36Sopenharmony_ci return NULL; 33562306a36Sopenharmony_ci 33662306a36Sopenharmony_ci ms = __nr_to_section(nr); 33762306a36Sopenharmony_ci if (!online_section(ms)) 33862306a36Sopenharmony_ci return NULL; 33962306a36Sopenharmony_ci 34062306a36Sopenharmony_ci /* 34162306a36Sopenharmony_ci * Save some code text when online_section() + 34262306a36Sopenharmony_ci * pfn_section_valid() are sufficient. 
34362306a36Sopenharmony_ci */ 34462306a36Sopenharmony_ci if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn)) 34562306a36Sopenharmony_ci return NULL; 34662306a36Sopenharmony_ci 34762306a36Sopenharmony_ci if (!pfn_section_valid(ms, pfn)) 34862306a36Sopenharmony_ci return NULL; 34962306a36Sopenharmony_ci 35062306a36Sopenharmony_ci if (!online_device_section(ms)) 35162306a36Sopenharmony_ci return pfn_to_page(pfn); 35262306a36Sopenharmony_ci 35362306a36Sopenharmony_ci /* 35462306a36Sopenharmony_ci * Slowpath: when ZONE_DEVICE collides with 35562306a36Sopenharmony_ci * ZONE_{NORMAL,MOVABLE} within the same section some pfns in 35662306a36Sopenharmony_ci * the section may be 'offline' but 'valid'. Only 35762306a36Sopenharmony_ci * get_dev_pagemap() can determine sub-section online status. 35862306a36Sopenharmony_ci */ 35962306a36Sopenharmony_ci pgmap = get_dev_pagemap(pfn, NULL); 36062306a36Sopenharmony_ci put_dev_pagemap(pgmap); 36162306a36Sopenharmony_ci 36262306a36Sopenharmony_ci /* The presence of a pgmap indicates ZONE_DEVICE offline pfn */ 36362306a36Sopenharmony_ci if (pgmap) 36462306a36Sopenharmony_ci return NULL; 36562306a36Sopenharmony_ci 36662306a36Sopenharmony_ci return pfn_to_page(pfn); 36762306a36Sopenharmony_ci} 36862306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(pfn_to_online_page); 36962306a36Sopenharmony_ci 37062306a36Sopenharmony_ciint __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, 37162306a36Sopenharmony_ci struct mhp_params *params) 37262306a36Sopenharmony_ci{ 37362306a36Sopenharmony_ci const unsigned long end_pfn = pfn + nr_pages; 37462306a36Sopenharmony_ci unsigned long cur_nr_pages; 37562306a36Sopenharmony_ci int err; 37662306a36Sopenharmony_ci struct vmem_altmap *altmap = params->altmap; 37762306a36Sopenharmony_ci 37862306a36Sopenharmony_ci if (WARN_ON_ONCE(!pgprot_val(params->pgprot))) 37962306a36Sopenharmony_ci return -EINVAL; 38062306a36Sopenharmony_ci 38162306a36Sopenharmony_ci 
VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false)); 38262306a36Sopenharmony_ci 38362306a36Sopenharmony_ci if (altmap) { 38462306a36Sopenharmony_ci /* 38562306a36Sopenharmony_ci * Validate altmap is within bounds of the total request 38662306a36Sopenharmony_ci */ 38762306a36Sopenharmony_ci if (altmap->base_pfn != pfn 38862306a36Sopenharmony_ci || vmem_altmap_offset(altmap) > nr_pages) { 38962306a36Sopenharmony_ci pr_warn_once("memory add fail, invalid altmap\n"); 39062306a36Sopenharmony_ci return -EINVAL; 39162306a36Sopenharmony_ci } 39262306a36Sopenharmony_ci altmap->alloc = 0; 39362306a36Sopenharmony_ci } 39462306a36Sopenharmony_ci 39562306a36Sopenharmony_ci if (check_pfn_span(pfn, nr_pages)) { 39662306a36Sopenharmony_ci WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1); 39762306a36Sopenharmony_ci return -EINVAL; 39862306a36Sopenharmony_ci } 39962306a36Sopenharmony_ci 40062306a36Sopenharmony_ci for (; pfn < end_pfn; pfn += cur_nr_pages) { 40162306a36Sopenharmony_ci /* Select all remaining pages up to the next section boundary */ 40262306a36Sopenharmony_ci cur_nr_pages = min(end_pfn - pfn, 40362306a36Sopenharmony_ci SECTION_ALIGN_UP(pfn + 1) - pfn); 40462306a36Sopenharmony_ci err = sparse_add_section(nid, pfn, cur_nr_pages, altmap, 40562306a36Sopenharmony_ci params->pgmap); 40662306a36Sopenharmony_ci if (err) 40762306a36Sopenharmony_ci break; 40862306a36Sopenharmony_ci cond_resched(); 40962306a36Sopenharmony_ci } 41062306a36Sopenharmony_ci vmemmap_populate_print_last(); 41162306a36Sopenharmony_ci return err; 41262306a36Sopenharmony_ci} 41362306a36Sopenharmony_ci 41462306a36Sopenharmony_ci/* find the smallest valid pfn in the range [start_pfn, end_pfn) */ 41562306a36Sopenharmony_cistatic unsigned long find_smallest_section_pfn(int nid, struct zone *zone, 41662306a36Sopenharmony_ci unsigned long start_pfn, 41762306a36Sopenharmony_ci unsigned long end_pfn) 41862306a36Sopenharmony_ci{ 41962306a36Sopenharmony_ci 
for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) { 42062306a36Sopenharmony_ci if (unlikely(!pfn_to_online_page(start_pfn))) 42162306a36Sopenharmony_ci continue; 42262306a36Sopenharmony_ci 42362306a36Sopenharmony_ci if (unlikely(pfn_to_nid(start_pfn) != nid)) 42462306a36Sopenharmony_ci continue; 42562306a36Sopenharmony_ci 42662306a36Sopenharmony_ci if (zone != page_zone(pfn_to_page(start_pfn))) 42762306a36Sopenharmony_ci continue; 42862306a36Sopenharmony_ci 42962306a36Sopenharmony_ci return start_pfn; 43062306a36Sopenharmony_ci } 43162306a36Sopenharmony_ci 43262306a36Sopenharmony_ci return 0; 43362306a36Sopenharmony_ci} 43462306a36Sopenharmony_ci 43562306a36Sopenharmony_ci/* find the biggest valid pfn in the range [start_pfn, end_pfn). */ 43662306a36Sopenharmony_cistatic unsigned long find_biggest_section_pfn(int nid, struct zone *zone, 43762306a36Sopenharmony_ci unsigned long start_pfn, 43862306a36Sopenharmony_ci unsigned long end_pfn) 43962306a36Sopenharmony_ci{ 44062306a36Sopenharmony_ci unsigned long pfn; 44162306a36Sopenharmony_ci 44262306a36Sopenharmony_ci /* pfn is the end pfn of a memory section. 
*/ 44362306a36Sopenharmony_ci pfn = end_pfn - 1; 44462306a36Sopenharmony_ci for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) { 44562306a36Sopenharmony_ci if (unlikely(!pfn_to_online_page(pfn))) 44662306a36Sopenharmony_ci continue; 44762306a36Sopenharmony_ci 44862306a36Sopenharmony_ci if (unlikely(pfn_to_nid(pfn) != nid)) 44962306a36Sopenharmony_ci continue; 45062306a36Sopenharmony_ci 45162306a36Sopenharmony_ci if (zone != page_zone(pfn_to_page(pfn))) 45262306a36Sopenharmony_ci continue; 45362306a36Sopenharmony_ci 45462306a36Sopenharmony_ci return pfn; 45562306a36Sopenharmony_ci } 45662306a36Sopenharmony_ci 45762306a36Sopenharmony_ci return 0; 45862306a36Sopenharmony_ci} 45962306a36Sopenharmony_ci 46062306a36Sopenharmony_cistatic void shrink_zone_span(struct zone *zone, unsigned long start_pfn, 46162306a36Sopenharmony_ci unsigned long end_pfn) 46262306a36Sopenharmony_ci{ 46362306a36Sopenharmony_ci unsigned long pfn; 46462306a36Sopenharmony_ci int nid = zone_to_nid(zone); 46562306a36Sopenharmony_ci 46662306a36Sopenharmony_ci if (zone->zone_start_pfn == start_pfn) { 46762306a36Sopenharmony_ci /* 46862306a36Sopenharmony_ci * If the section is smallest section in the zone, it need 46962306a36Sopenharmony_ci * shrink zone->zone_start_pfn and zone->zone_spanned_pages. 47062306a36Sopenharmony_ci * In this case, we find second smallest valid mem_section 47162306a36Sopenharmony_ci * for shrinking zone. 
47262306a36Sopenharmony_ci */ 47362306a36Sopenharmony_ci pfn = find_smallest_section_pfn(nid, zone, end_pfn, 47462306a36Sopenharmony_ci zone_end_pfn(zone)); 47562306a36Sopenharmony_ci if (pfn) { 47662306a36Sopenharmony_ci zone->spanned_pages = zone_end_pfn(zone) - pfn; 47762306a36Sopenharmony_ci zone->zone_start_pfn = pfn; 47862306a36Sopenharmony_ci } else { 47962306a36Sopenharmony_ci zone->zone_start_pfn = 0; 48062306a36Sopenharmony_ci zone->spanned_pages = 0; 48162306a36Sopenharmony_ci } 48262306a36Sopenharmony_ci } else if (zone_end_pfn(zone) == end_pfn) { 48362306a36Sopenharmony_ci /* 48462306a36Sopenharmony_ci * If the section is biggest section in the zone, it need 48562306a36Sopenharmony_ci * shrink zone->spanned_pages. 48662306a36Sopenharmony_ci * In this case, we find second biggest valid mem_section for 48762306a36Sopenharmony_ci * shrinking zone. 48862306a36Sopenharmony_ci */ 48962306a36Sopenharmony_ci pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn, 49062306a36Sopenharmony_ci start_pfn); 49162306a36Sopenharmony_ci if (pfn) 49262306a36Sopenharmony_ci zone->spanned_pages = pfn - zone->zone_start_pfn + 1; 49362306a36Sopenharmony_ci else { 49462306a36Sopenharmony_ci zone->zone_start_pfn = 0; 49562306a36Sopenharmony_ci zone->spanned_pages = 0; 49662306a36Sopenharmony_ci } 49762306a36Sopenharmony_ci } 49862306a36Sopenharmony_ci} 49962306a36Sopenharmony_ci 50062306a36Sopenharmony_cistatic void update_pgdat_span(struct pglist_data *pgdat) 50162306a36Sopenharmony_ci{ 50262306a36Sopenharmony_ci unsigned long node_start_pfn = 0, node_end_pfn = 0; 50362306a36Sopenharmony_ci struct zone *zone; 50462306a36Sopenharmony_ci 50562306a36Sopenharmony_ci for (zone = pgdat->node_zones; 50662306a36Sopenharmony_ci zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { 50762306a36Sopenharmony_ci unsigned long end_pfn = zone_end_pfn(zone); 50862306a36Sopenharmony_ci 50962306a36Sopenharmony_ci /* No need to lock the zones, they can't change. 
*/ 51062306a36Sopenharmony_ci if (!zone->spanned_pages) 51162306a36Sopenharmony_ci continue; 51262306a36Sopenharmony_ci if (!node_end_pfn) { 51362306a36Sopenharmony_ci node_start_pfn = zone->zone_start_pfn; 51462306a36Sopenharmony_ci node_end_pfn = end_pfn; 51562306a36Sopenharmony_ci continue; 51662306a36Sopenharmony_ci } 51762306a36Sopenharmony_ci 51862306a36Sopenharmony_ci if (end_pfn > node_end_pfn) 51962306a36Sopenharmony_ci node_end_pfn = end_pfn; 52062306a36Sopenharmony_ci if (zone->zone_start_pfn < node_start_pfn) 52162306a36Sopenharmony_ci node_start_pfn = zone->zone_start_pfn; 52262306a36Sopenharmony_ci } 52362306a36Sopenharmony_ci 52462306a36Sopenharmony_ci pgdat->node_start_pfn = node_start_pfn; 52562306a36Sopenharmony_ci pgdat->node_spanned_pages = node_end_pfn - node_start_pfn; 52662306a36Sopenharmony_ci} 52762306a36Sopenharmony_ci 52862306a36Sopenharmony_civoid __ref remove_pfn_range_from_zone(struct zone *zone, 52962306a36Sopenharmony_ci unsigned long start_pfn, 53062306a36Sopenharmony_ci unsigned long nr_pages) 53162306a36Sopenharmony_ci{ 53262306a36Sopenharmony_ci const unsigned long end_pfn = start_pfn + nr_pages; 53362306a36Sopenharmony_ci struct pglist_data *pgdat = zone->zone_pgdat; 53462306a36Sopenharmony_ci unsigned long pfn, cur_nr_pages; 53562306a36Sopenharmony_ci 53662306a36Sopenharmony_ci /* Poison struct pages because they are now uninitialized again. 
*/ 53762306a36Sopenharmony_ci for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) { 53862306a36Sopenharmony_ci cond_resched(); 53962306a36Sopenharmony_ci 54062306a36Sopenharmony_ci /* Select all remaining pages up to the next section boundary */ 54162306a36Sopenharmony_ci cur_nr_pages = 54262306a36Sopenharmony_ci min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); 54362306a36Sopenharmony_ci page_init_poison(pfn_to_page(pfn), 54462306a36Sopenharmony_ci sizeof(struct page) * cur_nr_pages); 54562306a36Sopenharmony_ci } 54662306a36Sopenharmony_ci 54762306a36Sopenharmony_ci /* 54862306a36Sopenharmony_ci * Zone shrinking code cannot properly deal with ZONE_DEVICE. So 54962306a36Sopenharmony_ci * we will not try to shrink the zones - which is okay as 55062306a36Sopenharmony_ci * set_zone_contiguous() cannot deal with ZONE_DEVICE either way. 55162306a36Sopenharmony_ci */ 55262306a36Sopenharmony_ci if (zone_is_zone_device(zone)) 55362306a36Sopenharmony_ci return; 55462306a36Sopenharmony_ci 55562306a36Sopenharmony_ci clear_zone_contiguous(zone); 55662306a36Sopenharmony_ci 55762306a36Sopenharmony_ci shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); 55862306a36Sopenharmony_ci update_pgdat_span(pgdat); 55962306a36Sopenharmony_ci 56062306a36Sopenharmony_ci set_zone_contiguous(zone); 56162306a36Sopenharmony_ci} 56262306a36Sopenharmony_ci 56362306a36Sopenharmony_ci/** 56462306a36Sopenharmony_ci * __remove_pages() - remove sections of pages 56562306a36Sopenharmony_ci * @pfn: starting pageframe (must be aligned to start of a section) 56662306a36Sopenharmony_ci * @nr_pages: number of pages to remove (must be multiple of section size) 56762306a36Sopenharmony_ci * @altmap: alternative device page map or %NULL if default memmap is used 56862306a36Sopenharmony_ci * 56962306a36Sopenharmony_ci * Generic helper function to remove section mappings and sysfs entries 57062306a36Sopenharmony_ci * for the section of the memory we are removing. 
Caller needs to make 57162306a36Sopenharmony_ci * sure that pages are marked reserved and zones are adjust properly by 57262306a36Sopenharmony_ci * calling offline_pages(). 57362306a36Sopenharmony_ci */ 57462306a36Sopenharmony_civoid __remove_pages(unsigned long pfn, unsigned long nr_pages, 57562306a36Sopenharmony_ci struct vmem_altmap *altmap) 57662306a36Sopenharmony_ci{ 57762306a36Sopenharmony_ci const unsigned long end_pfn = pfn + nr_pages; 57862306a36Sopenharmony_ci unsigned long cur_nr_pages; 57962306a36Sopenharmony_ci 58062306a36Sopenharmony_ci if (check_pfn_span(pfn, nr_pages)) { 58162306a36Sopenharmony_ci WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1); 58262306a36Sopenharmony_ci return; 58362306a36Sopenharmony_ci } 58462306a36Sopenharmony_ci 58562306a36Sopenharmony_ci for (; pfn < end_pfn; pfn += cur_nr_pages) { 58662306a36Sopenharmony_ci cond_resched(); 58762306a36Sopenharmony_ci /* Select all remaining pages up to the next section boundary */ 58862306a36Sopenharmony_ci cur_nr_pages = min(end_pfn - pfn, 58962306a36Sopenharmony_ci SECTION_ALIGN_UP(pfn + 1) - pfn); 59062306a36Sopenharmony_ci sparse_remove_section(pfn, cur_nr_pages, altmap); 59162306a36Sopenharmony_ci } 59262306a36Sopenharmony_ci} 59362306a36Sopenharmony_ci 59462306a36Sopenharmony_ciint set_online_page_callback(online_page_callback_t callback) 59562306a36Sopenharmony_ci{ 59662306a36Sopenharmony_ci int rc = -EINVAL; 59762306a36Sopenharmony_ci 59862306a36Sopenharmony_ci get_online_mems(); 59962306a36Sopenharmony_ci mutex_lock(&online_page_callback_lock); 60062306a36Sopenharmony_ci 60162306a36Sopenharmony_ci if (online_page_callback == generic_online_page) { 60262306a36Sopenharmony_ci online_page_callback = callback; 60362306a36Sopenharmony_ci rc = 0; 60462306a36Sopenharmony_ci } 60562306a36Sopenharmony_ci 60662306a36Sopenharmony_ci mutex_unlock(&online_page_callback_lock); 60762306a36Sopenharmony_ci put_online_mems(); 60862306a36Sopenharmony_ci 
60962306a36Sopenharmony_ci return rc; 61062306a36Sopenharmony_ci} 61162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(set_online_page_callback); 61262306a36Sopenharmony_ci 61362306a36Sopenharmony_ciint restore_online_page_callback(online_page_callback_t callback) 61462306a36Sopenharmony_ci{ 61562306a36Sopenharmony_ci int rc = -EINVAL; 61662306a36Sopenharmony_ci 61762306a36Sopenharmony_ci get_online_mems(); 61862306a36Sopenharmony_ci mutex_lock(&online_page_callback_lock); 61962306a36Sopenharmony_ci 62062306a36Sopenharmony_ci if (online_page_callback == callback) { 62162306a36Sopenharmony_ci online_page_callback = generic_online_page; 62262306a36Sopenharmony_ci rc = 0; 62362306a36Sopenharmony_ci } 62462306a36Sopenharmony_ci 62562306a36Sopenharmony_ci mutex_unlock(&online_page_callback_lock); 62662306a36Sopenharmony_ci put_online_mems(); 62762306a36Sopenharmony_ci 62862306a36Sopenharmony_ci return rc; 62962306a36Sopenharmony_ci} 63062306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(restore_online_page_callback); 63162306a36Sopenharmony_ci 63262306a36Sopenharmony_civoid generic_online_page(struct page *page, unsigned int order) 63362306a36Sopenharmony_ci{ 63462306a36Sopenharmony_ci /* 63562306a36Sopenharmony_ci * Freeing the page with debug_pagealloc enabled will try to unmap it, 63662306a36Sopenharmony_ci * so we should map it first. This is better than introducing a special 63762306a36Sopenharmony_ci * case in page freeing fast path. 
63862306a36Sopenharmony_ci */ 63962306a36Sopenharmony_ci debug_pagealloc_map_pages(page, 1 << order); 64062306a36Sopenharmony_ci __free_pages_core(page, order); 64162306a36Sopenharmony_ci totalram_pages_add(1UL << order); 64262306a36Sopenharmony_ci} 64362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(generic_online_page); 64462306a36Sopenharmony_ci 64562306a36Sopenharmony_cistatic void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) 64662306a36Sopenharmony_ci{ 64762306a36Sopenharmony_ci const unsigned long end_pfn = start_pfn + nr_pages; 64862306a36Sopenharmony_ci unsigned long pfn; 64962306a36Sopenharmony_ci 65062306a36Sopenharmony_ci /* 65162306a36Sopenharmony_ci * Online the pages in MAX_ORDER aligned chunks. The callback might 65262306a36Sopenharmony_ci * decide to not expose all pages to the buddy (e.g., expose them 65362306a36Sopenharmony_ci * later). We account all pages as being online and belonging to this 65462306a36Sopenharmony_ci * zone ("present"). 65562306a36Sopenharmony_ci * When using memmap_on_memory, the range might not be aligned to 65662306a36Sopenharmony_ci * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect 65762306a36Sopenharmony_ci * this and the first chunk to online will be pageblock_nr_pages. 65862306a36Sopenharmony_ci */ 65962306a36Sopenharmony_ci for (pfn = start_pfn; pfn < end_pfn;) { 66062306a36Sopenharmony_ci int order; 66162306a36Sopenharmony_ci 66262306a36Sopenharmony_ci /* 66362306a36Sopenharmony_ci * Free to online pages in the largest chunks alignment allows. 66462306a36Sopenharmony_ci * 66562306a36Sopenharmony_ci * __ffs() behaviour is undefined for 0. start == 0 is 66662306a36Sopenharmony_ci * MAX_ORDER-aligned, Set order to MAX_ORDER for the case. 
66762306a36Sopenharmony_ci */ 66862306a36Sopenharmony_ci if (pfn) 66962306a36Sopenharmony_ci order = min_t(int, MAX_ORDER, __ffs(pfn)); 67062306a36Sopenharmony_ci else 67162306a36Sopenharmony_ci order = MAX_ORDER; 67262306a36Sopenharmony_ci 67362306a36Sopenharmony_ci (*online_page_callback)(pfn_to_page(pfn), order); 67462306a36Sopenharmony_ci pfn += (1UL << order); 67562306a36Sopenharmony_ci } 67662306a36Sopenharmony_ci 67762306a36Sopenharmony_ci /* mark all involved sections as online */ 67862306a36Sopenharmony_ci online_mem_sections(start_pfn, end_pfn); 67962306a36Sopenharmony_ci} 68062306a36Sopenharmony_ci 68162306a36Sopenharmony_ci/* check which state of node_states will be changed when online memory */ 68262306a36Sopenharmony_cistatic void node_states_check_changes_online(unsigned long nr_pages, 68362306a36Sopenharmony_ci struct zone *zone, struct memory_notify *arg) 68462306a36Sopenharmony_ci{ 68562306a36Sopenharmony_ci int nid = zone_to_nid(zone); 68662306a36Sopenharmony_ci 68762306a36Sopenharmony_ci arg->status_change_nid = NUMA_NO_NODE; 68862306a36Sopenharmony_ci arg->status_change_nid_normal = NUMA_NO_NODE; 68962306a36Sopenharmony_ci 69062306a36Sopenharmony_ci if (!node_state(nid, N_MEMORY)) 69162306a36Sopenharmony_ci arg->status_change_nid = nid; 69262306a36Sopenharmony_ci if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) 69362306a36Sopenharmony_ci arg->status_change_nid_normal = nid; 69462306a36Sopenharmony_ci} 69562306a36Sopenharmony_ci 69662306a36Sopenharmony_cistatic void node_states_set_node(int node, struct memory_notify *arg) 69762306a36Sopenharmony_ci{ 69862306a36Sopenharmony_ci if (arg->status_change_nid_normal >= 0) 69962306a36Sopenharmony_ci node_set_state(node, N_NORMAL_MEMORY); 70062306a36Sopenharmony_ci 70162306a36Sopenharmony_ci if (arg->status_change_nid >= 0) 70262306a36Sopenharmony_ci node_set_state(node, N_MEMORY); 70362306a36Sopenharmony_ci} 70462306a36Sopenharmony_ci 70562306a36Sopenharmony_cistatic void 
__meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, 70662306a36Sopenharmony_ci unsigned long nr_pages) 70762306a36Sopenharmony_ci{ 70862306a36Sopenharmony_ci unsigned long old_end_pfn = zone_end_pfn(zone); 70962306a36Sopenharmony_ci 71062306a36Sopenharmony_ci if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) 71162306a36Sopenharmony_ci zone->zone_start_pfn = start_pfn; 71262306a36Sopenharmony_ci 71362306a36Sopenharmony_ci zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; 71462306a36Sopenharmony_ci} 71562306a36Sopenharmony_ci 71662306a36Sopenharmony_cistatic void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, 71762306a36Sopenharmony_ci unsigned long nr_pages) 71862306a36Sopenharmony_ci{ 71962306a36Sopenharmony_ci unsigned long old_end_pfn = pgdat_end_pfn(pgdat); 72062306a36Sopenharmony_ci 72162306a36Sopenharmony_ci if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) 72262306a36Sopenharmony_ci pgdat->node_start_pfn = start_pfn; 72362306a36Sopenharmony_ci 72462306a36Sopenharmony_ci pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; 72562306a36Sopenharmony_ci 72662306a36Sopenharmony_ci} 72762306a36Sopenharmony_ci 72862306a36Sopenharmony_ci#ifdef CONFIG_ZONE_DEVICE 72962306a36Sopenharmony_cistatic void section_taint_zone_device(unsigned long pfn) 73062306a36Sopenharmony_ci{ 73162306a36Sopenharmony_ci struct mem_section *ms = __pfn_to_section(pfn); 73262306a36Sopenharmony_ci 73362306a36Sopenharmony_ci ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE; 73462306a36Sopenharmony_ci} 73562306a36Sopenharmony_ci#else 73662306a36Sopenharmony_cistatic inline void section_taint_zone_device(unsigned long pfn) 73762306a36Sopenharmony_ci{ 73862306a36Sopenharmony_ci} 73962306a36Sopenharmony_ci#endif 74062306a36Sopenharmony_ci 74162306a36Sopenharmony_ci/* 74262306a36Sopenharmony_ci * Associate the pfn range with the given zone, 
initializing the memmaps 74362306a36Sopenharmony_ci * and resizing the pgdat/zone data to span the added pages. After this 74462306a36Sopenharmony_ci * call, all affected pages are PG_reserved. 74562306a36Sopenharmony_ci * 74662306a36Sopenharmony_ci * All aligned pageblocks are initialized to the specified migratetype 74762306a36Sopenharmony_ci * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related 74862306a36Sopenharmony_ci * zone stats (e.g., nr_isolate_pageblock) are touched. 74962306a36Sopenharmony_ci */ 75062306a36Sopenharmony_civoid __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, 75162306a36Sopenharmony_ci unsigned long nr_pages, 75262306a36Sopenharmony_ci struct vmem_altmap *altmap, int migratetype) 75362306a36Sopenharmony_ci{ 75462306a36Sopenharmony_ci struct pglist_data *pgdat = zone->zone_pgdat; 75562306a36Sopenharmony_ci int nid = pgdat->node_id; 75662306a36Sopenharmony_ci 75762306a36Sopenharmony_ci clear_zone_contiguous(zone); 75862306a36Sopenharmony_ci 75962306a36Sopenharmony_ci if (zone_is_empty(zone)) 76062306a36Sopenharmony_ci init_currently_empty_zone(zone, start_pfn, nr_pages); 76162306a36Sopenharmony_ci resize_zone_range(zone, start_pfn, nr_pages); 76262306a36Sopenharmony_ci resize_pgdat_range(pgdat, start_pfn, nr_pages); 76362306a36Sopenharmony_ci 76462306a36Sopenharmony_ci /* 76562306a36Sopenharmony_ci * Subsection population requires care in pfn_to_online_page(). 76662306a36Sopenharmony_ci * Set the taint to enable the slow path detection of 76762306a36Sopenharmony_ci * ZONE_DEVICE pages in an otherwise ZONE_{NORMAL,MOVABLE} 76862306a36Sopenharmony_ci * section. 
76962306a36Sopenharmony_ci */ 77062306a36Sopenharmony_ci if (zone_is_zone_device(zone)) { 77162306a36Sopenharmony_ci if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION)) 77262306a36Sopenharmony_ci section_taint_zone_device(start_pfn); 77362306a36Sopenharmony_ci if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)) 77462306a36Sopenharmony_ci section_taint_zone_device(start_pfn + nr_pages); 77562306a36Sopenharmony_ci } 77662306a36Sopenharmony_ci 77762306a36Sopenharmony_ci /* 77862306a36Sopenharmony_ci * TODO now we have a visible range of pages which are not associated 77962306a36Sopenharmony_ci * with their zone properly. Not nice but set_pfnblock_flags_mask 78062306a36Sopenharmony_ci * expects the zone spans the pfn range. All the pages in the range 78162306a36Sopenharmony_ci * are reserved so nobody should be touching them so we should be safe 78262306a36Sopenharmony_ci */ 78362306a36Sopenharmony_ci memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0, 78462306a36Sopenharmony_ci MEMINIT_HOTPLUG, altmap, migratetype); 78562306a36Sopenharmony_ci 78662306a36Sopenharmony_ci set_zone_contiguous(zone); 78762306a36Sopenharmony_ci} 78862306a36Sopenharmony_ci 78962306a36Sopenharmony_cistruct auto_movable_stats { 79062306a36Sopenharmony_ci unsigned long kernel_early_pages; 79162306a36Sopenharmony_ci unsigned long movable_pages; 79262306a36Sopenharmony_ci}; 79362306a36Sopenharmony_ci 79462306a36Sopenharmony_cistatic void auto_movable_stats_account_zone(struct auto_movable_stats *stats, 79562306a36Sopenharmony_ci struct zone *zone) 79662306a36Sopenharmony_ci{ 79762306a36Sopenharmony_ci if (zone_idx(zone) == ZONE_MOVABLE) { 79862306a36Sopenharmony_ci stats->movable_pages += zone->present_pages; 79962306a36Sopenharmony_ci } else { 80062306a36Sopenharmony_ci stats->kernel_early_pages += zone->present_early_pages; 80162306a36Sopenharmony_ci#ifdef CONFIG_CMA 80262306a36Sopenharmony_ci /* 80362306a36Sopenharmony_ci * CMA pages (never on hotplugged memory) behave like 
80462306a36Sopenharmony_ci * ZONE_MOVABLE. 80562306a36Sopenharmony_ci */ 80662306a36Sopenharmony_ci stats->movable_pages += zone->cma_pages; 80762306a36Sopenharmony_ci stats->kernel_early_pages -= zone->cma_pages; 80862306a36Sopenharmony_ci#endif /* CONFIG_CMA */ 80962306a36Sopenharmony_ci } 81062306a36Sopenharmony_ci} 81162306a36Sopenharmony_cistruct auto_movable_group_stats { 81262306a36Sopenharmony_ci unsigned long movable_pages; 81362306a36Sopenharmony_ci unsigned long req_kernel_early_pages; 81462306a36Sopenharmony_ci}; 81562306a36Sopenharmony_ci 81662306a36Sopenharmony_cistatic int auto_movable_stats_account_group(struct memory_group *group, 81762306a36Sopenharmony_ci void *arg) 81862306a36Sopenharmony_ci{ 81962306a36Sopenharmony_ci const int ratio = READ_ONCE(auto_movable_ratio); 82062306a36Sopenharmony_ci struct auto_movable_group_stats *stats = arg; 82162306a36Sopenharmony_ci long pages; 82262306a36Sopenharmony_ci 82362306a36Sopenharmony_ci /* 82462306a36Sopenharmony_ci * We don't support modifying the config while the auto-movable online 82562306a36Sopenharmony_ci * policy is already enabled. Just avoid the division by zero below. 82662306a36Sopenharmony_ci */ 82762306a36Sopenharmony_ci if (!ratio) 82862306a36Sopenharmony_ci return 0; 82962306a36Sopenharmony_ci 83062306a36Sopenharmony_ci /* 83162306a36Sopenharmony_ci * Calculate how many early kernel pages this group requires to 83262306a36Sopenharmony_ci * satisfy the configured zone ratio. 
83362306a36Sopenharmony_ci */ 83462306a36Sopenharmony_ci pages = group->present_movable_pages * 100 / ratio; 83562306a36Sopenharmony_ci pages -= group->present_kernel_pages; 83662306a36Sopenharmony_ci 83762306a36Sopenharmony_ci if (pages > 0) 83862306a36Sopenharmony_ci stats->req_kernel_early_pages += pages; 83962306a36Sopenharmony_ci stats->movable_pages += group->present_movable_pages; 84062306a36Sopenharmony_ci return 0; 84162306a36Sopenharmony_ci} 84262306a36Sopenharmony_ci 84362306a36Sopenharmony_cistatic bool auto_movable_can_online_movable(int nid, struct memory_group *group, 84462306a36Sopenharmony_ci unsigned long nr_pages) 84562306a36Sopenharmony_ci{ 84662306a36Sopenharmony_ci unsigned long kernel_early_pages, movable_pages; 84762306a36Sopenharmony_ci struct auto_movable_group_stats group_stats = {}; 84862306a36Sopenharmony_ci struct auto_movable_stats stats = {}; 84962306a36Sopenharmony_ci pg_data_t *pgdat = NODE_DATA(nid); 85062306a36Sopenharmony_ci struct zone *zone; 85162306a36Sopenharmony_ci int i; 85262306a36Sopenharmony_ci 85362306a36Sopenharmony_ci /* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. 
*/ 85462306a36Sopenharmony_ci if (nid == NUMA_NO_NODE) { 85562306a36Sopenharmony_ci /* TODO: cache values */ 85662306a36Sopenharmony_ci for_each_populated_zone(zone) 85762306a36Sopenharmony_ci auto_movable_stats_account_zone(&stats, zone); 85862306a36Sopenharmony_ci } else { 85962306a36Sopenharmony_ci for (i = 0; i < MAX_NR_ZONES; i++) { 86062306a36Sopenharmony_ci zone = pgdat->node_zones + i; 86162306a36Sopenharmony_ci if (populated_zone(zone)) 86262306a36Sopenharmony_ci auto_movable_stats_account_zone(&stats, zone); 86362306a36Sopenharmony_ci } 86462306a36Sopenharmony_ci } 86562306a36Sopenharmony_ci 86662306a36Sopenharmony_ci kernel_early_pages = stats.kernel_early_pages; 86762306a36Sopenharmony_ci movable_pages = stats.movable_pages; 86862306a36Sopenharmony_ci 86962306a36Sopenharmony_ci /* 87062306a36Sopenharmony_ci * Kernel memory inside dynamic memory group allows for more MOVABLE 87162306a36Sopenharmony_ci * memory within the same group. Remove the effect of all but the 87262306a36Sopenharmony_ci * current group from the stats. 87362306a36Sopenharmony_ci */ 87462306a36Sopenharmony_ci walk_dynamic_memory_groups(nid, auto_movable_stats_account_group, 87562306a36Sopenharmony_ci group, &group_stats); 87662306a36Sopenharmony_ci if (kernel_early_pages <= group_stats.req_kernel_early_pages) 87762306a36Sopenharmony_ci return false; 87862306a36Sopenharmony_ci kernel_early_pages -= group_stats.req_kernel_early_pages; 87962306a36Sopenharmony_ci movable_pages -= group_stats.movable_pages; 88062306a36Sopenharmony_ci 88162306a36Sopenharmony_ci if (group && group->is_dynamic) 88262306a36Sopenharmony_ci kernel_early_pages += group->present_kernel_pages; 88362306a36Sopenharmony_ci 88462306a36Sopenharmony_ci /* 88562306a36Sopenharmony_ci * Test if we could online the given number of pages to ZONE_MOVABLE 88662306a36Sopenharmony_ci * and still stay in the configured ratio. 
88762306a36Sopenharmony_ci */ 88862306a36Sopenharmony_ci movable_pages += nr_pages; 88962306a36Sopenharmony_ci return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100; 89062306a36Sopenharmony_ci} 89162306a36Sopenharmony_ci 89262306a36Sopenharmony_ci/* 89362306a36Sopenharmony_ci * Returns a default kernel memory zone for the given pfn range. 89462306a36Sopenharmony_ci * If no kernel zone covers this pfn range it will automatically go 89562306a36Sopenharmony_ci * to the ZONE_NORMAL. 89662306a36Sopenharmony_ci */ 89762306a36Sopenharmony_cistatic struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn, 89862306a36Sopenharmony_ci unsigned long nr_pages) 89962306a36Sopenharmony_ci{ 90062306a36Sopenharmony_ci struct pglist_data *pgdat = NODE_DATA(nid); 90162306a36Sopenharmony_ci int zid; 90262306a36Sopenharmony_ci 90362306a36Sopenharmony_ci for (zid = 0; zid < ZONE_NORMAL; zid++) { 90462306a36Sopenharmony_ci struct zone *zone = &pgdat->node_zones[zid]; 90562306a36Sopenharmony_ci 90662306a36Sopenharmony_ci if (zone_intersects(zone, start_pfn, nr_pages)) 90762306a36Sopenharmony_ci return zone; 90862306a36Sopenharmony_ci } 90962306a36Sopenharmony_ci 91062306a36Sopenharmony_ci return &pgdat->node_zones[ZONE_NORMAL]; 91162306a36Sopenharmony_ci} 91262306a36Sopenharmony_ci 91362306a36Sopenharmony_ci/* 91462306a36Sopenharmony_ci * Determine to which zone to online memory dynamically based on user 91562306a36Sopenharmony_ci * configuration and system stats. We care about the following ratio: 91662306a36Sopenharmony_ci * 91762306a36Sopenharmony_ci * MOVABLE : KERNEL 91862306a36Sopenharmony_ci * 91962306a36Sopenharmony_ci * Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in 92062306a36Sopenharmony_ci * one of the kernel zones. CMA pages inside one of the kernel zones really 92162306a36Sopenharmony_ci * behaves like ZONE_MOVABLE, so we treat them accordingly. 
 *
 * We don't allow for hotplugged memory in a KERNEL zone to increase the
 * amount of MOVABLE memory we can have, so we end up with:
 *
 *   MOVABLE : KERNEL_EARLY
 *
 * Whereby KERNEL_EARLY is memory in one of the kernel zones, available since
 * boot. We base our calculation on KERNEL_EARLY internally, because:
 *
 * a) Hotplugged memory in one of the kernel zones can sometimes still get
 *    hotunplugged, especially when hot(un)plugging individual memory blocks.
 *    There is no coordination across memory devices, therefore "automatic"
 *    hotunplugging, as implemented in hypervisors, could result in zone
 *    imbalances.
 * b) Early/boot memory in one of the kernel zones can usually not get
 *    hotunplugged again (e.g., no firmware interface to unplug, fragmented
 *    with unmovable allocations). While there are corner cases where it might
 *    still work, it is barely relevant in practice.
 *
 * Exceptions are dynamic memory groups, which allow for more MOVABLE
 * memory within the same memory group -- because in that case, there is
 * coordination within the single memory device managed by a single driver.
 *
 * We rely on "present pages" instead of "managed pages", as the latter is
 * highly unreliable and dynamic in virtualized environments, and does not
 * consider boot time allocations.
For example, memory ballooning adjusts the
 * managed pages when inflating/deflating the balloon, and balloon compaction
 * can even migrate inflated pages between zones.
 *
 * Using "present pages" is better but some things to keep in mind are:
 *
 * a) Some memblock allocations, such as for the crashkernel area, are
 *    effectively unused by the kernel, yet they account to "present pages".
 *    Fortunately, these allocations are comparatively small in relevant setups
 *    (e.g., fraction of system memory).
 * b) Some hotplugged memory blocks in virtualized environments, especially
 *    hotplugged by virtio-mem, look like they are completely present, however,
 *    only parts of the memory block are actually currently usable.
 *    "present pages" is an upper limit that can get reached at runtime. As
 *    we base our calculations on KERNEL_EARLY, this is not an issue.
96262306a36Sopenharmony_ci */ 96362306a36Sopenharmony_cistatic struct zone *auto_movable_zone_for_pfn(int nid, 96462306a36Sopenharmony_ci struct memory_group *group, 96562306a36Sopenharmony_ci unsigned long pfn, 96662306a36Sopenharmony_ci unsigned long nr_pages) 96762306a36Sopenharmony_ci{ 96862306a36Sopenharmony_ci unsigned long online_pages = 0, max_pages, end_pfn; 96962306a36Sopenharmony_ci struct page *page; 97062306a36Sopenharmony_ci 97162306a36Sopenharmony_ci if (!auto_movable_ratio) 97262306a36Sopenharmony_ci goto kernel_zone; 97362306a36Sopenharmony_ci 97462306a36Sopenharmony_ci if (group && !group->is_dynamic) { 97562306a36Sopenharmony_ci max_pages = group->s.max_pages; 97662306a36Sopenharmony_ci online_pages = group->present_movable_pages; 97762306a36Sopenharmony_ci 97862306a36Sopenharmony_ci /* If anything is !MOVABLE online the rest !MOVABLE. */ 97962306a36Sopenharmony_ci if (group->present_kernel_pages) 98062306a36Sopenharmony_ci goto kernel_zone; 98162306a36Sopenharmony_ci } else if (!group || group->d.unit_pages == nr_pages) { 98262306a36Sopenharmony_ci max_pages = nr_pages; 98362306a36Sopenharmony_ci } else { 98462306a36Sopenharmony_ci max_pages = group->d.unit_pages; 98562306a36Sopenharmony_ci /* 98662306a36Sopenharmony_ci * Take a look at all online sections in the current unit. 98762306a36Sopenharmony_ci * We can safely assume that all pages within a section belong 98862306a36Sopenharmony_ci * to the same zone, because dynamic memory groups only deal 98962306a36Sopenharmony_ci * with hotplugged memory. 
99062306a36Sopenharmony_ci */ 99162306a36Sopenharmony_ci pfn = ALIGN_DOWN(pfn, group->d.unit_pages); 99262306a36Sopenharmony_ci end_pfn = pfn + group->d.unit_pages; 99362306a36Sopenharmony_ci for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 99462306a36Sopenharmony_ci page = pfn_to_online_page(pfn); 99562306a36Sopenharmony_ci if (!page) 99662306a36Sopenharmony_ci continue; 99762306a36Sopenharmony_ci /* If anything is !MOVABLE online the rest !MOVABLE. */ 99862306a36Sopenharmony_ci if (!is_zone_movable_page(page)) 99962306a36Sopenharmony_ci goto kernel_zone; 100062306a36Sopenharmony_ci online_pages += PAGES_PER_SECTION; 100162306a36Sopenharmony_ci } 100262306a36Sopenharmony_ci } 100362306a36Sopenharmony_ci 100462306a36Sopenharmony_ci /* 100562306a36Sopenharmony_ci * Online MOVABLE if we could *currently* online all remaining parts 100662306a36Sopenharmony_ci * MOVABLE. We expect to (add+) online them immediately next, so if 100762306a36Sopenharmony_ci * nobody interferes, all will be MOVABLE if possible. 
100862306a36Sopenharmony_ci */ 100962306a36Sopenharmony_ci nr_pages = max_pages - online_pages; 101062306a36Sopenharmony_ci if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages)) 101162306a36Sopenharmony_ci goto kernel_zone; 101262306a36Sopenharmony_ci 101362306a36Sopenharmony_ci#ifdef CONFIG_NUMA 101462306a36Sopenharmony_ci if (auto_movable_numa_aware && 101562306a36Sopenharmony_ci !auto_movable_can_online_movable(nid, group, nr_pages)) 101662306a36Sopenharmony_ci goto kernel_zone; 101762306a36Sopenharmony_ci#endif /* CONFIG_NUMA */ 101862306a36Sopenharmony_ci 101962306a36Sopenharmony_ci return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; 102062306a36Sopenharmony_cikernel_zone: 102162306a36Sopenharmony_ci return default_kernel_zone_for_pfn(nid, pfn, nr_pages); 102262306a36Sopenharmony_ci} 102362306a36Sopenharmony_ci 102462306a36Sopenharmony_cistatic inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, 102562306a36Sopenharmony_ci unsigned long nr_pages) 102662306a36Sopenharmony_ci{ 102762306a36Sopenharmony_ci struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn, 102862306a36Sopenharmony_ci nr_pages); 102962306a36Sopenharmony_ci struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; 103062306a36Sopenharmony_ci bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages); 103162306a36Sopenharmony_ci bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages); 103262306a36Sopenharmony_ci 103362306a36Sopenharmony_ci /* 103462306a36Sopenharmony_ci * We inherit the existing zone in a simple case where zones do not 103562306a36Sopenharmony_ci * overlap in the given range 103662306a36Sopenharmony_ci */ 103762306a36Sopenharmony_ci if (in_kernel ^ in_movable) 103862306a36Sopenharmony_ci return (in_kernel) ? 
kernel_zone : movable_zone; 103962306a36Sopenharmony_ci 104062306a36Sopenharmony_ci /* 104162306a36Sopenharmony_ci * If the range doesn't belong to any zone or two zones overlap in the 104262306a36Sopenharmony_ci * given range then we use movable zone only if movable_node is 104362306a36Sopenharmony_ci * enabled because we always online to a kernel zone by default. 104462306a36Sopenharmony_ci */ 104562306a36Sopenharmony_ci return movable_node_enabled ? movable_zone : kernel_zone; 104662306a36Sopenharmony_ci} 104762306a36Sopenharmony_ci 104862306a36Sopenharmony_cistruct zone *zone_for_pfn_range(int online_type, int nid, 104962306a36Sopenharmony_ci struct memory_group *group, unsigned long start_pfn, 105062306a36Sopenharmony_ci unsigned long nr_pages) 105162306a36Sopenharmony_ci{ 105262306a36Sopenharmony_ci if (online_type == MMOP_ONLINE_KERNEL) 105362306a36Sopenharmony_ci return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); 105462306a36Sopenharmony_ci 105562306a36Sopenharmony_ci if (online_type == MMOP_ONLINE_MOVABLE) 105662306a36Sopenharmony_ci return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; 105762306a36Sopenharmony_ci 105862306a36Sopenharmony_ci if (online_policy == ONLINE_POLICY_AUTO_MOVABLE) 105962306a36Sopenharmony_ci return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages); 106062306a36Sopenharmony_ci 106162306a36Sopenharmony_ci return default_zone_for_pfn(nid, start_pfn, nr_pages); 106262306a36Sopenharmony_ci} 106362306a36Sopenharmony_ci 106462306a36Sopenharmony_ci/* 106562306a36Sopenharmony_ci * This function should only be called by memory_block_{online,offline}, 106662306a36Sopenharmony_ci * and {online,offline}_pages. 
106762306a36Sopenharmony_ci */ 106862306a36Sopenharmony_civoid adjust_present_page_count(struct page *page, struct memory_group *group, 106962306a36Sopenharmony_ci long nr_pages) 107062306a36Sopenharmony_ci{ 107162306a36Sopenharmony_ci struct zone *zone = page_zone(page); 107262306a36Sopenharmony_ci const bool movable = zone_idx(zone) == ZONE_MOVABLE; 107362306a36Sopenharmony_ci 107462306a36Sopenharmony_ci /* 107562306a36Sopenharmony_ci * We only support onlining/offlining/adding/removing of complete 107662306a36Sopenharmony_ci * memory blocks; therefore, either all is either early or hotplugged. 107762306a36Sopenharmony_ci */ 107862306a36Sopenharmony_ci if (early_section(__pfn_to_section(page_to_pfn(page)))) 107962306a36Sopenharmony_ci zone->present_early_pages += nr_pages; 108062306a36Sopenharmony_ci zone->present_pages += nr_pages; 108162306a36Sopenharmony_ci zone->zone_pgdat->node_present_pages += nr_pages; 108262306a36Sopenharmony_ci 108362306a36Sopenharmony_ci if (group && movable) 108462306a36Sopenharmony_ci group->present_movable_pages += nr_pages; 108562306a36Sopenharmony_ci else if (group && !movable) 108662306a36Sopenharmony_ci group->present_kernel_pages += nr_pages; 108762306a36Sopenharmony_ci} 108862306a36Sopenharmony_ci 108962306a36Sopenharmony_ciint mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, 109062306a36Sopenharmony_ci struct zone *zone) 109162306a36Sopenharmony_ci{ 109262306a36Sopenharmony_ci unsigned long end_pfn = pfn + nr_pages; 109362306a36Sopenharmony_ci int ret, i; 109462306a36Sopenharmony_ci 109562306a36Sopenharmony_ci ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); 109662306a36Sopenharmony_ci if (ret) 109762306a36Sopenharmony_ci return ret; 109862306a36Sopenharmony_ci 109962306a36Sopenharmony_ci move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); 110062306a36Sopenharmony_ci 110162306a36Sopenharmony_ci for (i = 0; i < nr_pages; i++) 110262306a36Sopenharmony_ci 
SetPageVmemmapSelfHosted(pfn_to_page(pfn + i)); 110362306a36Sopenharmony_ci 110462306a36Sopenharmony_ci /* 110562306a36Sopenharmony_ci * It might be that the vmemmap_pages fully span sections. If that is 110662306a36Sopenharmony_ci * the case, mark those sections online here as otherwise they will be 110762306a36Sopenharmony_ci * left offline. 110862306a36Sopenharmony_ci */ 110962306a36Sopenharmony_ci if (nr_pages >= PAGES_PER_SECTION) 111062306a36Sopenharmony_ci online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); 111162306a36Sopenharmony_ci 111262306a36Sopenharmony_ci return ret; 111362306a36Sopenharmony_ci} 111462306a36Sopenharmony_ci 111562306a36Sopenharmony_civoid mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) 111662306a36Sopenharmony_ci{ 111762306a36Sopenharmony_ci unsigned long end_pfn = pfn + nr_pages; 111862306a36Sopenharmony_ci 111962306a36Sopenharmony_ci /* 112062306a36Sopenharmony_ci * It might be that the vmemmap_pages fully span sections. If that is 112162306a36Sopenharmony_ci * the case, mark those sections offline here as otherwise they will be 112262306a36Sopenharmony_ci * left online. 112362306a36Sopenharmony_ci */ 112462306a36Sopenharmony_ci if (nr_pages >= PAGES_PER_SECTION) 112562306a36Sopenharmony_ci offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); 112662306a36Sopenharmony_ci 112762306a36Sopenharmony_ci /* 112862306a36Sopenharmony_ci * The pages associated with this vmemmap have been offlined, so 112962306a36Sopenharmony_ci * we can reset its state here. 113062306a36Sopenharmony_ci */ 113162306a36Sopenharmony_ci remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages); 113262306a36Sopenharmony_ci kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); 113362306a36Sopenharmony_ci} 113462306a36Sopenharmony_ci 113562306a36Sopenharmony_ci/* 113662306a36Sopenharmony_ci * Must be called with mem_hotplug_lock in write mode. 
113762306a36Sopenharmony_ci */ 113862306a36Sopenharmony_ciint __ref online_pages(unsigned long pfn, unsigned long nr_pages, 113962306a36Sopenharmony_ci struct zone *zone, struct memory_group *group) 114062306a36Sopenharmony_ci{ 114162306a36Sopenharmony_ci unsigned long flags; 114262306a36Sopenharmony_ci int need_zonelists_rebuild = 0; 114362306a36Sopenharmony_ci const int nid = zone_to_nid(zone); 114462306a36Sopenharmony_ci int ret; 114562306a36Sopenharmony_ci struct memory_notify arg; 114662306a36Sopenharmony_ci 114762306a36Sopenharmony_ci /* 114862306a36Sopenharmony_ci * {on,off}lining is constrained to full memory sections (or more 114962306a36Sopenharmony_ci * precisely to memory blocks from the user space POV). 115062306a36Sopenharmony_ci * memmap_on_memory is an exception because it reserves initial part 115162306a36Sopenharmony_ci * of the physical memory space for vmemmaps. That space is pageblock 115262306a36Sopenharmony_ci * aligned. 115362306a36Sopenharmony_ci */ 115462306a36Sopenharmony_ci if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(pfn) || 115562306a36Sopenharmony_ci !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) 115662306a36Sopenharmony_ci return -EINVAL; 115762306a36Sopenharmony_ci 115862306a36Sopenharmony_ci 115962306a36Sopenharmony_ci /* associate pfn range with the zone */ 116062306a36Sopenharmony_ci move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); 116162306a36Sopenharmony_ci 116262306a36Sopenharmony_ci arg.start_pfn = pfn; 116362306a36Sopenharmony_ci arg.nr_pages = nr_pages; 116462306a36Sopenharmony_ci node_states_check_changes_online(nr_pages, zone, &arg); 116562306a36Sopenharmony_ci 116662306a36Sopenharmony_ci ret = memory_notify(MEM_GOING_ONLINE, &arg); 116762306a36Sopenharmony_ci ret = notifier_to_errno(ret); 116862306a36Sopenharmony_ci if (ret) 116962306a36Sopenharmony_ci goto failed_addition; 117062306a36Sopenharmony_ci 117162306a36Sopenharmony_ci /* 117262306a36Sopenharmony_ci * Fixup the number of isolated 
pageblocks before marking the sections 117362306a36Sopenharmony_ci * onlining, such that undo_isolate_page_range() works correctly. 117462306a36Sopenharmony_ci */ 117562306a36Sopenharmony_ci spin_lock_irqsave(&zone->lock, flags); 117662306a36Sopenharmony_ci zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages; 117762306a36Sopenharmony_ci spin_unlock_irqrestore(&zone->lock, flags); 117862306a36Sopenharmony_ci 117962306a36Sopenharmony_ci /* 118062306a36Sopenharmony_ci * If this zone is not populated, then it is not in zonelist. 118162306a36Sopenharmony_ci * This means the page allocator ignores this zone. 118262306a36Sopenharmony_ci * So, zonelist must be updated after online. 118362306a36Sopenharmony_ci */ 118462306a36Sopenharmony_ci if (!populated_zone(zone)) { 118562306a36Sopenharmony_ci need_zonelists_rebuild = 1; 118662306a36Sopenharmony_ci setup_zone_pageset(zone); 118762306a36Sopenharmony_ci } 118862306a36Sopenharmony_ci 118962306a36Sopenharmony_ci online_pages_range(pfn, nr_pages); 119062306a36Sopenharmony_ci adjust_present_page_count(pfn_to_page(pfn), group, nr_pages); 119162306a36Sopenharmony_ci 119262306a36Sopenharmony_ci node_states_set_node(nid, &arg); 119362306a36Sopenharmony_ci if (need_zonelists_rebuild) 119462306a36Sopenharmony_ci build_all_zonelists(NULL); 119562306a36Sopenharmony_ci 119662306a36Sopenharmony_ci /* Basic onlining is complete, allow allocation of onlined pages. */ 119762306a36Sopenharmony_ci undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE); 119862306a36Sopenharmony_ci 119962306a36Sopenharmony_ci /* 120062306a36Sopenharmony_ci * Freshly onlined pages aren't shuffled (e.g., all pages are placed to 120162306a36Sopenharmony_ci * the tail of the freelist when undoing isolation). Shuffle the whole 120262306a36Sopenharmony_ci * zone to make sure the just onlined pages are properly distributed 120362306a36Sopenharmony_ci * across the whole freelist - to create an initial shuffle. 
120462306a36Sopenharmony_ci */ 120562306a36Sopenharmony_ci shuffle_zone(zone); 120662306a36Sopenharmony_ci 120762306a36Sopenharmony_ci /* reinitialise watermarks and update pcp limits */ 120862306a36Sopenharmony_ci init_per_zone_wmark_min(); 120962306a36Sopenharmony_ci 121062306a36Sopenharmony_ci kswapd_run(nid); 121162306a36Sopenharmony_ci kcompactd_run(nid); 121262306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_ZSWAPD 121362306a36Sopenharmony_ci zswapd_run(nid); 121462306a36Sopenharmony_ci#endif 121562306a36Sopenharmony_ci 121662306a36Sopenharmony_ci writeback_set_ratelimit(); 121762306a36Sopenharmony_ci 121862306a36Sopenharmony_ci memory_notify(MEM_ONLINE, &arg); 121962306a36Sopenharmony_ci return 0; 122062306a36Sopenharmony_ci 122162306a36Sopenharmony_cifailed_addition: 122262306a36Sopenharmony_ci pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", 122362306a36Sopenharmony_ci (unsigned long long) pfn << PAGE_SHIFT, 122462306a36Sopenharmony_ci (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); 122562306a36Sopenharmony_ci memory_notify(MEM_CANCEL_ONLINE, &arg); 122662306a36Sopenharmony_ci remove_pfn_range_from_zone(zone, pfn, nr_pages); 122762306a36Sopenharmony_ci return ret; 122862306a36Sopenharmony_ci} 122962306a36Sopenharmony_ci 123062306a36Sopenharmony_ci/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 123162306a36Sopenharmony_cistatic pg_data_t __ref *hotadd_init_pgdat(int nid) 123262306a36Sopenharmony_ci{ 123362306a36Sopenharmony_ci struct pglist_data *pgdat; 123462306a36Sopenharmony_ci 123562306a36Sopenharmony_ci /* 123662306a36Sopenharmony_ci * NODE_DATA is preallocated (free_area_init) but its internal 123762306a36Sopenharmony_ci * state is not allocated completely. Add missing pieces. 123862306a36Sopenharmony_ci * Completely offline nodes stay around and they just need 123962306a36Sopenharmony_ci * reintialization. 
124062306a36Sopenharmony_ci */ 124162306a36Sopenharmony_ci pgdat = NODE_DATA(nid); 124262306a36Sopenharmony_ci 124362306a36Sopenharmony_ci /* init node's zones as empty zones, we don't have any present pages.*/ 124462306a36Sopenharmony_ci free_area_init_core_hotplug(pgdat); 124562306a36Sopenharmony_ci 124662306a36Sopenharmony_ci /* 124762306a36Sopenharmony_ci * The node we allocated has no zone fallback lists. For avoiding 124862306a36Sopenharmony_ci * to access not-initialized zonelist, build here. 124962306a36Sopenharmony_ci */ 125062306a36Sopenharmony_ci build_all_zonelists(pgdat); 125162306a36Sopenharmony_ci 125262306a36Sopenharmony_ci return pgdat; 125362306a36Sopenharmony_ci} 125462306a36Sopenharmony_ci 125562306a36Sopenharmony_ci/* 125662306a36Sopenharmony_ci * __try_online_node - online a node if offlined 125762306a36Sopenharmony_ci * @nid: the node ID 125862306a36Sopenharmony_ci * @set_node_online: Whether we want to online the node 125962306a36Sopenharmony_ci * called by cpu_up() to online a node without onlined memory. 
126062306a36Sopenharmony_ci * 126162306a36Sopenharmony_ci * Returns: 126262306a36Sopenharmony_ci * 1 -> a new node has been allocated 126362306a36Sopenharmony_ci * 0 -> the node is already online 126462306a36Sopenharmony_ci * -ENOMEM -> the node could not be allocated 126562306a36Sopenharmony_ci */ 126662306a36Sopenharmony_cistatic int __try_online_node(int nid, bool set_node_online) 126762306a36Sopenharmony_ci{ 126862306a36Sopenharmony_ci pg_data_t *pgdat; 126962306a36Sopenharmony_ci int ret = 1; 127062306a36Sopenharmony_ci 127162306a36Sopenharmony_ci if (node_online(nid)) 127262306a36Sopenharmony_ci return 0; 127362306a36Sopenharmony_ci 127462306a36Sopenharmony_ci pgdat = hotadd_init_pgdat(nid); 127562306a36Sopenharmony_ci if (!pgdat) { 127662306a36Sopenharmony_ci pr_err("Cannot online node %d due to NULL pgdat\n", nid); 127762306a36Sopenharmony_ci ret = -ENOMEM; 127862306a36Sopenharmony_ci goto out; 127962306a36Sopenharmony_ci } 128062306a36Sopenharmony_ci 128162306a36Sopenharmony_ci if (set_node_online) { 128262306a36Sopenharmony_ci node_set_online(nid); 128362306a36Sopenharmony_ci ret = register_one_node(nid); 128462306a36Sopenharmony_ci BUG_ON(ret); 128562306a36Sopenharmony_ci } 128662306a36Sopenharmony_ciout: 128762306a36Sopenharmony_ci return ret; 128862306a36Sopenharmony_ci} 128962306a36Sopenharmony_ci 129062306a36Sopenharmony_ci/* 129162306a36Sopenharmony_ci * Users of this function always want to online/register the node 129262306a36Sopenharmony_ci */ 129362306a36Sopenharmony_ciint try_online_node(int nid) 129462306a36Sopenharmony_ci{ 129562306a36Sopenharmony_ci int ret; 129662306a36Sopenharmony_ci 129762306a36Sopenharmony_ci mem_hotplug_begin(); 129862306a36Sopenharmony_ci ret = __try_online_node(nid, true); 129962306a36Sopenharmony_ci mem_hotplug_done(); 130062306a36Sopenharmony_ci return ret; 130162306a36Sopenharmony_ci} 130262306a36Sopenharmony_ci 130362306a36Sopenharmony_cistatic int check_hotplug_memory_range(u64 start, u64 size) 
130462306a36Sopenharmony_ci{ 130562306a36Sopenharmony_ci /* memory range must be block size aligned */ 130662306a36Sopenharmony_ci if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) || 130762306a36Sopenharmony_ci !IS_ALIGNED(size, memory_block_size_bytes())) { 130862306a36Sopenharmony_ci pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx", 130962306a36Sopenharmony_ci memory_block_size_bytes(), start, size); 131062306a36Sopenharmony_ci return -EINVAL; 131162306a36Sopenharmony_ci } 131262306a36Sopenharmony_ci 131362306a36Sopenharmony_ci return 0; 131462306a36Sopenharmony_ci} 131562306a36Sopenharmony_ci 131662306a36Sopenharmony_cistatic int online_memory_block(struct memory_block *mem, void *arg) 131762306a36Sopenharmony_ci{ 131862306a36Sopenharmony_ci mem->online_type = mhp_default_online_type; 131962306a36Sopenharmony_ci return device_online(&mem->dev); 132062306a36Sopenharmony_ci} 132162306a36Sopenharmony_ci 132262306a36Sopenharmony_ci#ifndef arch_supports_memmap_on_memory 132362306a36Sopenharmony_cistatic inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) 132462306a36Sopenharmony_ci{ 132562306a36Sopenharmony_ci /* 132662306a36Sopenharmony_ci * As default, we want the vmemmap to span a complete PMD such that we 132762306a36Sopenharmony_ci * can map the vmemmap using a single PMD if supported by the 132862306a36Sopenharmony_ci * architecture. 
132962306a36Sopenharmony_ci */ 133062306a36Sopenharmony_ci return IS_ALIGNED(vmemmap_size, PMD_SIZE); 133162306a36Sopenharmony_ci} 133262306a36Sopenharmony_ci#endif 133362306a36Sopenharmony_ci 133462306a36Sopenharmony_cistatic bool mhp_supports_memmap_on_memory(unsigned long size) 133562306a36Sopenharmony_ci{ 133662306a36Sopenharmony_ci unsigned long vmemmap_size = memory_block_memmap_size(); 133762306a36Sopenharmony_ci unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); 133862306a36Sopenharmony_ci 133962306a36Sopenharmony_ci /* 134062306a36Sopenharmony_ci * Besides having arch support and the feature enabled at runtime, we 134162306a36Sopenharmony_ci * need a few more assumptions to hold true: 134262306a36Sopenharmony_ci * 134362306a36Sopenharmony_ci * a) We span a single memory block: memory onlining/offlinin;g happens 134462306a36Sopenharmony_ci * in memory block granularity. We don't want the vmemmap of online 134562306a36Sopenharmony_ci * memory blocks to reside on offline memory blocks. In the future, 134662306a36Sopenharmony_ci * we might want to support variable-sized memory blocks to make the 134762306a36Sopenharmony_ci * feature more versatile. 134862306a36Sopenharmony_ci * 134962306a36Sopenharmony_ci * b) The vmemmap pages span complete PMDs: We don't want vmemmap code 135062306a36Sopenharmony_ci * to populate memory from the altmap for unrelated parts (i.e., 135162306a36Sopenharmony_ci * other memory blocks) 135262306a36Sopenharmony_ci * 135362306a36Sopenharmony_ci * c) The vmemmap pages (and thereby the pages that will be exposed to 135462306a36Sopenharmony_ci * the buddy) have to cover full pageblocks: memory onlining/offlining 135562306a36Sopenharmony_ci * code requires applicable ranges to be page-aligned, for example, to 135662306a36Sopenharmony_ci * set the migratetypes properly. 
135762306a36Sopenharmony_ci * 135862306a36Sopenharmony_ci * TODO: Although we have a check here to make sure that vmemmap pages 135962306a36Sopenharmony_ci * fully populate a PMD, it is not the right place to check for 136062306a36Sopenharmony_ci * this. A much better solution involves improving vmemmap code 136162306a36Sopenharmony_ci * to fallback to base pages when trying to populate vmemmap using 136262306a36Sopenharmony_ci * altmap as an alternative source of memory, and we do not exactly 136362306a36Sopenharmony_ci * populate a single PMD. 136462306a36Sopenharmony_ci */ 136562306a36Sopenharmony_ci if (!mhp_memmap_on_memory() || size != memory_block_size_bytes()) 136662306a36Sopenharmony_ci return false; 136762306a36Sopenharmony_ci 136862306a36Sopenharmony_ci /* 136962306a36Sopenharmony_ci * Make sure the vmemmap allocation is fully contained 137062306a36Sopenharmony_ci * so that we always allocate vmemmap memory from altmap area. 137162306a36Sopenharmony_ci */ 137262306a36Sopenharmony_ci if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE)) 137362306a36Sopenharmony_ci return false; 137462306a36Sopenharmony_ci 137562306a36Sopenharmony_ci /* 137662306a36Sopenharmony_ci * start pfn should be pageblock_nr_pages aligned for correctly 137762306a36Sopenharmony_ci * setting migrate types 137862306a36Sopenharmony_ci */ 137962306a36Sopenharmony_ci if (!pageblock_aligned(memmap_pages)) 138062306a36Sopenharmony_ci return false; 138162306a36Sopenharmony_ci 138262306a36Sopenharmony_ci if (memmap_pages == PHYS_PFN(memory_block_size_bytes())) 138362306a36Sopenharmony_ci /* No effective hotplugged memory doesn't make sense. 
*/ 138462306a36Sopenharmony_ci return false; 138562306a36Sopenharmony_ci 138662306a36Sopenharmony_ci return arch_supports_memmap_on_memory(vmemmap_size); 138762306a36Sopenharmony_ci} 138862306a36Sopenharmony_ci 138962306a36Sopenharmony_ci/* 139062306a36Sopenharmony_ci * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 139162306a36Sopenharmony_ci * and online/offline operations (triggered e.g. by sysfs). 139262306a36Sopenharmony_ci * 139362306a36Sopenharmony_ci * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG 139462306a36Sopenharmony_ci */ 139562306a36Sopenharmony_ciint __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) 139662306a36Sopenharmony_ci{ 139762306a36Sopenharmony_ci struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; 139862306a36Sopenharmony_ci enum memblock_flags memblock_flags = MEMBLOCK_NONE; 139962306a36Sopenharmony_ci struct vmem_altmap mhp_altmap = { 140062306a36Sopenharmony_ci .base_pfn = PHYS_PFN(res->start), 140162306a36Sopenharmony_ci .end_pfn = PHYS_PFN(res->end), 140262306a36Sopenharmony_ci }; 140362306a36Sopenharmony_ci struct memory_group *group = NULL; 140462306a36Sopenharmony_ci u64 start, size; 140562306a36Sopenharmony_ci bool new_node = false; 140662306a36Sopenharmony_ci int ret; 140762306a36Sopenharmony_ci 140862306a36Sopenharmony_ci start = res->start; 140962306a36Sopenharmony_ci size = resource_size(res); 141062306a36Sopenharmony_ci 141162306a36Sopenharmony_ci ret = check_hotplug_memory_range(start, size); 141262306a36Sopenharmony_ci if (ret) 141362306a36Sopenharmony_ci return ret; 141462306a36Sopenharmony_ci 141562306a36Sopenharmony_ci if (mhp_flags & MHP_NID_IS_MGID) { 141662306a36Sopenharmony_ci group = memory_group_find_by_id(nid); 141762306a36Sopenharmony_ci if (!group) 141862306a36Sopenharmony_ci return -EINVAL; 141962306a36Sopenharmony_ci nid = group->nid; 142062306a36Sopenharmony_ci } 142162306a36Sopenharmony_ci 142262306a36Sopenharmony_ci if 
(!node_possible(nid)) { 142362306a36Sopenharmony_ci WARN(1, "node %d was absent from the node_possible_map\n", nid); 142462306a36Sopenharmony_ci return -EINVAL; 142562306a36Sopenharmony_ci } 142662306a36Sopenharmony_ci 142762306a36Sopenharmony_ci mem_hotplug_begin(); 142862306a36Sopenharmony_ci 142962306a36Sopenharmony_ci if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { 143062306a36Sopenharmony_ci if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED) 143162306a36Sopenharmony_ci memblock_flags = MEMBLOCK_DRIVER_MANAGED; 143262306a36Sopenharmony_ci ret = memblock_add_node(start, size, nid, memblock_flags); 143362306a36Sopenharmony_ci if (ret) 143462306a36Sopenharmony_ci goto error_mem_hotplug_end; 143562306a36Sopenharmony_ci } 143662306a36Sopenharmony_ci 143762306a36Sopenharmony_ci ret = __try_online_node(nid, false); 143862306a36Sopenharmony_ci if (ret < 0) 143962306a36Sopenharmony_ci goto error; 144062306a36Sopenharmony_ci new_node = ret; 144162306a36Sopenharmony_ci 144262306a36Sopenharmony_ci /* 144362306a36Sopenharmony_ci * Self hosted memmap array 144462306a36Sopenharmony_ci */ 144562306a36Sopenharmony_ci if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { 144662306a36Sopenharmony_ci if (mhp_supports_memmap_on_memory(size)) { 144762306a36Sopenharmony_ci mhp_altmap.free = memory_block_memmap_on_memory_pages(); 144862306a36Sopenharmony_ci params.altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL); 144962306a36Sopenharmony_ci if (!params.altmap) { 145062306a36Sopenharmony_ci ret = -ENOMEM; 145162306a36Sopenharmony_ci goto error; 145262306a36Sopenharmony_ci } 145362306a36Sopenharmony_ci 145462306a36Sopenharmony_ci memcpy(params.altmap, &mhp_altmap, sizeof(mhp_altmap)); 145562306a36Sopenharmony_ci } 145662306a36Sopenharmony_ci /* fallback to not using altmap */ 145762306a36Sopenharmony_ci } 145862306a36Sopenharmony_ci 145962306a36Sopenharmony_ci /* call arch's memory hotadd */ 146062306a36Sopenharmony_ci ret = arch_add_memory(nid, start, size, ¶ms); 146162306a36Sopenharmony_ci if 
(ret < 0) 146262306a36Sopenharmony_ci goto error_free; 146362306a36Sopenharmony_ci 146462306a36Sopenharmony_ci /* create memory block devices after memory was added */ 146562306a36Sopenharmony_ci ret = create_memory_block_devices(start, size, params.altmap, group); 146662306a36Sopenharmony_ci if (ret) { 146762306a36Sopenharmony_ci arch_remove_memory(start, size, params.altmap); 146862306a36Sopenharmony_ci goto error_free; 146962306a36Sopenharmony_ci } 147062306a36Sopenharmony_ci 147162306a36Sopenharmony_ci if (new_node) { 147262306a36Sopenharmony_ci /* If sysfs file of new node can't be created, cpu on the node 147362306a36Sopenharmony_ci * can't be hot-added. There is no rollback way now. 147462306a36Sopenharmony_ci * So, check by BUG_ON() to catch it reluctantly.. 147562306a36Sopenharmony_ci * We online node here. We can't roll back from here. 147662306a36Sopenharmony_ci */ 147762306a36Sopenharmony_ci node_set_online(nid); 147862306a36Sopenharmony_ci ret = __register_one_node(nid); 147962306a36Sopenharmony_ci BUG_ON(ret); 148062306a36Sopenharmony_ci } 148162306a36Sopenharmony_ci 148262306a36Sopenharmony_ci register_memory_blocks_under_node(nid, PFN_DOWN(start), 148362306a36Sopenharmony_ci PFN_UP(start + size - 1), 148462306a36Sopenharmony_ci MEMINIT_HOTPLUG); 148562306a36Sopenharmony_ci 148662306a36Sopenharmony_ci /* create new memmap entry */ 148762306a36Sopenharmony_ci if (!strcmp(res->name, "System RAM")) 148862306a36Sopenharmony_ci firmware_map_add_hotplug(start, start + size, "System RAM"); 148962306a36Sopenharmony_ci 149062306a36Sopenharmony_ci /* device_online() will take the lock when calling online_pages() */ 149162306a36Sopenharmony_ci mem_hotplug_done(); 149262306a36Sopenharmony_ci 149362306a36Sopenharmony_ci /* 149462306a36Sopenharmony_ci * In case we're allowed to merge the resource, flag it and trigger 149562306a36Sopenharmony_ci * merging now that adding succeeded. 
149662306a36Sopenharmony_ci */ 149762306a36Sopenharmony_ci if (mhp_flags & MHP_MERGE_RESOURCE) 149862306a36Sopenharmony_ci merge_system_ram_resource(res); 149962306a36Sopenharmony_ci 150062306a36Sopenharmony_ci /* online pages if requested */ 150162306a36Sopenharmony_ci if (mhp_default_online_type != MMOP_OFFLINE) 150262306a36Sopenharmony_ci walk_memory_blocks(start, size, NULL, online_memory_block); 150362306a36Sopenharmony_ci 150462306a36Sopenharmony_ci return ret; 150562306a36Sopenharmony_cierror_free: 150662306a36Sopenharmony_ci kfree(params.altmap); 150762306a36Sopenharmony_cierror: 150862306a36Sopenharmony_ci if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) 150962306a36Sopenharmony_ci memblock_remove(start, size); 151062306a36Sopenharmony_cierror_mem_hotplug_end: 151162306a36Sopenharmony_ci mem_hotplug_done(); 151262306a36Sopenharmony_ci return ret; 151362306a36Sopenharmony_ci} 151462306a36Sopenharmony_ci 151562306a36Sopenharmony_ci/* requires device_hotplug_lock, see add_memory_resource() */ 151662306a36Sopenharmony_ciint __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) 151762306a36Sopenharmony_ci{ 151862306a36Sopenharmony_ci struct resource *res; 151962306a36Sopenharmony_ci int ret; 152062306a36Sopenharmony_ci 152162306a36Sopenharmony_ci res = register_memory_resource(start, size, "System RAM"); 152262306a36Sopenharmony_ci if (IS_ERR(res)) 152362306a36Sopenharmony_ci return PTR_ERR(res); 152462306a36Sopenharmony_ci 152562306a36Sopenharmony_ci ret = add_memory_resource(nid, res, mhp_flags); 152662306a36Sopenharmony_ci if (ret < 0) 152762306a36Sopenharmony_ci release_memory_resource(res); 152862306a36Sopenharmony_ci return ret; 152962306a36Sopenharmony_ci} 153062306a36Sopenharmony_ci 153162306a36Sopenharmony_ciint add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) 153262306a36Sopenharmony_ci{ 153362306a36Sopenharmony_ci int rc; 153462306a36Sopenharmony_ci 153562306a36Sopenharmony_ci lock_device_hotplug(); 153662306a36Sopenharmony_ci rc = 
__add_memory(nid, start, size, mhp_flags); 153762306a36Sopenharmony_ci unlock_device_hotplug(); 153862306a36Sopenharmony_ci 153962306a36Sopenharmony_ci return rc; 154062306a36Sopenharmony_ci} 154162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(add_memory); 154262306a36Sopenharmony_ci 154362306a36Sopenharmony_ci/* 154462306a36Sopenharmony_ci * Add special, driver-managed memory to the system as system RAM. Such 154562306a36Sopenharmony_ci * memory is not exposed via the raw firmware-provided memmap as system 154662306a36Sopenharmony_ci * RAM, instead, it is detected and added by a driver - during cold boot, 154762306a36Sopenharmony_ci * after a reboot, and after kexec. 154862306a36Sopenharmony_ci * 154962306a36Sopenharmony_ci * Reasons why this memory should not be used for the initial memmap of a 155062306a36Sopenharmony_ci * kexec kernel or for placing kexec images: 155162306a36Sopenharmony_ci * - The booting kernel is in charge of determining how this memory will be 155262306a36Sopenharmony_ci * used (e.g., use persistent memory as system RAM) 155362306a36Sopenharmony_ci * - Coordination with a hypervisor is required before this memory 155462306a36Sopenharmony_ci * can be used (e.g., inaccessible parts). 155562306a36Sopenharmony_ci * 155662306a36Sopenharmony_ci * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided 155762306a36Sopenharmony_ci * memory map") are created. Also, the created memory resource is flagged 155862306a36Sopenharmony_ci * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case 155962306a36Sopenharmony_ci * this memory as well (esp., not place kexec images onto it). 156062306a36Sopenharmony_ci * 156162306a36Sopenharmony_ci * The resource_name (visible via /proc/iomem) has to have the format 156262306a36Sopenharmony_ci * "System RAM ($DRIVER)". 
156362306a36Sopenharmony_ci */ 156462306a36Sopenharmony_ciint add_memory_driver_managed(int nid, u64 start, u64 size, 156562306a36Sopenharmony_ci const char *resource_name, mhp_t mhp_flags) 156662306a36Sopenharmony_ci{ 156762306a36Sopenharmony_ci struct resource *res; 156862306a36Sopenharmony_ci int rc; 156962306a36Sopenharmony_ci 157062306a36Sopenharmony_ci if (!resource_name || 157162306a36Sopenharmony_ci strstr(resource_name, "System RAM (") != resource_name || 157262306a36Sopenharmony_ci resource_name[strlen(resource_name) - 1] != ')') 157362306a36Sopenharmony_ci return -EINVAL; 157462306a36Sopenharmony_ci 157562306a36Sopenharmony_ci lock_device_hotplug(); 157662306a36Sopenharmony_ci 157762306a36Sopenharmony_ci res = register_memory_resource(start, size, resource_name); 157862306a36Sopenharmony_ci if (IS_ERR(res)) { 157962306a36Sopenharmony_ci rc = PTR_ERR(res); 158062306a36Sopenharmony_ci goto out_unlock; 158162306a36Sopenharmony_ci } 158262306a36Sopenharmony_ci 158362306a36Sopenharmony_ci rc = add_memory_resource(nid, res, mhp_flags); 158462306a36Sopenharmony_ci if (rc < 0) 158562306a36Sopenharmony_ci release_memory_resource(res); 158662306a36Sopenharmony_ci 158762306a36Sopenharmony_ciout_unlock: 158862306a36Sopenharmony_ci unlock_device_hotplug(); 158962306a36Sopenharmony_ci return rc; 159062306a36Sopenharmony_ci} 159162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(add_memory_driver_managed); 159262306a36Sopenharmony_ci 159362306a36Sopenharmony_ci/* 159462306a36Sopenharmony_ci * Platforms should define arch_get_mappable_range() that provides 159562306a36Sopenharmony_ci * maximum possible addressable physical memory range for which the 159662306a36Sopenharmony_ci * linear mapping could be created. The platform returned address 159762306a36Sopenharmony_ci * range must adhere to these following semantics. 
159862306a36Sopenharmony_ci * 159962306a36Sopenharmony_ci * - range.start <= range.end 160062306a36Sopenharmony_ci * - Range includes both end points [range.start..range.end] 160162306a36Sopenharmony_ci * 160262306a36Sopenharmony_ci * There is also a fallback definition provided here, allowing the 160362306a36Sopenharmony_ci * entire possible physical address range in case any platform does 160462306a36Sopenharmony_ci * not define arch_get_mappable_range(). 160562306a36Sopenharmony_ci */ 160662306a36Sopenharmony_cistruct range __weak arch_get_mappable_range(void) 160762306a36Sopenharmony_ci{ 160862306a36Sopenharmony_ci struct range mhp_range = { 160962306a36Sopenharmony_ci .start = 0UL, 161062306a36Sopenharmony_ci .end = -1ULL, 161162306a36Sopenharmony_ci }; 161262306a36Sopenharmony_ci return mhp_range; 161362306a36Sopenharmony_ci} 161462306a36Sopenharmony_ci 161562306a36Sopenharmony_cistruct range mhp_get_pluggable_range(bool need_mapping) 161662306a36Sopenharmony_ci{ 161762306a36Sopenharmony_ci const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1; 161862306a36Sopenharmony_ci struct range mhp_range; 161962306a36Sopenharmony_ci 162062306a36Sopenharmony_ci if (need_mapping) { 162162306a36Sopenharmony_ci mhp_range = arch_get_mappable_range(); 162262306a36Sopenharmony_ci if (mhp_range.start > max_phys) { 162362306a36Sopenharmony_ci mhp_range.start = 0; 162462306a36Sopenharmony_ci mhp_range.end = 0; 162562306a36Sopenharmony_ci } 162662306a36Sopenharmony_ci mhp_range.end = min_t(u64, mhp_range.end, max_phys); 162762306a36Sopenharmony_ci } else { 162862306a36Sopenharmony_ci mhp_range.start = 0; 162962306a36Sopenharmony_ci mhp_range.end = max_phys; 163062306a36Sopenharmony_ci } 163162306a36Sopenharmony_ci return mhp_range; 163262306a36Sopenharmony_ci} 163362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(mhp_get_pluggable_range); 163462306a36Sopenharmony_ci 163562306a36Sopenharmony_cibool mhp_range_allowed(u64 start, u64 size, bool need_mapping) 163662306a36Sopenharmony_ci{ 
163762306a36Sopenharmony_ci struct range mhp_range = mhp_get_pluggable_range(need_mapping); 163862306a36Sopenharmony_ci u64 end = start + size; 163962306a36Sopenharmony_ci 164062306a36Sopenharmony_ci if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end) 164162306a36Sopenharmony_ci return true; 164262306a36Sopenharmony_ci 164362306a36Sopenharmony_ci pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n", 164462306a36Sopenharmony_ci start, end, mhp_range.start, mhp_range.end); 164562306a36Sopenharmony_ci return false; 164662306a36Sopenharmony_ci} 164762306a36Sopenharmony_ci 164862306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_HOTREMOVE 164962306a36Sopenharmony_ci/* 165062306a36Sopenharmony_ci * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, 165162306a36Sopenharmony_ci * non-lru movable pages and hugepages). Will skip over most unmovable 165262306a36Sopenharmony_ci * pages (esp., pages that can be skipped when offlining), but bail out on 165362306a36Sopenharmony_ci * definitely unmovable pages. 165462306a36Sopenharmony_ci * 165562306a36Sopenharmony_ci * Returns: 165662306a36Sopenharmony_ci * 0 in case a movable page is found and movable_pfn was updated. 165762306a36Sopenharmony_ci * -ENOENT in case no movable page was found. 165862306a36Sopenharmony_ci * -EBUSY in case a definitely unmovable page was found. 
165962306a36Sopenharmony_ci */ 166062306a36Sopenharmony_cistatic int scan_movable_pages(unsigned long start, unsigned long end, 166162306a36Sopenharmony_ci unsigned long *movable_pfn) 166262306a36Sopenharmony_ci{ 166362306a36Sopenharmony_ci unsigned long pfn; 166462306a36Sopenharmony_ci 166562306a36Sopenharmony_ci for (pfn = start; pfn < end; pfn++) { 166662306a36Sopenharmony_ci struct page *page, *head; 166762306a36Sopenharmony_ci unsigned long skip; 166862306a36Sopenharmony_ci 166962306a36Sopenharmony_ci if (!pfn_valid(pfn)) 167062306a36Sopenharmony_ci continue; 167162306a36Sopenharmony_ci page = pfn_to_page(pfn); 167262306a36Sopenharmony_ci if (PageLRU(page)) 167362306a36Sopenharmony_ci goto found; 167462306a36Sopenharmony_ci if (__PageMovable(page)) 167562306a36Sopenharmony_ci goto found; 167662306a36Sopenharmony_ci 167762306a36Sopenharmony_ci /* 167862306a36Sopenharmony_ci * PageOffline() pages that are not marked __PageMovable() and 167962306a36Sopenharmony_ci * have a reference count > 0 (after MEM_GOING_OFFLINE) are 168062306a36Sopenharmony_ci * definitely unmovable. If their reference count would be 0, 168162306a36Sopenharmony_ci * they could at least be skipped when offlining memory. 168262306a36Sopenharmony_ci */ 168362306a36Sopenharmony_ci if (PageOffline(page) && page_count(page)) 168462306a36Sopenharmony_ci return -EBUSY; 168562306a36Sopenharmony_ci 168662306a36Sopenharmony_ci if (!PageHuge(page)) 168762306a36Sopenharmony_ci continue; 168862306a36Sopenharmony_ci head = compound_head(page); 168962306a36Sopenharmony_ci /* 169062306a36Sopenharmony_ci * This test is racy as we hold no reference or lock. The 169162306a36Sopenharmony_ci * hugetlb page could have been free'ed and head is no longer 169262306a36Sopenharmony_ci * a hugetlb page before the following check. In such unlikely 169362306a36Sopenharmony_ci * cases false positives and negatives are possible. Calling 169462306a36Sopenharmony_ci * code must deal with these scenarios. 
169562306a36Sopenharmony_ci */ 169662306a36Sopenharmony_ci if (HPageMigratable(head)) 169762306a36Sopenharmony_ci goto found; 169862306a36Sopenharmony_ci skip = compound_nr(head) - (pfn - page_to_pfn(head)); 169962306a36Sopenharmony_ci pfn += skip - 1; 170062306a36Sopenharmony_ci } 170162306a36Sopenharmony_ci return -ENOENT; 170262306a36Sopenharmony_cifound: 170362306a36Sopenharmony_ci *movable_pfn = pfn; 170462306a36Sopenharmony_ci return 0; 170562306a36Sopenharmony_ci} 170662306a36Sopenharmony_ci 170762306a36Sopenharmony_cistatic void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 170862306a36Sopenharmony_ci{ 170962306a36Sopenharmony_ci unsigned long pfn; 171062306a36Sopenharmony_ci struct page *page, *head; 171162306a36Sopenharmony_ci LIST_HEAD(source); 171262306a36Sopenharmony_ci static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, 171362306a36Sopenharmony_ci DEFAULT_RATELIMIT_BURST); 171462306a36Sopenharmony_ci 171562306a36Sopenharmony_ci for (pfn = start_pfn; pfn < end_pfn; pfn++) { 171662306a36Sopenharmony_ci struct folio *folio; 171762306a36Sopenharmony_ci bool isolated; 171862306a36Sopenharmony_ci 171962306a36Sopenharmony_ci if (!pfn_valid(pfn)) 172062306a36Sopenharmony_ci continue; 172162306a36Sopenharmony_ci page = pfn_to_page(pfn); 172262306a36Sopenharmony_ci folio = page_folio(page); 172362306a36Sopenharmony_ci head = &folio->page; 172462306a36Sopenharmony_ci 172562306a36Sopenharmony_ci if (PageHuge(page)) { 172662306a36Sopenharmony_ci pfn = page_to_pfn(head) + compound_nr(head) - 1; 172762306a36Sopenharmony_ci isolate_hugetlb(folio, &source); 172862306a36Sopenharmony_ci continue; 172962306a36Sopenharmony_ci } else if (PageTransHuge(page)) 173062306a36Sopenharmony_ci pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; 173162306a36Sopenharmony_ci 173262306a36Sopenharmony_ci /* 173362306a36Sopenharmony_ci * HWPoison pages have elevated reference counts so the migration would 173462306a36Sopenharmony_ci * fail on them. 
It also doesn't make any sense to migrate them in the 173562306a36Sopenharmony_ci * first place. Still try to unmap such a page in case it is still mapped 173662306a36Sopenharmony_ci * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep 173762306a36Sopenharmony_ci * the unmap as the catch all safety net). 173862306a36Sopenharmony_ci */ 173962306a36Sopenharmony_ci if (PageHWPoison(page)) { 174062306a36Sopenharmony_ci if (WARN_ON(folio_test_lru(folio))) 174162306a36Sopenharmony_ci folio_isolate_lru(folio); 174262306a36Sopenharmony_ci if (folio_mapped(folio)) 174362306a36Sopenharmony_ci try_to_unmap(folio, TTU_IGNORE_MLOCK); 174462306a36Sopenharmony_ci continue; 174562306a36Sopenharmony_ci } 174662306a36Sopenharmony_ci 174762306a36Sopenharmony_ci if (!get_page_unless_zero(page)) 174862306a36Sopenharmony_ci continue; 174962306a36Sopenharmony_ci /* 175062306a36Sopenharmony_ci * We can skip free pages. And we can deal with pages on 175162306a36Sopenharmony_ci * LRU and non-lru movable pages. 
175262306a36Sopenharmony_ci */ 175362306a36Sopenharmony_ci if (PageLRU(page)) 175462306a36Sopenharmony_ci isolated = isolate_lru_page(page); 175562306a36Sopenharmony_ci else 175662306a36Sopenharmony_ci isolated = isolate_movable_page(page, ISOLATE_UNEVICTABLE); 175762306a36Sopenharmony_ci if (isolated) { 175862306a36Sopenharmony_ci list_add_tail(&page->lru, &source); 175962306a36Sopenharmony_ci if (!__PageMovable(page)) 176062306a36Sopenharmony_ci inc_node_page_state(page, NR_ISOLATED_ANON + 176162306a36Sopenharmony_ci page_is_file_lru(page)); 176262306a36Sopenharmony_ci 176362306a36Sopenharmony_ci } else { 176462306a36Sopenharmony_ci if (__ratelimit(&migrate_rs)) { 176562306a36Sopenharmony_ci pr_warn("failed to isolate pfn %lx\n", pfn); 176662306a36Sopenharmony_ci dump_page(page, "isolation failed"); 176762306a36Sopenharmony_ci } 176862306a36Sopenharmony_ci } 176962306a36Sopenharmony_ci put_page(page); 177062306a36Sopenharmony_ci } 177162306a36Sopenharmony_ci if (!list_empty(&source)) { 177262306a36Sopenharmony_ci nodemask_t nmask = node_states[N_MEMORY]; 177362306a36Sopenharmony_ci struct migration_target_control mtc = { 177462306a36Sopenharmony_ci .nmask = &nmask, 177562306a36Sopenharmony_ci .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, 177662306a36Sopenharmony_ci }; 177762306a36Sopenharmony_ci int ret; 177862306a36Sopenharmony_ci 177962306a36Sopenharmony_ci /* 178062306a36Sopenharmony_ci * We have checked that migration range is on a single zone so 178162306a36Sopenharmony_ci * we can use the nid of the first page to all the others. 178262306a36Sopenharmony_ci */ 178362306a36Sopenharmony_ci mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru)); 178462306a36Sopenharmony_ci 178562306a36Sopenharmony_ci /* 178662306a36Sopenharmony_ci * try to allocate from a different node but reuse this node 178762306a36Sopenharmony_ci * if there are no other online nodes to be used (e.g. 
we are 178862306a36Sopenharmony_ci * offlining a part of the only existing node) 178962306a36Sopenharmony_ci */ 179062306a36Sopenharmony_ci node_clear(mtc.nid, nmask); 179162306a36Sopenharmony_ci if (nodes_empty(nmask)) 179262306a36Sopenharmony_ci node_set(mtc.nid, nmask); 179362306a36Sopenharmony_ci ret = migrate_pages(&source, alloc_migration_target, NULL, 179462306a36Sopenharmony_ci (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL); 179562306a36Sopenharmony_ci if (ret) { 179662306a36Sopenharmony_ci list_for_each_entry(page, &source, lru) { 179762306a36Sopenharmony_ci if (__ratelimit(&migrate_rs)) { 179862306a36Sopenharmony_ci pr_warn("migrating pfn %lx failed ret:%d\n", 179962306a36Sopenharmony_ci page_to_pfn(page), ret); 180062306a36Sopenharmony_ci dump_page(page, "migration failure"); 180162306a36Sopenharmony_ci } 180262306a36Sopenharmony_ci } 180362306a36Sopenharmony_ci putback_movable_pages(&source); 180462306a36Sopenharmony_ci } 180562306a36Sopenharmony_ci } 180662306a36Sopenharmony_ci} 180762306a36Sopenharmony_ci 180862306a36Sopenharmony_cistatic int __init cmdline_parse_movable_node(char *p) 180962306a36Sopenharmony_ci{ 181062306a36Sopenharmony_ci movable_node_enabled = true; 181162306a36Sopenharmony_ci return 0; 181262306a36Sopenharmony_ci} 181362306a36Sopenharmony_ciearly_param("movable_node", cmdline_parse_movable_node); 181462306a36Sopenharmony_ci 181562306a36Sopenharmony_ci/* check which state of node_states will be changed when offline memory */ 181662306a36Sopenharmony_cistatic void node_states_check_changes_offline(unsigned long nr_pages, 181762306a36Sopenharmony_ci struct zone *zone, struct memory_notify *arg) 181862306a36Sopenharmony_ci{ 181962306a36Sopenharmony_ci struct pglist_data *pgdat = zone->zone_pgdat; 182062306a36Sopenharmony_ci unsigned long present_pages = 0; 182162306a36Sopenharmony_ci enum zone_type zt; 182262306a36Sopenharmony_ci 182362306a36Sopenharmony_ci arg->status_change_nid = NUMA_NO_NODE; 
182462306a36Sopenharmony_ci arg->status_change_nid_normal = NUMA_NO_NODE; 182562306a36Sopenharmony_ci 182662306a36Sopenharmony_ci /* 182762306a36Sopenharmony_ci * Check whether node_states[N_NORMAL_MEMORY] will be changed. 182862306a36Sopenharmony_ci * If the memory to be offline is within the range 182962306a36Sopenharmony_ci * [0..ZONE_NORMAL], and it is the last present memory there, 183062306a36Sopenharmony_ci * the zones in that range will become empty after the offlining, 183162306a36Sopenharmony_ci * thus we can determine that we need to clear the node from 183262306a36Sopenharmony_ci * node_states[N_NORMAL_MEMORY]. 183362306a36Sopenharmony_ci */ 183462306a36Sopenharmony_ci for (zt = 0; zt <= ZONE_NORMAL; zt++) 183562306a36Sopenharmony_ci present_pages += pgdat->node_zones[zt].present_pages; 183662306a36Sopenharmony_ci if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) 183762306a36Sopenharmony_ci arg->status_change_nid_normal = zone_to_nid(zone); 183862306a36Sopenharmony_ci 183962306a36Sopenharmony_ci /* 184062306a36Sopenharmony_ci * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM 184162306a36Sopenharmony_ci * does not apply as we don't support 32bit. 184262306a36Sopenharmony_ci * Here we count the possible pages from ZONE_MOVABLE. 184362306a36Sopenharmony_ci * If after having accounted all the pages, we see that the nr_pages 184462306a36Sopenharmony_ci * to be offlined is over or equal to the accounted pages, 184562306a36Sopenharmony_ci * we know that the node will become empty, and so, we can clear 184662306a36Sopenharmony_ci * it for N_MEMORY as well. 
184762306a36Sopenharmony_ci */ 184862306a36Sopenharmony_ci present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages; 184962306a36Sopenharmony_ci 185062306a36Sopenharmony_ci if (nr_pages >= present_pages) 185162306a36Sopenharmony_ci arg->status_change_nid = zone_to_nid(zone); 185262306a36Sopenharmony_ci} 185362306a36Sopenharmony_ci 185462306a36Sopenharmony_cistatic void node_states_clear_node(int node, struct memory_notify *arg) 185562306a36Sopenharmony_ci{ 185662306a36Sopenharmony_ci if (arg->status_change_nid_normal >= 0) 185762306a36Sopenharmony_ci node_clear_state(node, N_NORMAL_MEMORY); 185862306a36Sopenharmony_ci 185962306a36Sopenharmony_ci if (arg->status_change_nid >= 0) 186062306a36Sopenharmony_ci node_clear_state(node, N_MEMORY); 186162306a36Sopenharmony_ci} 186262306a36Sopenharmony_ci 186362306a36Sopenharmony_cistatic int count_system_ram_pages_cb(unsigned long start_pfn, 186462306a36Sopenharmony_ci unsigned long nr_pages, void *data) 186562306a36Sopenharmony_ci{ 186662306a36Sopenharmony_ci unsigned long *nr_system_ram_pages = data; 186762306a36Sopenharmony_ci 186862306a36Sopenharmony_ci *nr_system_ram_pages += nr_pages; 186962306a36Sopenharmony_ci return 0; 187062306a36Sopenharmony_ci} 187162306a36Sopenharmony_ci 187262306a36Sopenharmony_ci/* 187362306a36Sopenharmony_ci * Must be called with mem_hotplug_lock in write mode. 
187462306a36Sopenharmony_ci */ 187562306a36Sopenharmony_ciint __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, 187662306a36Sopenharmony_ci struct zone *zone, struct memory_group *group) 187762306a36Sopenharmony_ci{ 187862306a36Sopenharmony_ci const unsigned long end_pfn = start_pfn + nr_pages; 187962306a36Sopenharmony_ci unsigned long pfn, system_ram_pages = 0; 188062306a36Sopenharmony_ci const int node = zone_to_nid(zone); 188162306a36Sopenharmony_ci unsigned long flags; 188262306a36Sopenharmony_ci struct memory_notify arg; 188362306a36Sopenharmony_ci char *reason; 188462306a36Sopenharmony_ci int ret; 188562306a36Sopenharmony_ci 188662306a36Sopenharmony_ci /* 188762306a36Sopenharmony_ci * {on,off}lining is constrained to full memory sections (or more 188862306a36Sopenharmony_ci * precisely to memory blocks from the user space POV). 188962306a36Sopenharmony_ci * memmap_on_memory is an exception because it reserves initial part 189062306a36Sopenharmony_ci * of the physical memory space for vmemmaps. That space is pageblock 189162306a36Sopenharmony_ci * aligned. 189262306a36Sopenharmony_ci */ 189362306a36Sopenharmony_ci if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(start_pfn) || 189462306a36Sopenharmony_ci !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) 189562306a36Sopenharmony_ci return -EINVAL; 189662306a36Sopenharmony_ci 189762306a36Sopenharmony_ci /* 189862306a36Sopenharmony_ci * Don't allow to offline memory blocks that contain holes. 189962306a36Sopenharmony_ci * Consequently, memory blocks with holes can never get onlined 190062306a36Sopenharmony_ci * via the hotplug path - online_pages() - as hotplugged memory has 190162306a36Sopenharmony_ci * no holes. This way, we e.g., don't have to worry about marking 190262306a36Sopenharmony_ci * memory holes PG_reserved, don't need pfn_valid() checks, and can 190362306a36Sopenharmony_ci * avoid using walk_system_ram_range() later. 
190462306a36Sopenharmony_ci */ 190562306a36Sopenharmony_ci walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages, 190662306a36Sopenharmony_ci count_system_ram_pages_cb); 190762306a36Sopenharmony_ci if (system_ram_pages != nr_pages) { 190862306a36Sopenharmony_ci ret = -EINVAL; 190962306a36Sopenharmony_ci reason = "memory holes"; 191062306a36Sopenharmony_ci goto failed_removal; 191162306a36Sopenharmony_ci } 191262306a36Sopenharmony_ci 191362306a36Sopenharmony_ci /* 191462306a36Sopenharmony_ci * We only support offlining of memory blocks managed by a single zone, 191562306a36Sopenharmony_ci * checked by calling code. This is just a sanity check that we might 191662306a36Sopenharmony_ci * want to remove in the future. 191762306a36Sopenharmony_ci */ 191862306a36Sopenharmony_ci if (WARN_ON_ONCE(page_zone(pfn_to_page(start_pfn)) != zone || 191962306a36Sopenharmony_ci page_zone(pfn_to_page(end_pfn - 1)) != zone)) { 192062306a36Sopenharmony_ci ret = -EINVAL; 192162306a36Sopenharmony_ci reason = "multizone range"; 192262306a36Sopenharmony_ci goto failed_removal; 192362306a36Sopenharmony_ci } 192462306a36Sopenharmony_ci 192562306a36Sopenharmony_ci /* 192662306a36Sopenharmony_ci * Disable pcplists so that page isolation cannot race with freeing 192762306a36Sopenharmony_ci * in a way that pages from isolated pageblock are left on pcplists. 
192862306a36Sopenharmony_ci */ 192962306a36Sopenharmony_ci zone_pcp_disable(zone); 193062306a36Sopenharmony_ci lru_cache_disable(); 193162306a36Sopenharmony_ci 193262306a36Sopenharmony_ci /* set above range as isolated */ 193362306a36Sopenharmony_ci ret = start_isolate_page_range(start_pfn, end_pfn, 193462306a36Sopenharmony_ci MIGRATE_MOVABLE, 193562306a36Sopenharmony_ci MEMORY_OFFLINE | REPORT_FAILURE, 193662306a36Sopenharmony_ci GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL); 193762306a36Sopenharmony_ci if (ret) { 193862306a36Sopenharmony_ci reason = "failure to isolate range"; 193962306a36Sopenharmony_ci goto failed_removal_pcplists_disabled; 194062306a36Sopenharmony_ci } 194162306a36Sopenharmony_ci 194262306a36Sopenharmony_ci arg.start_pfn = start_pfn; 194362306a36Sopenharmony_ci arg.nr_pages = nr_pages; 194462306a36Sopenharmony_ci node_states_check_changes_offline(nr_pages, zone, &arg); 194562306a36Sopenharmony_ci 194662306a36Sopenharmony_ci ret = memory_notify(MEM_GOING_OFFLINE, &arg); 194762306a36Sopenharmony_ci ret = notifier_to_errno(ret); 194862306a36Sopenharmony_ci if (ret) { 194962306a36Sopenharmony_ci reason = "notifier failure"; 195062306a36Sopenharmony_ci goto failed_removal_isolated; 195162306a36Sopenharmony_ci } 195262306a36Sopenharmony_ci 195362306a36Sopenharmony_ci do { 195462306a36Sopenharmony_ci pfn = start_pfn; 195562306a36Sopenharmony_ci do { 195662306a36Sopenharmony_ci /* 195762306a36Sopenharmony_ci * Historically we always checked for any signal and 195862306a36Sopenharmony_ci * can't limit it to fatal signals without eventually 195962306a36Sopenharmony_ci * breaking user space. 
196062306a36Sopenharmony_ci */ 196162306a36Sopenharmony_ci if (signal_pending(current)) { 196262306a36Sopenharmony_ci ret = -EINTR; 196362306a36Sopenharmony_ci reason = "signal backoff"; 196462306a36Sopenharmony_ci goto failed_removal_isolated; 196562306a36Sopenharmony_ci } 196662306a36Sopenharmony_ci 196762306a36Sopenharmony_ci cond_resched(); 196862306a36Sopenharmony_ci 196962306a36Sopenharmony_ci ret = scan_movable_pages(pfn, end_pfn, &pfn); 197062306a36Sopenharmony_ci if (!ret) { 197162306a36Sopenharmony_ci /* 197262306a36Sopenharmony_ci * TODO: fatal migration failures should bail 197362306a36Sopenharmony_ci * out 197462306a36Sopenharmony_ci */ 197562306a36Sopenharmony_ci do_migrate_range(pfn, end_pfn); 197662306a36Sopenharmony_ci } 197762306a36Sopenharmony_ci } while (!ret); 197862306a36Sopenharmony_ci 197962306a36Sopenharmony_ci if (ret != -ENOENT) { 198062306a36Sopenharmony_ci reason = "unmovable page"; 198162306a36Sopenharmony_ci goto failed_removal_isolated; 198262306a36Sopenharmony_ci } 198362306a36Sopenharmony_ci 198462306a36Sopenharmony_ci /* 198562306a36Sopenharmony_ci * Dissolve free hugepages in the memory block before doing 198662306a36Sopenharmony_ci * offlining actually in order to make hugetlbfs's object 198762306a36Sopenharmony_ci * counting consistent. 198862306a36Sopenharmony_ci */ 198962306a36Sopenharmony_ci ret = dissolve_free_huge_pages(start_pfn, end_pfn); 199062306a36Sopenharmony_ci if (ret) { 199162306a36Sopenharmony_ci reason = "failure to dissolve huge pages"; 199262306a36Sopenharmony_ci goto failed_removal_isolated; 199362306a36Sopenharmony_ci } 199462306a36Sopenharmony_ci 199562306a36Sopenharmony_ci ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE); 199662306a36Sopenharmony_ci 199762306a36Sopenharmony_ci } while (ret); 199862306a36Sopenharmony_ci 199962306a36Sopenharmony_ci /* Mark all sections offline and remove free pages from the buddy. 
*/ 200062306a36Sopenharmony_ci __offline_isolated_pages(start_pfn, end_pfn); 200162306a36Sopenharmony_ci pr_debug("Offlined Pages %ld\n", nr_pages); 200262306a36Sopenharmony_ci 200362306a36Sopenharmony_ci /* 200462306a36Sopenharmony_ci * The memory sections are marked offline, and the pageblock flags 200562306a36Sopenharmony_ci * effectively stale; nobody should be touching them. Fixup the number 200662306a36Sopenharmony_ci * of isolated pageblocks, memory onlining will properly revert this. 200762306a36Sopenharmony_ci */ 200862306a36Sopenharmony_ci spin_lock_irqsave(&zone->lock, flags); 200962306a36Sopenharmony_ci zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages; 201062306a36Sopenharmony_ci spin_unlock_irqrestore(&zone->lock, flags); 201162306a36Sopenharmony_ci 201262306a36Sopenharmony_ci lru_cache_enable(); 201362306a36Sopenharmony_ci zone_pcp_enable(zone); 201462306a36Sopenharmony_ci 201562306a36Sopenharmony_ci /* removal success */ 201662306a36Sopenharmony_ci adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); 201762306a36Sopenharmony_ci adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages); 201862306a36Sopenharmony_ci 201962306a36Sopenharmony_ci /* reinitialise watermarks and update pcp limits */ 202062306a36Sopenharmony_ci init_per_zone_wmark_min(); 202162306a36Sopenharmony_ci 202262306a36Sopenharmony_ci if (!populated_zone(zone)) { 202362306a36Sopenharmony_ci zone_pcp_reset(zone); 202462306a36Sopenharmony_ci build_all_zonelists(NULL); 202562306a36Sopenharmony_ci } 202662306a36Sopenharmony_ci 202762306a36Sopenharmony_ci node_states_clear_node(node, &arg); 202862306a36Sopenharmony_ci if (arg.status_change_nid >= 0) { 202962306a36Sopenharmony_ci kcompactd_stop(node); 203062306a36Sopenharmony_ci kswapd_stop(node); 203162306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_ZSWAPD 203262306a36Sopenharmony_ci zswapd_stop(node); 203362306a36Sopenharmony_ci#endif 203462306a36Sopenharmony_ci } 203562306a36Sopenharmony_ci 
203662306a36Sopenharmony_ci writeback_set_ratelimit(); 203762306a36Sopenharmony_ci 203862306a36Sopenharmony_ci memory_notify(MEM_OFFLINE, &arg); 203962306a36Sopenharmony_ci remove_pfn_range_from_zone(zone, start_pfn, nr_pages); 204062306a36Sopenharmony_ci return 0; 204162306a36Sopenharmony_ci 204262306a36Sopenharmony_cifailed_removal_isolated: 204362306a36Sopenharmony_ci /* pushback to free area */ 204462306a36Sopenharmony_ci undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 204562306a36Sopenharmony_ci memory_notify(MEM_CANCEL_OFFLINE, &arg); 204662306a36Sopenharmony_cifailed_removal_pcplists_disabled: 204762306a36Sopenharmony_ci lru_cache_enable(); 204862306a36Sopenharmony_ci zone_pcp_enable(zone); 204962306a36Sopenharmony_cifailed_removal: 205062306a36Sopenharmony_ci pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n", 205162306a36Sopenharmony_ci (unsigned long long) start_pfn << PAGE_SHIFT, 205262306a36Sopenharmony_ci ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, 205362306a36Sopenharmony_ci reason); 205462306a36Sopenharmony_ci return ret; 205562306a36Sopenharmony_ci} 205662306a36Sopenharmony_ci 205762306a36Sopenharmony_cistatic int check_memblock_offlined_cb(struct memory_block *mem, void *arg) 205862306a36Sopenharmony_ci{ 205962306a36Sopenharmony_ci int *nid = arg; 206062306a36Sopenharmony_ci 206162306a36Sopenharmony_ci *nid = mem->nid; 206262306a36Sopenharmony_ci if (unlikely(mem->state != MEM_OFFLINE)) { 206362306a36Sopenharmony_ci phys_addr_t beginpa, endpa; 206462306a36Sopenharmony_ci 206562306a36Sopenharmony_ci beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); 206662306a36Sopenharmony_ci endpa = beginpa + memory_block_size_bytes() - 1; 206762306a36Sopenharmony_ci pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", 206862306a36Sopenharmony_ci &beginpa, &endpa); 206962306a36Sopenharmony_ci 207062306a36Sopenharmony_ci return -EBUSY; 207162306a36Sopenharmony_ci } 207262306a36Sopenharmony_ci 
return 0; 207362306a36Sopenharmony_ci} 207462306a36Sopenharmony_ci 207562306a36Sopenharmony_cistatic int test_has_altmap_cb(struct memory_block *mem, void *arg) 207662306a36Sopenharmony_ci{ 207762306a36Sopenharmony_ci struct memory_block **mem_ptr = (struct memory_block **)arg; 207862306a36Sopenharmony_ci /* 207962306a36Sopenharmony_ci * return the memblock if we have altmap 208062306a36Sopenharmony_ci * and break callback. 208162306a36Sopenharmony_ci */ 208262306a36Sopenharmony_ci if (mem->altmap) { 208362306a36Sopenharmony_ci *mem_ptr = mem; 208462306a36Sopenharmony_ci return 1; 208562306a36Sopenharmony_ci } 208662306a36Sopenharmony_ci return 0; 208762306a36Sopenharmony_ci} 208862306a36Sopenharmony_ci 208962306a36Sopenharmony_cistatic int check_cpu_on_node(int nid) 209062306a36Sopenharmony_ci{ 209162306a36Sopenharmony_ci int cpu; 209262306a36Sopenharmony_ci 209362306a36Sopenharmony_ci for_each_present_cpu(cpu) { 209462306a36Sopenharmony_ci if (cpu_to_node(cpu) == nid) 209562306a36Sopenharmony_ci /* 209662306a36Sopenharmony_ci * the cpu on this node isn't removed, and we can't 209762306a36Sopenharmony_ci * offline this node. 209862306a36Sopenharmony_ci */ 209962306a36Sopenharmony_ci return -EBUSY; 210062306a36Sopenharmony_ci } 210162306a36Sopenharmony_ci 210262306a36Sopenharmony_ci return 0; 210362306a36Sopenharmony_ci} 210462306a36Sopenharmony_ci 210562306a36Sopenharmony_cistatic int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg) 210662306a36Sopenharmony_ci{ 210762306a36Sopenharmony_ci int nid = *(int *)arg; 210862306a36Sopenharmony_ci 210962306a36Sopenharmony_ci /* 211062306a36Sopenharmony_ci * If a memory block belongs to multiple nodes, the stored nid is not 211162306a36Sopenharmony_ci * reliable. However, such blocks are always online (e.g., cannot get 211262306a36Sopenharmony_ci * offlined) and, therefore, are still spanned by the node. 211362306a36Sopenharmony_ci */ 211462306a36Sopenharmony_ci return mem->nid == nid ? 
-EEXIST : 0; 211562306a36Sopenharmony_ci} 211662306a36Sopenharmony_ci 211762306a36Sopenharmony_ci/** 211862306a36Sopenharmony_ci * try_offline_node 211962306a36Sopenharmony_ci * @nid: the node ID 212062306a36Sopenharmony_ci * 212162306a36Sopenharmony_ci * Offline a node if all memory sections and cpus of the node are removed. 212262306a36Sopenharmony_ci * 212362306a36Sopenharmony_ci * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 212462306a36Sopenharmony_ci * and online/offline operations before this call. 212562306a36Sopenharmony_ci */ 212662306a36Sopenharmony_civoid try_offline_node(int nid) 212762306a36Sopenharmony_ci{ 212862306a36Sopenharmony_ci int rc; 212962306a36Sopenharmony_ci 213062306a36Sopenharmony_ci /* 213162306a36Sopenharmony_ci * If the node still spans pages (especially ZONE_DEVICE), don't 213262306a36Sopenharmony_ci * offline it. A node spans memory after move_pfn_range_to_zone(), 213362306a36Sopenharmony_ci * e.g., after the memory block was onlined. 213462306a36Sopenharmony_ci */ 213562306a36Sopenharmony_ci if (node_spanned_pages(nid)) 213662306a36Sopenharmony_ci return; 213762306a36Sopenharmony_ci 213862306a36Sopenharmony_ci /* 213962306a36Sopenharmony_ci * Especially offline memory blocks might not be spanned by the 214062306a36Sopenharmony_ci * node. They will get spanned by the node once they get onlined. 214162306a36Sopenharmony_ci * However, they link to the node in sysfs and can get onlined later. 214262306a36Sopenharmony_ci */ 214362306a36Sopenharmony_ci rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb); 214462306a36Sopenharmony_ci if (rc) 214562306a36Sopenharmony_ci return; 214662306a36Sopenharmony_ci 214762306a36Sopenharmony_ci if (check_cpu_on_node(nid)) 214862306a36Sopenharmony_ci return; 214962306a36Sopenharmony_ci 215062306a36Sopenharmony_ci /* 215162306a36Sopenharmony_ci * all memory/cpu of this node are removed, we can offline this 215262306a36Sopenharmony_ci * node now. 
215362306a36Sopenharmony_ci */ 215462306a36Sopenharmony_ci node_set_offline(nid); 215562306a36Sopenharmony_ci unregister_one_node(nid); 215662306a36Sopenharmony_ci} 215762306a36Sopenharmony_ciEXPORT_SYMBOL(try_offline_node); 215862306a36Sopenharmony_ci 215962306a36Sopenharmony_cistatic int __ref try_remove_memory(u64 start, u64 size) 216062306a36Sopenharmony_ci{ 216162306a36Sopenharmony_ci struct memory_block *mem; 216262306a36Sopenharmony_ci int rc = 0, nid = NUMA_NO_NODE; 216362306a36Sopenharmony_ci struct vmem_altmap *altmap = NULL; 216462306a36Sopenharmony_ci 216562306a36Sopenharmony_ci BUG_ON(check_hotplug_memory_range(start, size)); 216662306a36Sopenharmony_ci 216762306a36Sopenharmony_ci /* 216862306a36Sopenharmony_ci * All memory blocks must be offlined before removing memory. Check 216962306a36Sopenharmony_ci * whether all memory blocks in question are offline and return error 217062306a36Sopenharmony_ci * if this is not the case. 217162306a36Sopenharmony_ci * 217262306a36Sopenharmony_ci * While at it, determine the nid. Note that if we'd have mixed nodes, 217362306a36Sopenharmony_ci * we'd only try to offline the last determined one -- which is good 217462306a36Sopenharmony_ci * enough for the cases we care about. 217562306a36Sopenharmony_ci */ 217662306a36Sopenharmony_ci rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb); 217762306a36Sopenharmony_ci if (rc) 217862306a36Sopenharmony_ci return rc; 217962306a36Sopenharmony_ci 218062306a36Sopenharmony_ci /* 218162306a36Sopenharmony_ci * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in 218262306a36Sopenharmony_ci * the same granularity it was added - a single memory block. 
218362306a36Sopenharmony_ci */ 218462306a36Sopenharmony_ci if (mhp_memmap_on_memory()) { 218562306a36Sopenharmony_ci rc = walk_memory_blocks(start, size, &mem, test_has_altmap_cb); 218662306a36Sopenharmony_ci if (rc) { 218762306a36Sopenharmony_ci if (size != memory_block_size_bytes()) { 218862306a36Sopenharmony_ci pr_warn("Refuse to remove %#llx - %#llx," 218962306a36Sopenharmony_ci "wrong granularity\n", 219062306a36Sopenharmony_ci start, start + size); 219162306a36Sopenharmony_ci return -EINVAL; 219262306a36Sopenharmony_ci } 219362306a36Sopenharmony_ci altmap = mem->altmap; 219462306a36Sopenharmony_ci /* 219562306a36Sopenharmony_ci * Mark altmap NULL so that we can add a debug 219662306a36Sopenharmony_ci * check on memblock free. 219762306a36Sopenharmony_ci */ 219862306a36Sopenharmony_ci mem->altmap = NULL; 219962306a36Sopenharmony_ci } 220062306a36Sopenharmony_ci } 220162306a36Sopenharmony_ci 220262306a36Sopenharmony_ci /* remove memmap entry */ 220362306a36Sopenharmony_ci firmware_map_remove(start, start + size, "System RAM"); 220462306a36Sopenharmony_ci 220562306a36Sopenharmony_ci /* 220662306a36Sopenharmony_ci * Memory block device removal under the device_hotplug_lock is 220762306a36Sopenharmony_ci * a barrier against racing online attempts. 220862306a36Sopenharmony_ci */ 220962306a36Sopenharmony_ci remove_memory_block_devices(start, size); 221062306a36Sopenharmony_ci 221162306a36Sopenharmony_ci mem_hotplug_begin(); 221262306a36Sopenharmony_ci 221362306a36Sopenharmony_ci arch_remove_memory(start, size, altmap); 221462306a36Sopenharmony_ci 221562306a36Sopenharmony_ci /* Verify that all vmemmap pages have actually been freed. 
*/ 221662306a36Sopenharmony_ci if (altmap) { 221762306a36Sopenharmony_ci WARN(altmap->alloc, "Altmap not fully unmapped"); 221862306a36Sopenharmony_ci kfree(altmap); 221962306a36Sopenharmony_ci } 222062306a36Sopenharmony_ci 222162306a36Sopenharmony_ci if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { 222262306a36Sopenharmony_ci memblock_phys_free(start, size); 222362306a36Sopenharmony_ci memblock_remove(start, size); 222462306a36Sopenharmony_ci } 222562306a36Sopenharmony_ci 222662306a36Sopenharmony_ci release_mem_region_adjustable(start, size); 222762306a36Sopenharmony_ci 222862306a36Sopenharmony_ci if (nid != NUMA_NO_NODE) 222962306a36Sopenharmony_ci try_offline_node(nid); 223062306a36Sopenharmony_ci 223162306a36Sopenharmony_ci mem_hotplug_done(); 223262306a36Sopenharmony_ci return 0; 223362306a36Sopenharmony_ci} 223462306a36Sopenharmony_ci 223562306a36Sopenharmony_ci/** 223662306a36Sopenharmony_ci * __remove_memory - Remove memory if every memory block is offline 223762306a36Sopenharmony_ci * @start: physical address of the region to remove 223862306a36Sopenharmony_ci * @size: size of the region to remove 223962306a36Sopenharmony_ci * 224062306a36Sopenharmony_ci * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 224162306a36Sopenharmony_ci * and online/offline operations before this call, as required by 224262306a36Sopenharmony_ci * try_offline_node(). 
224362306a36Sopenharmony_ci */ 224462306a36Sopenharmony_civoid __remove_memory(u64 start, u64 size) 224562306a36Sopenharmony_ci{ 224662306a36Sopenharmony_ci 224762306a36Sopenharmony_ci /* 224862306a36Sopenharmony_ci * trigger BUG() if some memory is not offlined prior to calling this 224962306a36Sopenharmony_ci * function 225062306a36Sopenharmony_ci */ 225162306a36Sopenharmony_ci if (try_remove_memory(start, size)) 225262306a36Sopenharmony_ci BUG(); 225362306a36Sopenharmony_ci} 225462306a36Sopenharmony_ci 225562306a36Sopenharmony_ci/* 225662306a36Sopenharmony_ci * Remove memory if every memory block is offline, otherwise return -EBUSY is 225762306a36Sopenharmony_ci * some memory is not offline 225862306a36Sopenharmony_ci */ 225962306a36Sopenharmony_ciint remove_memory(u64 start, u64 size) 226062306a36Sopenharmony_ci{ 226162306a36Sopenharmony_ci int rc; 226262306a36Sopenharmony_ci 226362306a36Sopenharmony_ci lock_device_hotplug(); 226462306a36Sopenharmony_ci rc = try_remove_memory(start, size); 226562306a36Sopenharmony_ci unlock_device_hotplug(); 226662306a36Sopenharmony_ci 226762306a36Sopenharmony_ci return rc; 226862306a36Sopenharmony_ci} 226962306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(remove_memory); 227062306a36Sopenharmony_ci 227162306a36Sopenharmony_cistatic int try_offline_memory_block(struct memory_block *mem, void *arg) 227262306a36Sopenharmony_ci{ 227362306a36Sopenharmony_ci uint8_t online_type = MMOP_ONLINE_KERNEL; 227462306a36Sopenharmony_ci uint8_t **online_types = arg; 227562306a36Sopenharmony_ci struct page *page; 227662306a36Sopenharmony_ci int rc; 227762306a36Sopenharmony_ci 227862306a36Sopenharmony_ci /* 227962306a36Sopenharmony_ci * Sense the online_type via the zone of the memory block. Offlining 228062306a36Sopenharmony_ci * with multiple zones within one memory block will be rejected 228162306a36Sopenharmony_ci * by offlining code ... so we don't care about that. 
228262306a36Sopenharmony_ci */ 228362306a36Sopenharmony_ci page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr)); 228462306a36Sopenharmony_ci if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE) 228562306a36Sopenharmony_ci online_type = MMOP_ONLINE_MOVABLE; 228662306a36Sopenharmony_ci 228762306a36Sopenharmony_ci rc = device_offline(&mem->dev); 228862306a36Sopenharmony_ci /* 228962306a36Sopenharmony_ci * Default is MMOP_OFFLINE - change it only if offlining succeeded, 229062306a36Sopenharmony_ci * so try_reonline_memory_block() can do the right thing. 229162306a36Sopenharmony_ci */ 229262306a36Sopenharmony_ci if (!rc) 229362306a36Sopenharmony_ci **online_types = online_type; 229462306a36Sopenharmony_ci 229562306a36Sopenharmony_ci (*online_types)++; 229662306a36Sopenharmony_ci /* Ignore if already offline. */ 229762306a36Sopenharmony_ci return rc < 0 ? rc : 0; 229862306a36Sopenharmony_ci} 229962306a36Sopenharmony_ci 230062306a36Sopenharmony_cistatic int try_reonline_memory_block(struct memory_block *mem, void *arg) 230162306a36Sopenharmony_ci{ 230262306a36Sopenharmony_ci uint8_t **online_types = arg; 230362306a36Sopenharmony_ci int rc; 230462306a36Sopenharmony_ci 230562306a36Sopenharmony_ci if (**online_types != MMOP_OFFLINE) { 230662306a36Sopenharmony_ci mem->online_type = **online_types; 230762306a36Sopenharmony_ci rc = device_online(&mem->dev); 230862306a36Sopenharmony_ci if (rc < 0) 230962306a36Sopenharmony_ci pr_warn("%s: Failed to re-online memory: %d", 231062306a36Sopenharmony_ci __func__, rc); 231162306a36Sopenharmony_ci } 231262306a36Sopenharmony_ci 231362306a36Sopenharmony_ci /* Continue processing all remaining memory blocks. */ 231462306a36Sopenharmony_ci (*online_types)++; 231562306a36Sopenharmony_ci return 0; 231662306a36Sopenharmony_ci} 231762306a36Sopenharmony_ci 231862306a36Sopenharmony_ci/* 231962306a36Sopenharmony_ci * Try to offline and remove memory. 
Might take a long time to finish in case 232062306a36Sopenharmony_ci * memory is still in use. Primarily useful for memory devices that logically 232162306a36Sopenharmony_ci * unplugged all memory (so it's no longer in use) and want to offline + remove 232262306a36Sopenharmony_ci * that memory. 232362306a36Sopenharmony_ci */ 232462306a36Sopenharmony_ciint offline_and_remove_memory(u64 start, u64 size) 232562306a36Sopenharmony_ci{ 232662306a36Sopenharmony_ci const unsigned long mb_count = size / memory_block_size_bytes(); 232762306a36Sopenharmony_ci uint8_t *online_types, *tmp; 232862306a36Sopenharmony_ci int rc; 232962306a36Sopenharmony_ci 233062306a36Sopenharmony_ci if (!IS_ALIGNED(start, memory_block_size_bytes()) || 233162306a36Sopenharmony_ci !IS_ALIGNED(size, memory_block_size_bytes()) || !size) 233262306a36Sopenharmony_ci return -EINVAL; 233362306a36Sopenharmony_ci 233462306a36Sopenharmony_ci /* 233562306a36Sopenharmony_ci * We'll remember the old online type of each memory block, so we can 233662306a36Sopenharmony_ci * try to revert whatever we did when offlining one memory block fails 233762306a36Sopenharmony_ci * after offlining some others succeeded. 233862306a36Sopenharmony_ci */ 233962306a36Sopenharmony_ci online_types = kmalloc_array(mb_count, sizeof(*online_types), 234062306a36Sopenharmony_ci GFP_KERNEL); 234162306a36Sopenharmony_ci if (!online_types) 234262306a36Sopenharmony_ci return -ENOMEM; 234362306a36Sopenharmony_ci /* 234462306a36Sopenharmony_ci * Initialize all states to MMOP_OFFLINE, so when we abort processing in 234562306a36Sopenharmony_ci * try_offline_memory_block(), we'll skip all unprocessed blocks in 234662306a36Sopenharmony_ci * try_reonline_memory_block(). 
234762306a36Sopenharmony_ci */ 234862306a36Sopenharmony_ci memset(online_types, MMOP_OFFLINE, mb_count); 234962306a36Sopenharmony_ci 235062306a36Sopenharmony_ci lock_device_hotplug(); 235162306a36Sopenharmony_ci 235262306a36Sopenharmony_ci tmp = online_types; 235362306a36Sopenharmony_ci rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block); 235462306a36Sopenharmony_ci 235562306a36Sopenharmony_ci /* 235662306a36Sopenharmony_ci * In case we succeeded to offline all memory, remove it. 235762306a36Sopenharmony_ci * This cannot fail as it cannot get onlined in the meantime. 235862306a36Sopenharmony_ci */ 235962306a36Sopenharmony_ci if (!rc) { 236062306a36Sopenharmony_ci rc = try_remove_memory(start, size); 236162306a36Sopenharmony_ci if (rc) 236262306a36Sopenharmony_ci pr_err("%s: Failed to remove memory: %d", __func__, rc); 236362306a36Sopenharmony_ci } 236462306a36Sopenharmony_ci 236562306a36Sopenharmony_ci /* 236662306a36Sopenharmony_ci * Rollback what we did. While memory onlining might theoretically fail 236762306a36Sopenharmony_ci * (nacked by a notifier), it barely ever happens. 236862306a36Sopenharmony_ci */ 236962306a36Sopenharmony_ci if (rc) { 237062306a36Sopenharmony_ci tmp = online_types; 237162306a36Sopenharmony_ci walk_memory_blocks(start, size, &tmp, 237262306a36Sopenharmony_ci try_reonline_memory_block); 237362306a36Sopenharmony_ci } 237462306a36Sopenharmony_ci unlock_device_hotplug(); 237562306a36Sopenharmony_ci 237662306a36Sopenharmony_ci kfree(online_types); 237762306a36Sopenharmony_ci return rc; 237862306a36Sopenharmony_ci} 237962306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(offline_and_remove_memory); 238062306a36Sopenharmony_ci#endif /* CONFIG_MEMORY_HOTREMOVE */ 2381