162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * NUMA emulation 462306a36Sopenharmony_ci */ 562306a36Sopenharmony_ci#include <linux/kernel.h> 662306a36Sopenharmony_ci#include <linux/errno.h> 762306a36Sopenharmony_ci#include <linux/topology.h> 862306a36Sopenharmony_ci#include <linux/memblock.h> 962306a36Sopenharmony_ci#include <asm/dma.h> 1062306a36Sopenharmony_ci 1162306a36Sopenharmony_ci#include "numa_internal.h" 1262306a36Sopenharmony_ci 1362306a36Sopenharmony_cistatic int emu_nid_to_phys[MAX_NUMNODES]; 1462306a36Sopenharmony_cistatic char *emu_cmdline __initdata; 1562306a36Sopenharmony_ci 1662306a36Sopenharmony_ciint __init numa_emu_cmdline(char *str) 1762306a36Sopenharmony_ci{ 1862306a36Sopenharmony_ci emu_cmdline = str; 1962306a36Sopenharmony_ci return 0; 2062306a36Sopenharmony_ci} 2162306a36Sopenharmony_ci 2262306a36Sopenharmony_cistatic int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) 2362306a36Sopenharmony_ci{ 2462306a36Sopenharmony_ci int i; 2562306a36Sopenharmony_ci 2662306a36Sopenharmony_ci for (i = 0; i < mi->nr_blks; i++) 2762306a36Sopenharmony_ci if (mi->blk[i].nid == nid) 2862306a36Sopenharmony_ci return i; 2962306a36Sopenharmony_ci return -ENOENT; 3062306a36Sopenharmony_ci} 3162306a36Sopenharmony_ci 3262306a36Sopenharmony_cistatic u64 __init mem_hole_size(u64 start, u64 end) 3362306a36Sopenharmony_ci{ 3462306a36Sopenharmony_ci unsigned long start_pfn = PFN_UP(start); 3562306a36Sopenharmony_ci unsigned long end_pfn = PFN_DOWN(end); 3662306a36Sopenharmony_ci 3762306a36Sopenharmony_ci if (start_pfn < end_pfn) 3862306a36Sopenharmony_ci return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); 3962306a36Sopenharmony_ci return 0; 4062306a36Sopenharmony_ci} 4162306a36Sopenharmony_ci 4262306a36Sopenharmony_ci/* 4362306a36Sopenharmony_ci * Sets up nid to range from @start to @end. The return value is -errno if 4462306a36Sopenharmony_ci * something went wrong, 0 otherwise. 4562306a36Sopenharmony_ci */ 4662306a36Sopenharmony_cistatic int __init emu_setup_memblk(struct numa_meminfo *ei, 4762306a36Sopenharmony_ci struct numa_meminfo *pi, 4862306a36Sopenharmony_ci int nid, int phys_blk, u64 size) 4962306a36Sopenharmony_ci{ 5062306a36Sopenharmony_ci struct numa_memblk *eb = &ei->blk[ei->nr_blks]; 5162306a36Sopenharmony_ci struct numa_memblk *pb = &pi->blk[phys_blk]; 5262306a36Sopenharmony_ci 5362306a36Sopenharmony_ci if (ei->nr_blks >= NR_NODE_MEMBLKS) { 5462306a36Sopenharmony_ci pr_err("NUMA: Too many emulated memblks, failing emulation\n"); 5562306a36Sopenharmony_ci return -EINVAL; 5662306a36Sopenharmony_ci } 5762306a36Sopenharmony_ci 5862306a36Sopenharmony_ci ei->nr_blks++; 5962306a36Sopenharmony_ci eb->start = pb->start; 6062306a36Sopenharmony_ci eb->end = pb->start + size; 6162306a36Sopenharmony_ci eb->nid = nid; 6262306a36Sopenharmony_ci 6362306a36Sopenharmony_ci if (emu_nid_to_phys[nid] == NUMA_NO_NODE) 6462306a36Sopenharmony_ci emu_nid_to_phys[nid] = pb->nid; 6562306a36Sopenharmony_ci 6662306a36Sopenharmony_ci pb->start += size; 6762306a36Sopenharmony_ci if (pb->start >= pb->end) { 6862306a36Sopenharmony_ci WARN_ON_ONCE(pb->start > pb->end); 6962306a36Sopenharmony_ci numa_remove_memblk_from(phys_blk, pi); 7062306a36Sopenharmony_ci } 7162306a36Sopenharmony_ci 7262306a36Sopenharmony_ci printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", 7362306a36Sopenharmony_ci nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); 7462306a36Sopenharmony_ci return 0; 7562306a36Sopenharmony_ci} 7662306a36Sopenharmony_ci 7762306a36Sopenharmony_ci/* 7862306a36Sopenharmony_ci * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr 7962306a36Sopenharmony_ci * to max_addr. 8062306a36Sopenharmony_ci * 8162306a36Sopenharmony_ci * Returns zero on success or negative on error. 8262306a36Sopenharmony_ci */ 8362306a36Sopenharmony_cistatic int __init split_nodes_interleave(struct numa_meminfo *ei, 8462306a36Sopenharmony_ci struct numa_meminfo *pi, 8562306a36Sopenharmony_ci u64 addr, u64 max_addr, int nr_nodes) 8662306a36Sopenharmony_ci{ 8762306a36Sopenharmony_ci nodemask_t physnode_mask = numa_nodes_parsed; 8862306a36Sopenharmony_ci u64 size; 8962306a36Sopenharmony_ci int big; 9062306a36Sopenharmony_ci int nid = 0; 9162306a36Sopenharmony_ci int i, ret; 9262306a36Sopenharmony_ci 9362306a36Sopenharmony_ci if (nr_nodes <= 0) 9462306a36Sopenharmony_ci return -1; 9562306a36Sopenharmony_ci if (nr_nodes > MAX_NUMNODES) { 9662306a36Sopenharmony_ci pr_info("numa=fake=%d too large, reducing to %d\n", 9762306a36Sopenharmony_ci nr_nodes, MAX_NUMNODES); 9862306a36Sopenharmony_ci nr_nodes = MAX_NUMNODES; 9962306a36Sopenharmony_ci } 10062306a36Sopenharmony_ci 10162306a36Sopenharmony_ci /* 10262306a36Sopenharmony_ci * Calculate target node size. x86_32 freaks on __udivdi3() so do 10362306a36Sopenharmony_ci * the division in ulong number of pages and convert back. 10462306a36Sopenharmony_ci */ 10562306a36Sopenharmony_ci size = max_addr - addr - mem_hole_size(addr, max_addr); 10662306a36Sopenharmony_ci size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); 10762306a36Sopenharmony_ci 10862306a36Sopenharmony_ci /* 10962306a36Sopenharmony_ci * Calculate the number of big nodes that can be allocated as a result 11062306a36Sopenharmony_ci * of consolidating the remainder. 11162306a36Sopenharmony_ci */ 11262306a36Sopenharmony_ci big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / 11362306a36Sopenharmony_ci FAKE_NODE_MIN_SIZE; 11462306a36Sopenharmony_ci 11562306a36Sopenharmony_ci size &= FAKE_NODE_MIN_HASH_MASK; 11662306a36Sopenharmony_ci if (!size) { 11762306a36Sopenharmony_ci pr_err("Not enough memory for each node. " 11862306a36Sopenharmony_ci "NUMA emulation disabled.\n"); 11962306a36Sopenharmony_ci return -1; 12062306a36Sopenharmony_ci } 12162306a36Sopenharmony_ci 12262306a36Sopenharmony_ci /* 12362306a36Sopenharmony_ci * Continue to fill physical nodes with fake nodes until there is no 12462306a36Sopenharmony_ci * memory left on any of them. 12562306a36Sopenharmony_ci */ 12662306a36Sopenharmony_ci while (!nodes_empty(physnode_mask)) { 12762306a36Sopenharmony_ci for_each_node_mask(i, physnode_mask) { 12862306a36Sopenharmony_ci u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); 12962306a36Sopenharmony_ci u64 start, limit, end; 13062306a36Sopenharmony_ci int phys_blk; 13162306a36Sopenharmony_ci 13262306a36Sopenharmony_ci phys_blk = emu_find_memblk_by_nid(i, pi); 13362306a36Sopenharmony_ci if (phys_blk < 0) { 13462306a36Sopenharmony_ci node_clear(i, physnode_mask); 13562306a36Sopenharmony_ci continue; 13662306a36Sopenharmony_ci } 13762306a36Sopenharmony_ci start = pi->blk[phys_blk].start; 13862306a36Sopenharmony_ci limit = pi->blk[phys_blk].end; 13962306a36Sopenharmony_ci end = start + size; 14062306a36Sopenharmony_ci 14162306a36Sopenharmony_ci if (nid < big) 14262306a36Sopenharmony_ci end += FAKE_NODE_MIN_SIZE; 14362306a36Sopenharmony_ci 14462306a36Sopenharmony_ci /* 14562306a36Sopenharmony_ci * Continue to add memory to this fake node if its 14662306a36Sopenharmony_ci * non-reserved memory is less than the per-node size. 14762306a36Sopenharmony_ci */ 14862306a36Sopenharmony_ci while (end - start - mem_hole_size(start, end) < size) { 14962306a36Sopenharmony_ci end += FAKE_NODE_MIN_SIZE; 15062306a36Sopenharmony_ci if (end > limit) { 15162306a36Sopenharmony_ci end = limit; 15262306a36Sopenharmony_ci break; 15362306a36Sopenharmony_ci } 15462306a36Sopenharmony_ci } 15562306a36Sopenharmony_ci 15662306a36Sopenharmony_ci /* 15762306a36Sopenharmony_ci * If there won't be at least FAKE_NODE_MIN_SIZE of 15862306a36Sopenharmony_ci * non-reserved memory in ZONE_DMA32 for the next node, 15962306a36Sopenharmony_ci * this one must extend to the boundary. 16062306a36Sopenharmony_ci */ 16162306a36Sopenharmony_ci if (end < dma32_end && dma32_end - end - 16262306a36Sopenharmony_ci mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 16362306a36Sopenharmony_ci end = dma32_end; 16462306a36Sopenharmony_ci 16562306a36Sopenharmony_ci /* 16662306a36Sopenharmony_ci * If there won't be enough non-reserved memory for the 16762306a36Sopenharmony_ci * next node, this one must extend to the end of the 16862306a36Sopenharmony_ci * physical node. 16962306a36Sopenharmony_ci */ 17062306a36Sopenharmony_ci if (limit - end - mem_hole_size(end, limit) < size) 17162306a36Sopenharmony_ci end = limit; 17262306a36Sopenharmony_ci 17362306a36Sopenharmony_ci ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, 17462306a36Sopenharmony_ci phys_blk, 17562306a36Sopenharmony_ci min(end, limit) - start); 17662306a36Sopenharmony_ci if (ret < 0) 17762306a36Sopenharmony_ci return ret; 17862306a36Sopenharmony_ci } 17962306a36Sopenharmony_ci } 18062306a36Sopenharmony_ci return 0; 18162306a36Sopenharmony_ci} 18262306a36Sopenharmony_ci 18362306a36Sopenharmony_ci/* 18462306a36Sopenharmony_ci * Returns the end address of a node so that there is at least `size' amount of 18562306a36Sopenharmony_ci * non-reserved memory or `max_addr' is reached. 18662306a36Sopenharmony_ci */ 18762306a36Sopenharmony_cistatic u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) 18862306a36Sopenharmony_ci{ 18962306a36Sopenharmony_ci u64 end = start + size; 19062306a36Sopenharmony_ci 19162306a36Sopenharmony_ci while (end - start - mem_hole_size(start, end) < size) { 19262306a36Sopenharmony_ci end += FAKE_NODE_MIN_SIZE; 19362306a36Sopenharmony_ci if (end > max_addr) { 19462306a36Sopenharmony_ci end = max_addr; 19562306a36Sopenharmony_ci break; 19662306a36Sopenharmony_ci } 19762306a36Sopenharmony_ci } 19862306a36Sopenharmony_ci return end; 19962306a36Sopenharmony_ci} 20062306a36Sopenharmony_ci 20162306a36Sopenharmony_cistatic u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes) 20262306a36Sopenharmony_ci{ 20362306a36Sopenharmony_ci unsigned long max_pfn = PHYS_PFN(max_addr); 20462306a36Sopenharmony_ci unsigned long base_pfn = PHYS_PFN(base); 20562306a36Sopenharmony_ci unsigned long hole_pfns = PHYS_PFN(hole); 20662306a36Sopenharmony_ci 20762306a36Sopenharmony_ci return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes); 20862306a36Sopenharmony_ci} 20962306a36Sopenharmony_ci 21062306a36Sopenharmony_ci/* 21162306a36Sopenharmony_ci * Sets up fake nodes of `size' interleaved over physical nodes ranging from 21262306a36Sopenharmony_ci * `addr' to `max_addr'. 21362306a36Sopenharmony_ci * 21462306a36Sopenharmony_ci * Returns zero on success or negative on error. 21562306a36Sopenharmony_ci */ 21662306a36Sopenharmony_cistatic int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, 21762306a36Sopenharmony_ci struct numa_meminfo *pi, 21862306a36Sopenharmony_ci u64 addr, u64 max_addr, u64 size, 21962306a36Sopenharmony_ci int nr_nodes, struct numa_memblk *pblk, 22062306a36Sopenharmony_ci int nid) 22162306a36Sopenharmony_ci{ 22262306a36Sopenharmony_ci nodemask_t physnode_mask = numa_nodes_parsed; 22362306a36Sopenharmony_ci int i, ret, uniform = 0; 22462306a36Sopenharmony_ci u64 min_size; 22562306a36Sopenharmony_ci 22662306a36Sopenharmony_ci if ((!size && !nr_nodes) || (nr_nodes && !pblk)) 22762306a36Sopenharmony_ci return -1; 22862306a36Sopenharmony_ci 22962306a36Sopenharmony_ci /* 23062306a36Sopenharmony_ci * In the 'uniform' case split the passed in physical node by 23162306a36Sopenharmony_ci * nr_nodes, in the non-uniform case, ignore the passed in 23262306a36Sopenharmony_ci * physical block and try to create nodes of at least size 23362306a36Sopenharmony_ci * @size. 23462306a36Sopenharmony_ci * 23562306a36Sopenharmony_ci * In the uniform case, split the nodes strictly by physical 23662306a36Sopenharmony_ci * capacity, i.e. ignore holes. In the non-uniform case account 23762306a36Sopenharmony_ci * for holes and treat @size as a minimum floor. 23862306a36Sopenharmony_ci */ 23962306a36Sopenharmony_ci if (!nr_nodes) 24062306a36Sopenharmony_ci nr_nodes = MAX_NUMNODES; 24162306a36Sopenharmony_ci else { 24262306a36Sopenharmony_ci nodes_clear(physnode_mask); 24362306a36Sopenharmony_ci node_set(pblk->nid, physnode_mask); 24462306a36Sopenharmony_ci uniform = 1; 24562306a36Sopenharmony_ci } 24662306a36Sopenharmony_ci 24762306a36Sopenharmony_ci if (uniform) { 24862306a36Sopenharmony_ci min_size = uniform_size(max_addr, addr, 0, nr_nodes); 24962306a36Sopenharmony_ci size = min_size; 25062306a36Sopenharmony_ci } else { 25162306a36Sopenharmony_ci /* 25262306a36Sopenharmony_ci * The limit on emulated nodes is MAX_NUMNODES, so the 25362306a36Sopenharmony_ci * size per node is increased accordingly if the 25462306a36Sopenharmony_ci * requested size is too small. This creates a uniform 25562306a36Sopenharmony_ci * distribution of node sizes across the entire machine 25662306a36Sopenharmony_ci * (but not necessarily over physical nodes). 25762306a36Sopenharmony_ci */ 25862306a36Sopenharmony_ci min_size = uniform_size(max_addr, addr, 25962306a36Sopenharmony_ci mem_hole_size(addr, max_addr), nr_nodes); 26062306a36Sopenharmony_ci } 26162306a36Sopenharmony_ci min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); 26262306a36Sopenharmony_ci if (size < min_size) { 26362306a36Sopenharmony_ci pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", 26462306a36Sopenharmony_ci size >> 20, min_size >> 20); 26562306a36Sopenharmony_ci size = min_size; 26662306a36Sopenharmony_ci } 26762306a36Sopenharmony_ci size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); 26862306a36Sopenharmony_ci 26962306a36Sopenharmony_ci /* 27062306a36Sopenharmony_ci * Fill physical nodes with fake nodes of size until there is no memory 27162306a36Sopenharmony_ci * left on any of them. 27262306a36Sopenharmony_ci */ 27362306a36Sopenharmony_ci while (!nodes_empty(physnode_mask)) { 27462306a36Sopenharmony_ci for_each_node_mask(i, physnode_mask) { 27562306a36Sopenharmony_ci u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); 27662306a36Sopenharmony_ci u64 start, limit, end; 27762306a36Sopenharmony_ci int phys_blk; 27862306a36Sopenharmony_ci 27962306a36Sopenharmony_ci phys_blk = emu_find_memblk_by_nid(i, pi); 28062306a36Sopenharmony_ci if (phys_blk < 0) { 28162306a36Sopenharmony_ci node_clear(i, physnode_mask); 28262306a36Sopenharmony_ci continue; 28362306a36Sopenharmony_ci } 28462306a36Sopenharmony_ci 28562306a36Sopenharmony_ci start = pi->blk[phys_blk].start; 28662306a36Sopenharmony_ci limit = pi->blk[phys_blk].end; 28762306a36Sopenharmony_ci 28862306a36Sopenharmony_ci if (uniform) 28962306a36Sopenharmony_ci end = start + size; 29062306a36Sopenharmony_ci else 29162306a36Sopenharmony_ci end = find_end_of_node(start, limit, size); 29262306a36Sopenharmony_ci /* 29362306a36Sopenharmony_ci * If there won't be at least FAKE_NODE_MIN_SIZE of 29462306a36Sopenharmony_ci * non-reserved memory in ZONE_DMA32 for the next node, 29562306a36Sopenharmony_ci * this one must extend to the boundary. 29662306a36Sopenharmony_ci */ 29762306a36Sopenharmony_ci if (end < dma32_end && dma32_end - end - 29862306a36Sopenharmony_ci mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 29962306a36Sopenharmony_ci end = dma32_end; 30062306a36Sopenharmony_ci 30162306a36Sopenharmony_ci /* 30262306a36Sopenharmony_ci * If there won't be enough non-reserved memory for the 30362306a36Sopenharmony_ci * next node, this one must extend to the end of the 30462306a36Sopenharmony_ci * physical node. 30562306a36Sopenharmony_ci */ 30662306a36Sopenharmony_ci if ((limit - end - mem_hole_size(end, limit) < size) 30762306a36Sopenharmony_ci && !uniform) 30862306a36Sopenharmony_ci end = limit; 30962306a36Sopenharmony_ci 31062306a36Sopenharmony_ci ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, 31162306a36Sopenharmony_ci phys_blk, 31262306a36Sopenharmony_ci min(end, limit) - start); 31362306a36Sopenharmony_ci if (ret < 0) 31462306a36Sopenharmony_ci return ret; 31562306a36Sopenharmony_ci } 31662306a36Sopenharmony_ci } 31762306a36Sopenharmony_ci return nid; 31862306a36Sopenharmony_ci} 31962306a36Sopenharmony_ci 32062306a36Sopenharmony_cistatic int __init split_nodes_size_interleave(struct numa_meminfo *ei, 32162306a36Sopenharmony_ci struct numa_meminfo *pi, 32262306a36Sopenharmony_ci u64 addr, u64 max_addr, u64 size) 32362306a36Sopenharmony_ci{ 32462306a36Sopenharmony_ci return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, 32562306a36Sopenharmony_ci 0, NULL, 0); 32662306a36Sopenharmony_ci} 32762306a36Sopenharmony_ci 32862306a36Sopenharmony_cistatic int __init setup_emu2phys_nid(int *dfl_phys_nid) 32962306a36Sopenharmony_ci{ 33062306a36Sopenharmony_ci int i, max_emu_nid = 0; 33162306a36Sopenharmony_ci 33262306a36Sopenharmony_ci *dfl_phys_nid = NUMA_NO_NODE; 33362306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { 33462306a36Sopenharmony_ci if (emu_nid_to_phys[i] != NUMA_NO_NODE) { 33562306a36Sopenharmony_ci max_emu_nid = i; 33662306a36Sopenharmony_ci if (*dfl_phys_nid == NUMA_NO_NODE) 33762306a36Sopenharmony_ci *dfl_phys_nid = emu_nid_to_phys[i]; 33862306a36Sopenharmony_ci } 33962306a36Sopenharmony_ci } 34062306a36Sopenharmony_ci 34162306a36Sopenharmony_ci return max_emu_nid; 34262306a36Sopenharmony_ci} 34362306a36Sopenharmony_ci 34462306a36Sopenharmony_ci/** 34562306a36Sopenharmony_ci * numa_emulation - Emulate NUMA nodes 34662306a36Sopenharmony_ci * @numa_meminfo: NUMA configuration to massage 34762306a36Sopenharmony_ci * @numa_dist_cnt: The size of the physical NUMA distance table 34862306a36Sopenharmony_ci * 34962306a36Sopenharmony_ci * Emulate NUMA nodes according to the numa=fake kernel parameter. 35062306a36Sopenharmony_ci * @numa_meminfo contains the physical memory configuration and is modified 35162306a36Sopenharmony_ci * to reflect the emulated configuration on success. @numa_dist_cnt is 35262306a36Sopenharmony_ci * used to determine the size of the physical distance table. 35362306a36Sopenharmony_ci * 35462306a36Sopenharmony_ci * On success, the following modifications are made. 35562306a36Sopenharmony_ci * 35662306a36Sopenharmony_ci * - @numa_meminfo is updated to reflect the emulated nodes. 35762306a36Sopenharmony_ci * 35862306a36Sopenharmony_ci * - __apicid_to_node[] is updated such that APIC IDs are mapped to the 35962306a36Sopenharmony_ci * emulated nodes. 36062306a36Sopenharmony_ci * 36162306a36Sopenharmony_ci * - NUMA distance table is rebuilt to represent distances between emulated 36262306a36Sopenharmony_ci * nodes. The distances are determined considering how emulated nodes 36362306a36Sopenharmony_ci * are mapped to physical nodes and match the actual distances. 36462306a36Sopenharmony_ci * 36562306a36Sopenharmony_ci * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical 36662306a36Sopenharmony_ci * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). 36762306a36Sopenharmony_ci * 36862306a36Sopenharmony_ci * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with 36962306a36Sopenharmony_ci * identity mapping and no other modification is made. 37062306a36Sopenharmony_ci */ 37162306a36Sopenharmony_civoid __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) 37262306a36Sopenharmony_ci{ 37362306a36Sopenharmony_ci static struct numa_meminfo ei __initdata; 37462306a36Sopenharmony_ci static struct numa_meminfo pi __initdata; 37562306a36Sopenharmony_ci const u64 max_addr = PFN_PHYS(max_pfn); 37662306a36Sopenharmony_ci u8 *phys_dist = NULL; 37762306a36Sopenharmony_ci size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); 37862306a36Sopenharmony_ci int max_emu_nid, dfl_phys_nid; 37962306a36Sopenharmony_ci int i, j, ret; 38062306a36Sopenharmony_ci 38162306a36Sopenharmony_ci if (!emu_cmdline) 38262306a36Sopenharmony_ci goto no_emu; 38362306a36Sopenharmony_ci 38462306a36Sopenharmony_ci memset(&ei, 0, sizeof(ei)); 38562306a36Sopenharmony_ci pi = *numa_meminfo; 38662306a36Sopenharmony_ci 38762306a36Sopenharmony_ci for (i = 0; i < MAX_NUMNODES; i++) 38862306a36Sopenharmony_ci emu_nid_to_phys[i] = NUMA_NO_NODE; 38962306a36Sopenharmony_ci 39062306a36Sopenharmony_ci /* 39162306a36Sopenharmony_ci * If the numa=fake command-line contains a 'M' or 'G', it represents 39262306a36Sopenharmony_ci * the fixed node size. Otherwise, if it is just a single number N, 39362306a36Sopenharmony_ci * split the system RAM into N fake nodes. 39462306a36Sopenharmony_ci */ 39562306a36Sopenharmony_ci if (strchr(emu_cmdline, 'U')) { 39662306a36Sopenharmony_ci nodemask_t physnode_mask = numa_nodes_parsed; 39762306a36Sopenharmony_ci unsigned long n; 39862306a36Sopenharmony_ci int nid = 0; 39962306a36Sopenharmony_ci 40062306a36Sopenharmony_ci n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); 40162306a36Sopenharmony_ci ret = -1; 40262306a36Sopenharmony_ci for_each_node_mask(i, physnode_mask) { 40362306a36Sopenharmony_ci /* 40462306a36Sopenharmony_ci * The reason we pass in blk[0] is due to 40562306a36Sopenharmony_ci * numa_remove_memblk_from() called by 40662306a36Sopenharmony_ci * emu_setup_memblk() will delete entry 0 40762306a36Sopenharmony_ci * and then move everything else up in the pi.blk 40862306a36Sopenharmony_ci * array. Therefore we should always be looking 40962306a36Sopenharmony_ci * at blk[0]. 41062306a36Sopenharmony_ci */ 41162306a36Sopenharmony_ci ret = split_nodes_size_interleave_uniform(&ei, &pi, 41262306a36Sopenharmony_ci pi.blk[0].start, pi.blk[0].end, 0, 41362306a36Sopenharmony_ci n, &pi.blk[0], nid); 41462306a36Sopenharmony_ci if (ret < 0) 41562306a36Sopenharmony_ci break; 41662306a36Sopenharmony_ci if (ret < n) { 41762306a36Sopenharmony_ci pr_info("%s: phys: %d only got %d of %ld nodes, failing\n", 41862306a36Sopenharmony_ci __func__, i, ret, n); 41962306a36Sopenharmony_ci ret = -1; 42062306a36Sopenharmony_ci break; 42162306a36Sopenharmony_ci } 42262306a36Sopenharmony_ci nid = ret; 42362306a36Sopenharmony_ci } 42462306a36Sopenharmony_ci } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { 42562306a36Sopenharmony_ci u64 size; 42662306a36Sopenharmony_ci 42762306a36Sopenharmony_ci size = memparse(emu_cmdline, &emu_cmdline); 42862306a36Sopenharmony_ci ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); 42962306a36Sopenharmony_ci } else { 43062306a36Sopenharmony_ci unsigned long n; 43162306a36Sopenharmony_ci 43262306a36Sopenharmony_ci n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); 43362306a36Sopenharmony_ci ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); 43462306a36Sopenharmony_ci } 43562306a36Sopenharmony_ci if (*emu_cmdline == ':') 43662306a36Sopenharmony_ci emu_cmdline++; 43762306a36Sopenharmony_ci 43862306a36Sopenharmony_ci if (ret < 0) 43962306a36Sopenharmony_ci goto no_emu; 44062306a36Sopenharmony_ci 44162306a36Sopenharmony_ci if (numa_cleanup_meminfo(&ei) < 0) { 44262306a36Sopenharmony_ci pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); 44362306a36Sopenharmony_ci goto no_emu; 44462306a36Sopenharmony_ci } 44562306a36Sopenharmony_ci 44662306a36Sopenharmony_ci /* copy the physical distance table */ 44762306a36Sopenharmony_ci if (numa_dist_cnt) { 44862306a36Sopenharmony_ci u64 phys; 44962306a36Sopenharmony_ci 45062306a36Sopenharmony_ci phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0, 45162306a36Sopenharmony_ci PFN_PHYS(max_pfn_mapped)); 45262306a36Sopenharmony_ci if (!phys) { 45362306a36Sopenharmony_ci pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); 45462306a36Sopenharmony_ci goto no_emu; 45562306a36Sopenharmony_ci } 45662306a36Sopenharmony_ci phys_dist = __va(phys); 45762306a36Sopenharmony_ci 45862306a36Sopenharmony_ci for (i = 0; i < numa_dist_cnt; i++) 45962306a36Sopenharmony_ci for (j = 0; j < numa_dist_cnt; j++) 46062306a36Sopenharmony_ci phys_dist[i * numa_dist_cnt + j] = 46162306a36Sopenharmony_ci node_distance(i, j); 46262306a36Sopenharmony_ci } 46362306a36Sopenharmony_ci 46462306a36Sopenharmony_ci /* 46562306a36Sopenharmony_ci * Determine the max emulated nid and the default phys nid to use 46662306a36Sopenharmony_ci * for unmapped nodes. 46762306a36Sopenharmony_ci */ 46862306a36Sopenharmony_ci max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); 46962306a36Sopenharmony_ci 47062306a36Sopenharmony_ci /* commit */ 47162306a36Sopenharmony_ci *numa_meminfo = ei; 47262306a36Sopenharmony_ci 47362306a36Sopenharmony_ci /* Make sure numa_nodes_parsed only contains emulated nodes */ 47462306a36Sopenharmony_ci nodes_clear(numa_nodes_parsed); 47562306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(ei.blk); i++) 47662306a36Sopenharmony_ci if (ei.blk[i].start != ei.blk[i].end && 47762306a36Sopenharmony_ci ei.blk[i].nid != NUMA_NO_NODE) 47862306a36Sopenharmony_ci node_set(ei.blk[i].nid, numa_nodes_parsed); 47962306a36Sopenharmony_ci 48062306a36Sopenharmony_ci /* 48162306a36Sopenharmony_ci * Transform __apicid_to_node table to use emulated nids by 48262306a36Sopenharmony_ci * reverse-mapping phys_nid. The maps should always exist but fall 48362306a36Sopenharmony_ci * back to zero just in case. 48462306a36Sopenharmony_ci */ 48562306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { 48662306a36Sopenharmony_ci if (__apicid_to_node[i] == NUMA_NO_NODE) 48762306a36Sopenharmony_ci continue; 48862306a36Sopenharmony_ci for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) 48962306a36Sopenharmony_ci if (__apicid_to_node[i] == emu_nid_to_phys[j]) 49062306a36Sopenharmony_ci break; 49162306a36Sopenharmony_ci __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; 49262306a36Sopenharmony_ci } 49362306a36Sopenharmony_ci 49462306a36Sopenharmony_ci /* make sure all emulated nodes are mapped to a physical node */ 49562306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) 49662306a36Sopenharmony_ci if (emu_nid_to_phys[i] == NUMA_NO_NODE) 49762306a36Sopenharmony_ci emu_nid_to_phys[i] = dfl_phys_nid; 49862306a36Sopenharmony_ci 49962306a36Sopenharmony_ci /* transform distance table */ 50062306a36Sopenharmony_ci numa_reset_distance(); 50162306a36Sopenharmony_ci for (i = 0; i < max_emu_nid + 1; i++) { 50262306a36Sopenharmony_ci for (j = 0; j < max_emu_nid + 1; j++) { 50362306a36Sopenharmony_ci int physi = emu_nid_to_phys[i]; 50462306a36Sopenharmony_ci int physj = emu_nid_to_phys[j]; 50562306a36Sopenharmony_ci int dist; 50662306a36Sopenharmony_ci 50762306a36Sopenharmony_ci if (get_option(&emu_cmdline, &dist) == 2) 50862306a36Sopenharmony_ci ; 50962306a36Sopenharmony_ci else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) 51062306a36Sopenharmony_ci dist = physi == physj ? 51162306a36Sopenharmony_ci LOCAL_DISTANCE : REMOTE_DISTANCE; 51262306a36Sopenharmony_ci else 51362306a36Sopenharmony_ci dist = phys_dist[physi * numa_dist_cnt + physj]; 51462306a36Sopenharmony_ci 51562306a36Sopenharmony_ci numa_set_distance(i, j, dist); 51662306a36Sopenharmony_ci } 51762306a36Sopenharmony_ci } 51862306a36Sopenharmony_ci 51962306a36Sopenharmony_ci /* free the copied physical distance table */ 52062306a36Sopenharmony_ci memblock_free(phys_dist, phys_size); 52162306a36Sopenharmony_ci return; 52262306a36Sopenharmony_ci 52362306a36Sopenharmony_cino_emu: 52462306a36Sopenharmony_ci /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ 52562306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) 52662306a36Sopenharmony_ci emu_nid_to_phys[i] = i; 52762306a36Sopenharmony_ci} 52862306a36Sopenharmony_ci 52962306a36Sopenharmony_ci#ifndef CONFIG_DEBUG_PER_CPU_MAPS 53062306a36Sopenharmony_civoid numa_add_cpu(int cpu) 53162306a36Sopenharmony_ci{ 53262306a36Sopenharmony_ci int physnid, nid; 53362306a36Sopenharmony_ci 53462306a36Sopenharmony_ci nid = early_cpu_to_node(cpu); 53562306a36Sopenharmony_ci BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); 53662306a36Sopenharmony_ci 53762306a36Sopenharmony_ci physnid = emu_nid_to_phys[nid]; 53862306a36Sopenharmony_ci 53962306a36Sopenharmony_ci /* 54062306a36Sopenharmony_ci * Map the cpu to each emulated node that is allocated on the physical 54162306a36Sopenharmony_ci * node of the cpu's apic id. 54262306a36Sopenharmony_ci */ 54362306a36Sopenharmony_ci for_each_online_node(nid) 54462306a36Sopenharmony_ci if (emu_nid_to_phys[nid] == physnid) 54562306a36Sopenharmony_ci cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); 54662306a36Sopenharmony_ci} 54762306a36Sopenharmony_ci 54862306a36Sopenharmony_civoid numa_remove_cpu(int cpu) 54962306a36Sopenharmony_ci{ 55062306a36Sopenharmony_ci int i; 55162306a36Sopenharmony_ci 55262306a36Sopenharmony_ci for_each_online_node(i) 55362306a36Sopenharmony_ci cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); 55462306a36Sopenharmony_ci} 55562306a36Sopenharmony_ci#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ 55662306a36Sopenharmony_cistatic void numa_set_cpumask(int cpu, bool enable) 55762306a36Sopenharmony_ci{ 55862306a36Sopenharmony_ci int nid, physnid; 55962306a36Sopenharmony_ci 56062306a36Sopenharmony_ci nid = early_cpu_to_node(cpu); 56162306a36Sopenharmony_ci if (nid == NUMA_NO_NODE) { 56262306a36Sopenharmony_ci /* early_cpu_to_node() already emits a warning and trace */ 56362306a36Sopenharmony_ci return; 56462306a36Sopenharmony_ci } 56562306a36Sopenharmony_ci 56662306a36Sopenharmony_ci physnid = emu_nid_to_phys[nid]; 56762306a36Sopenharmony_ci 56862306a36Sopenharmony_ci for_each_online_node(nid) { 56962306a36Sopenharmony_ci if (emu_nid_to_phys[nid] != physnid) 57062306a36Sopenharmony_ci continue; 57162306a36Sopenharmony_ci 57262306a36Sopenharmony_ci debug_cpumask_set_cpu(cpu, nid, enable); 57362306a36Sopenharmony_ci } 57462306a36Sopenharmony_ci} 57562306a36Sopenharmony_ci 57662306a36Sopenharmony_civoid numa_add_cpu(int cpu) 57762306a36Sopenharmony_ci{ 57862306a36Sopenharmony_ci numa_set_cpumask(cpu, true); 57962306a36Sopenharmony_ci} 58062306a36Sopenharmony_ci 58162306a36Sopenharmony_civoid numa_remove_cpu(int cpu) 58262306a36Sopenharmony_ci{ 58362306a36Sopenharmony_ci numa_set_cpumask(cpu, false); 58462306a36Sopenharmony_ci} 58562306a36Sopenharmony_ci#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 586