18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * NUMA emulation 48c2ecf20Sopenharmony_ci */ 58c2ecf20Sopenharmony_ci#include <linux/kernel.h> 68c2ecf20Sopenharmony_ci#include <linux/errno.h> 78c2ecf20Sopenharmony_ci#include <linux/topology.h> 88c2ecf20Sopenharmony_ci#include <linux/memblock.h> 98c2ecf20Sopenharmony_ci#include <asm/dma.h> 108c2ecf20Sopenharmony_ci 118c2ecf20Sopenharmony_ci#include "numa_internal.h" 128c2ecf20Sopenharmony_ci 138c2ecf20Sopenharmony_cistatic int emu_nid_to_phys[MAX_NUMNODES]; 148c2ecf20Sopenharmony_cistatic char *emu_cmdline __initdata; 158c2ecf20Sopenharmony_ci 168c2ecf20Sopenharmony_ciint __init numa_emu_cmdline(char *str) 178c2ecf20Sopenharmony_ci{ 188c2ecf20Sopenharmony_ci emu_cmdline = str; 198c2ecf20Sopenharmony_ci return 0; 208c2ecf20Sopenharmony_ci} 218c2ecf20Sopenharmony_ci 228c2ecf20Sopenharmony_cistatic int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) 238c2ecf20Sopenharmony_ci{ 248c2ecf20Sopenharmony_ci int i; 258c2ecf20Sopenharmony_ci 268c2ecf20Sopenharmony_ci for (i = 0; i < mi->nr_blks; i++) 278c2ecf20Sopenharmony_ci if (mi->blk[i].nid == nid) 288c2ecf20Sopenharmony_ci return i; 298c2ecf20Sopenharmony_ci return -ENOENT; 308c2ecf20Sopenharmony_ci} 318c2ecf20Sopenharmony_ci 328c2ecf20Sopenharmony_cistatic u64 __init mem_hole_size(u64 start, u64 end) 338c2ecf20Sopenharmony_ci{ 348c2ecf20Sopenharmony_ci unsigned long start_pfn = PFN_UP(start); 358c2ecf20Sopenharmony_ci unsigned long end_pfn = PFN_DOWN(end); 368c2ecf20Sopenharmony_ci 378c2ecf20Sopenharmony_ci if (start_pfn < end_pfn) 388c2ecf20Sopenharmony_ci return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); 398c2ecf20Sopenharmony_ci return 0; 408c2ecf20Sopenharmony_ci} 418c2ecf20Sopenharmony_ci 428c2ecf20Sopenharmony_ci/* 438c2ecf20Sopenharmony_ci * Sets up nid to range from @start to @end. The return value is -errno if 448c2ecf20Sopenharmony_ci * something went wrong, 0 otherwise. 458c2ecf20Sopenharmony_ci */ 468c2ecf20Sopenharmony_cistatic int __init emu_setup_memblk(struct numa_meminfo *ei, 478c2ecf20Sopenharmony_ci struct numa_meminfo *pi, 488c2ecf20Sopenharmony_ci int nid, int phys_blk, u64 size) 498c2ecf20Sopenharmony_ci{ 508c2ecf20Sopenharmony_ci struct numa_memblk *eb = &ei->blk[ei->nr_blks]; 518c2ecf20Sopenharmony_ci struct numa_memblk *pb = &pi->blk[phys_blk]; 528c2ecf20Sopenharmony_ci 538c2ecf20Sopenharmony_ci if (ei->nr_blks >= NR_NODE_MEMBLKS) { 548c2ecf20Sopenharmony_ci pr_err("NUMA: Too many emulated memblks, failing emulation\n"); 558c2ecf20Sopenharmony_ci return -EINVAL; 568c2ecf20Sopenharmony_ci } 578c2ecf20Sopenharmony_ci 588c2ecf20Sopenharmony_ci ei->nr_blks++; 598c2ecf20Sopenharmony_ci eb->start = pb->start; 608c2ecf20Sopenharmony_ci eb->end = pb->start + size; 618c2ecf20Sopenharmony_ci eb->nid = nid; 628c2ecf20Sopenharmony_ci 638c2ecf20Sopenharmony_ci if (emu_nid_to_phys[nid] == NUMA_NO_NODE) 648c2ecf20Sopenharmony_ci emu_nid_to_phys[nid] = pb->nid; 658c2ecf20Sopenharmony_ci 668c2ecf20Sopenharmony_ci pb->start += size; 678c2ecf20Sopenharmony_ci if (pb->start >= pb->end) { 688c2ecf20Sopenharmony_ci WARN_ON_ONCE(pb->start > pb->end); 698c2ecf20Sopenharmony_ci numa_remove_memblk_from(phys_blk, pi); 708c2ecf20Sopenharmony_ci } 718c2ecf20Sopenharmony_ci 728c2ecf20Sopenharmony_ci printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", 738c2ecf20Sopenharmony_ci nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); 748c2ecf20Sopenharmony_ci return 0; 758c2ecf20Sopenharmony_ci} 768c2ecf20Sopenharmony_ci 778c2ecf20Sopenharmony_ci/* 788c2ecf20Sopenharmony_ci * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr 798c2ecf20Sopenharmony_ci * to max_addr. 808c2ecf20Sopenharmony_ci * 818c2ecf20Sopenharmony_ci * Returns zero on success or negative on error. 828c2ecf20Sopenharmony_ci */ 838c2ecf20Sopenharmony_cistatic int __init split_nodes_interleave(struct numa_meminfo *ei, 848c2ecf20Sopenharmony_ci struct numa_meminfo *pi, 858c2ecf20Sopenharmony_ci u64 addr, u64 max_addr, int nr_nodes) 868c2ecf20Sopenharmony_ci{ 878c2ecf20Sopenharmony_ci nodemask_t physnode_mask = numa_nodes_parsed; 888c2ecf20Sopenharmony_ci u64 size; 898c2ecf20Sopenharmony_ci int big; 908c2ecf20Sopenharmony_ci int nid = 0; 918c2ecf20Sopenharmony_ci int i, ret; 928c2ecf20Sopenharmony_ci 938c2ecf20Sopenharmony_ci if (nr_nodes <= 0) 948c2ecf20Sopenharmony_ci return -1; 958c2ecf20Sopenharmony_ci if (nr_nodes > MAX_NUMNODES) { 968c2ecf20Sopenharmony_ci pr_info("numa=fake=%d too large, reducing to %d\n", 978c2ecf20Sopenharmony_ci nr_nodes, MAX_NUMNODES); 988c2ecf20Sopenharmony_ci nr_nodes = MAX_NUMNODES; 998c2ecf20Sopenharmony_ci } 1008c2ecf20Sopenharmony_ci 1018c2ecf20Sopenharmony_ci /* 1028c2ecf20Sopenharmony_ci * Calculate target node size. x86_32 freaks on __udivdi3() so do 1038c2ecf20Sopenharmony_ci * the division in ulong number of pages and convert back. 1048c2ecf20Sopenharmony_ci */ 1058c2ecf20Sopenharmony_ci size = max_addr - addr - mem_hole_size(addr, max_addr); 1068c2ecf20Sopenharmony_ci size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); 1078c2ecf20Sopenharmony_ci 1088c2ecf20Sopenharmony_ci /* 1098c2ecf20Sopenharmony_ci * Calculate the number of big nodes that can be allocated as a result 1108c2ecf20Sopenharmony_ci * of consolidating the remainder. 1118c2ecf20Sopenharmony_ci */ 1128c2ecf20Sopenharmony_ci big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / 1138c2ecf20Sopenharmony_ci FAKE_NODE_MIN_SIZE; 1148c2ecf20Sopenharmony_ci 1158c2ecf20Sopenharmony_ci size &= FAKE_NODE_MIN_HASH_MASK; 1168c2ecf20Sopenharmony_ci if (!size) { 1178c2ecf20Sopenharmony_ci pr_err("Not enough memory for each node. " 1188c2ecf20Sopenharmony_ci "NUMA emulation disabled.\n"); 1198c2ecf20Sopenharmony_ci return -1; 1208c2ecf20Sopenharmony_ci } 1218c2ecf20Sopenharmony_ci 1228c2ecf20Sopenharmony_ci /* 1238c2ecf20Sopenharmony_ci * Continue to fill physical nodes with fake nodes until there is no 1248c2ecf20Sopenharmony_ci * memory left on any of them. 1258c2ecf20Sopenharmony_ci */ 1268c2ecf20Sopenharmony_ci while (nodes_weight(physnode_mask)) { 1278c2ecf20Sopenharmony_ci for_each_node_mask(i, physnode_mask) { 1288c2ecf20Sopenharmony_ci u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); 1298c2ecf20Sopenharmony_ci u64 start, limit, end; 1308c2ecf20Sopenharmony_ci int phys_blk; 1318c2ecf20Sopenharmony_ci 1328c2ecf20Sopenharmony_ci phys_blk = emu_find_memblk_by_nid(i, pi); 1338c2ecf20Sopenharmony_ci if (phys_blk < 0) { 1348c2ecf20Sopenharmony_ci node_clear(i, physnode_mask); 1358c2ecf20Sopenharmony_ci continue; 1368c2ecf20Sopenharmony_ci } 1378c2ecf20Sopenharmony_ci start = pi->blk[phys_blk].start; 1388c2ecf20Sopenharmony_ci limit = pi->blk[phys_blk].end; 1398c2ecf20Sopenharmony_ci end = start + size; 1408c2ecf20Sopenharmony_ci 1418c2ecf20Sopenharmony_ci if (nid < big) 1428c2ecf20Sopenharmony_ci end += FAKE_NODE_MIN_SIZE; 1438c2ecf20Sopenharmony_ci 1448c2ecf20Sopenharmony_ci /* 1458c2ecf20Sopenharmony_ci * Continue to add memory to this fake node if its 1468c2ecf20Sopenharmony_ci * non-reserved memory is less than the per-node size. 1478c2ecf20Sopenharmony_ci */ 1488c2ecf20Sopenharmony_ci while (end - start - mem_hole_size(start, end) < size) { 1498c2ecf20Sopenharmony_ci end += FAKE_NODE_MIN_SIZE; 1508c2ecf20Sopenharmony_ci if (end > limit) { 1518c2ecf20Sopenharmony_ci end = limit; 1528c2ecf20Sopenharmony_ci break; 1538c2ecf20Sopenharmony_ci } 1548c2ecf20Sopenharmony_ci } 1558c2ecf20Sopenharmony_ci 1568c2ecf20Sopenharmony_ci /* 1578c2ecf20Sopenharmony_ci * If there won't be at least FAKE_NODE_MIN_SIZE of 1588c2ecf20Sopenharmony_ci * non-reserved memory in ZONE_DMA32 for the next node, 1598c2ecf20Sopenharmony_ci * this one must extend to the boundary. 1608c2ecf20Sopenharmony_ci */ 1618c2ecf20Sopenharmony_ci if (end < dma32_end && dma32_end - end - 1628c2ecf20Sopenharmony_ci mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 1638c2ecf20Sopenharmony_ci end = dma32_end; 1648c2ecf20Sopenharmony_ci 1658c2ecf20Sopenharmony_ci /* 1668c2ecf20Sopenharmony_ci * If there won't be enough non-reserved memory for the 1678c2ecf20Sopenharmony_ci * next node, this one must extend to the end of the 1688c2ecf20Sopenharmony_ci * physical node. 1698c2ecf20Sopenharmony_ci */ 1708c2ecf20Sopenharmony_ci if (limit - end - mem_hole_size(end, limit) < size) 1718c2ecf20Sopenharmony_ci end = limit; 1728c2ecf20Sopenharmony_ci 1738c2ecf20Sopenharmony_ci ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, 1748c2ecf20Sopenharmony_ci phys_blk, 1758c2ecf20Sopenharmony_ci min(end, limit) - start); 1768c2ecf20Sopenharmony_ci if (ret < 0) 1778c2ecf20Sopenharmony_ci return ret; 1788c2ecf20Sopenharmony_ci } 1798c2ecf20Sopenharmony_ci } 1808c2ecf20Sopenharmony_ci return 0; 1818c2ecf20Sopenharmony_ci} 1828c2ecf20Sopenharmony_ci 1838c2ecf20Sopenharmony_ci/* 1848c2ecf20Sopenharmony_ci * Returns the end address of a node so that there is at least `size' amount of 1858c2ecf20Sopenharmony_ci * non-reserved memory or `max_addr' is reached. 1868c2ecf20Sopenharmony_ci */ 1878c2ecf20Sopenharmony_cistatic u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) 1888c2ecf20Sopenharmony_ci{ 1898c2ecf20Sopenharmony_ci u64 end = start + size; 1908c2ecf20Sopenharmony_ci 1918c2ecf20Sopenharmony_ci while (end - start - mem_hole_size(start, end) < size) { 1928c2ecf20Sopenharmony_ci end += FAKE_NODE_MIN_SIZE; 1938c2ecf20Sopenharmony_ci if (end > max_addr) { 1948c2ecf20Sopenharmony_ci end = max_addr; 1958c2ecf20Sopenharmony_ci break; 1968c2ecf20Sopenharmony_ci } 1978c2ecf20Sopenharmony_ci } 1988c2ecf20Sopenharmony_ci return end; 1998c2ecf20Sopenharmony_ci} 2008c2ecf20Sopenharmony_ci 2018c2ecf20Sopenharmony_cistatic u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes) 2028c2ecf20Sopenharmony_ci{ 2038c2ecf20Sopenharmony_ci unsigned long max_pfn = PHYS_PFN(max_addr); 2048c2ecf20Sopenharmony_ci unsigned long base_pfn = PHYS_PFN(base); 2058c2ecf20Sopenharmony_ci unsigned long hole_pfns = PHYS_PFN(hole); 2068c2ecf20Sopenharmony_ci 2078c2ecf20Sopenharmony_ci return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes); 2088c2ecf20Sopenharmony_ci} 2098c2ecf20Sopenharmony_ci 2108c2ecf20Sopenharmony_ci/* 2118c2ecf20Sopenharmony_ci * Sets up fake nodes of `size' interleaved over physical nodes ranging from 2128c2ecf20Sopenharmony_ci * `addr' to `max_addr'. 2138c2ecf20Sopenharmony_ci * 2148c2ecf20Sopenharmony_ci * Returns zero on success or negative on error. 2158c2ecf20Sopenharmony_ci */ 2168c2ecf20Sopenharmony_cistatic int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, 2178c2ecf20Sopenharmony_ci struct numa_meminfo *pi, 2188c2ecf20Sopenharmony_ci u64 addr, u64 max_addr, u64 size, 2198c2ecf20Sopenharmony_ci int nr_nodes, struct numa_memblk *pblk, 2208c2ecf20Sopenharmony_ci int nid) 2218c2ecf20Sopenharmony_ci{ 2228c2ecf20Sopenharmony_ci nodemask_t physnode_mask = numa_nodes_parsed; 2238c2ecf20Sopenharmony_ci int i, ret, uniform = 0; 2248c2ecf20Sopenharmony_ci u64 min_size; 2258c2ecf20Sopenharmony_ci 2268c2ecf20Sopenharmony_ci if ((!size && !nr_nodes) || (nr_nodes && !pblk)) 2278c2ecf20Sopenharmony_ci return -1; 2288c2ecf20Sopenharmony_ci 2298c2ecf20Sopenharmony_ci /* 2308c2ecf20Sopenharmony_ci * In the 'uniform' case split the passed in physical node by 2318c2ecf20Sopenharmony_ci * nr_nodes, in the non-uniform case, ignore the passed in 2328c2ecf20Sopenharmony_ci * physical block and try to create nodes of at least size 2338c2ecf20Sopenharmony_ci * @size. 2348c2ecf20Sopenharmony_ci * 2358c2ecf20Sopenharmony_ci * In the uniform case, split the nodes strictly by physical 2368c2ecf20Sopenharmony_ci * capacity, i.e. ignore holes. In the non-uniform case account 2378c2ecf20Sopenharmony_ci * for holes and treat @size as a minimum floor. 2388c2ecf20Sopenharmony_ci */ 2398c2ecf20Sopenharmony_ci if (!nr_nodes) 2408c2ecf20Sopenharmony_ci nr_nodes = MAX_NUMNODES; 2418c2ecf20Sopenharmony_ci else { 2428c2ecf20Sopenharmony_ci nodes_clear(physnode_mask); 2438c2ecf20Sopenharmony_ci node_set(pblk->nid, physnode_mask); 2448c2ecf20Sopenharmony_ci uniform = 1; 2458c2ecf20Sopenharmony_ci } 2468c2ecf20Sopenharmony_ci 2478c2ecf20Sopenharmony_ci if (uniform) { 2488c2ecf20Sopenharmony_ci min_size = uniform_size(max_addr, addr, 0, nr_nodes); 2498c2ecf20Sopenharmony_ci size = min_size; 2508c2ecf20Sopenharmony_ci } else { 2518c2ecf20Sopenharmony_ci /* 2528c2ecf20Sopenharmony_ci * The limit on emulated nodes is MAX_NUMNODES, so the 2538c2ecf20Sopenharmony_ci * size per node is increased accordingly if the 2548c2ecf20Sopenharmony_ci * requested size is too small. This creates a uniform 2558c2ecf20Sopenharmony_ci * distribution of node sizes across the entire machine 2568c2ecf20Sopenharmony_ci * (but not necessarily over physical nodes). 2578c2ecf20Sopenharmony_ci */ 2588c2ecf20Sopenharmony_ci min_size = uniform_size(max_addr, addr, 2598c2ecf20Sopenharmony_ci mem_hole_size(addr, max_addr), nr_nodes); 2608c2ecf20Sopenharmony_ci } 2618c2ecf20Sopenharmony_ci min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); 2628c2ecf20Sopenharmony_ci if (size < min_size) { 2638c2ecf20Sopenharmony_ci pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", 2648c2ecf20Sopenharmony_ci size >> 20, min_size >> 20); 2658c2ecf20Sopenharmony_ci size = min_size; 2668c2ecf20Sopenharmony_ci } 2678c2ecf20Sopenharmony_ci size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); 2688c2ecf20Sopenharmony_ci 2698c2ecf20Sopenharmony_ci /* 2708c2ecf20Sopenharmony_ci * Fill physical nodes with fake nodes of size until there is no memory 2718c2ecf20Sopenharmony_ci * left on any of them. 2728c2ecf20Sopenharmony_ci */ 2738c2ecf20Sopenharmony_ci while (nodes_weight(physnode_mask)) { 2748c2ecf20Sopenharmony_ci for_each_node_mask(i, physnode_mask) { 2758c2ecf20Sopenharmony_ci u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); 2768c2ecf20Sopenharmony_ci u64 start, limit, end; 2778c2ecf20Sopenharmony_ci int phys_blk; 2788c2ecf20Sopenharmony_ci 2798c2ecf20Sopenharmony_ci phys_blk = emu_find_memblk_by_nid(i, pi); 2808c2ecf20Sopenharmony_ci if (phys_blk < 0) { 2818c2ecf20Sopenharmony_ci node_clear(i, physnode_mask); 2828c2ecf20Sopenharmony_ci continue; 2838c2ecf20Sopenharmony_ci } 2848c2ecf20Sopenharmony_ci 2858c2ecf20Sopenharmony_ci start = pi->blk[phys_blk].start; 2868c2ecf20Sopenharmony_ci limit = pi->blk[phys_blk].end; 2878c2ecf20Sopenharmony_ci 2888c2ecf20Sopenharmony_ci if (uniform) 2898c2ecf20Sopenharmony_ci end = start + size; 2908c2ecf20Sopenharmony_ci else 2918c2ecf20Sopenharmony_ci end = find_end_of_node(start, limit, size); 2928c2ecf20Sopenharmony_ci /* 2938c2ecf20Sopenharmony_ci * If there won't be at least FAKE_NODE_MIN_SIZE of 2948c2ecf20Sopenharmony_ci * non-reserved memory in ZONE_DMA32 for the next node, 2958c2ecf20Sopenharmony_ci * this one must extend to the boundary. 2968c2ecf20Sopenharmony_ci */ 2978c2ecf20Sopenharmony_ci if (end < dma32_end && dma32_end - end - 2988c2ecf20Sopenharmony_ci mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 2998c2ecf20Sopenharmony_ci end = dma32_end; 3008c2ecf20Sopenharmony_ci 3018c2ecf20Sopenharmony_ci /* 3028c2ecf20Sopenharmony_ci * If there won't be enough non-reserved memory for the 3038c2ecf20Sopenharmony_ci * next node, this one must extend to the end of the 3048c2ecf20Sopenharmony_ci * physical node. 3058c2ecf20Sopenharmony_ci */ 3068c2ecf20Sopenharmony_ci if ((limit - end - mem_hole_size(end, limit) < size) 3078c2ecf20Sopenharmony_ci && !uniform) 3088c2ecf20Sopenharmony_ci end = limit; 3098c2ecf20Sopenharmony_ci 3108c2ecf20Sopenharmony_ci ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, 3118c2ecf20Sopenharmony_ci phys_blk, 3128c2ecf20Sopenharmony_ci min(end, limit) - start); 3138c2ecf20Sopenharmony_ci if (ret < 0) 3148c2ecf20Sopenharmony_ci return ret; 3158c2ecf20Sopenharmony_ci } 3168c2ecf20Sopenharmony_ci } 3178c2ecf20Sopenharmony_ci return nid; 3188c2ecf20Sopenharmony_ci} 3198c2ecf20Sopenharmony_ci 3208c2ecf20Sopenharmony_cistatic int __init split_nodes_size_interleave(struct numa_meminfo *ei, 3218c2ecf20Sopenharmony_ci struct numa_meminfo *pi, 3228c2ecf20Sopenharmony_ci u64 addr, u64 max_addr, u64 size) 3238c2ecf20Sopenharmony_ci{ 3248c2ecf20Sopenharmony_ci return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, 3258c2ecf20Sopenharmony_ci 0, NULL, 0); 3268c2ecf20Sopenharmony_ci} 3278c2ecf20Sopenharmony_ci 3288c2ecf20Sopenharmony_cistatic int __init setup_emu2phys_nid(int *dfl_phys_nid) 3298c2ecf20Sopenharmony_ci{ 3308c2ecf20Sopenharmony_ci int i, max_emu_nid = 0; 3318c2ecf20Sopenharmony_ci 3328c2ecf20Sopenharmony_ci *dfl_phys_nid = NUMA_NO_NODE; 3338c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { 3348c2ecf20Sopenharmony_ci if (emu_nid_to_phys[i] != NUMA_NO_NODE) { 3358c2ecf20Sopenharmony_ci max_emu_nid = i; 3368c2ecf20Sopenharmony_ci if (*dfl_phys_nid == NUMA_NO_NODE) 3378c2ecf20Sopenharmony_ci *dfl_phys_nid = emu_nid_to_phys[i]; 3388c2ecf20Sopenharmony_ci } 3398c2ecf20Sopenharmony_ci } 3408c2ecf20Sopenharmony_ci 3418c2ecf20Sopenharmony_ci return max_emu_nid; 3428c2ecf20Sopenharmony_ci} 3438c2ecf20Sopenharmony_ci 3448c2ecf20Sopenharmony_ci/** 3458c2ecf20Sopenharmony_ci * numa_emulation - Emulate NUMA nodes 3468c2ecf20Sopenharmony_ci * @numa_meminfo: NUMA configuration to massage 3478c2ecf20Sopenharmony_ci * @numa_dist_cnt: The size of the physical NUMA distance table 3488c2ecf20Sopenharmony_ci * 3498c2ecf20Sopenharmony_ci * Emulate NUMA nodes according to the numa=fake kernel parameter. 3508c2ecf20Sopenharmony_ci * @numa_meminfo contains the physical memory configuration and is modified 3518c2ecf20Sopenharmony_ci * to reflect the emulated configuration on success. @numa_dist_cnt is 3528c2ecf20Sopenharmony_ci * used to determine the size of the physical distance table. 3538c2ecf20Sopenharmony_ci * 3548c2ecf20Sopenharmony_ci * On success, the following modifications are made. 3558c2ecf20Sopenharmony_ci * 3568c2ecf20Sopenharmony_ci * - @numa_meminfo is updated to reflect the emulated nodes. 3578c2ecf20Sopenharmony_ci * 3588c2ecf20Sopenharmony_ci * - __apicid_to_node[] is updated such that APIC IDs are mapped to the 3598c2ecf20Sopenharmony_ci * emulated nodes. 3608c2ecf20Sopenharmony_ci * 3618c2ecf20Sopenharmony_ci * - NUMA distance table is rebuilt to represent distances between emulated 3628c2ecf20Sopenharmony_ci * nodes. The distances are determined considering how emulated nodes 3638c2ecf20Sopenharmony_ci * are mapped to physical nodes and match the actual distances. 3648c2ecf20Sopenharmony_ci * 3658c2ecf20Sopenharmony_ci * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical 3668c2ecf20Sopenharmony_ci * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). 3678c2ecf20Sopenharmony_ci * 3688c2ecf20Sopenharmony_ci * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with 3698c2ecf20Sopenharmony_ci * identity mapping and no other modification is made. 3708c2ecf20Sopenharmony_ci */ 3718c2ecf20Sopenharmony_civoid __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) 3728c2ecf20Sopenharmony_ci{ 3738c2ecf20Sopenharmony_ci static struct numa_meminfo ei __initdata; 3748c2ecf20Sopenharmony_ci static struct numa_meminfo pi __initdata; 3758c2ecf20Sopenharmony_ci const u64 max_addr = PFN_PHYS(max_pfn); 3768c2ecf20Sopenharmony_ci u8 *phys_dist = NULL; 3778c2ecf20Sopenharmony_ci size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); 3788c2ecf20Sopenharmony_ci int max_emu_nid, dfl_phys_nid; 3798c2ecf20Sopenharmony_ci int i, j, ret; 3808c2ecf20Sopenharmony_ci 3818c2ecf20Sopenharmony_ci if (!emu_cmdline) 3828c2ecf20Sopenharmony_ci goto no_emu; 3838c2ecf20Sopenharmony_ci 3848c2ecf20Sopenharmony_ci memset(&ei, 0, sizeof(ei)); 3858c2ecf20Sopenharmony_ci pi = *numa_meminfo; 3868c2ecf20Sopenharmony_ci 3878c2ecf20Sopenharmony_ci for (i = 0; i < MAX_NUMNODES; i++) 3888c2ecf20Sopenharmony_ci emu_nid_to_phys[i] = NUMA_NO_NODE; 3898c2ecf20Sopenharmony_ci 3908c2ecf20Sopenharmony_ci /* 3918c2ecf20Sopenharmony_ci * If the numa=fake command-line contains a 'M' or 'G', it represents 3928c2ecf20Sopenharmony_ci * the fixed node size. Otherwise, if it is just a single number N, 3938c2ecf20Sopenharmony_ci * split the system RAM into N fake nodes. 3948c2ecf20Sopenharmony_ci */ 3958c2ecf20Sopenharmony_ci if (strchr(emu_cmdline, 'U')) { 3968c2ecf20Sopenharmony_ci nodemask_t physnode_mask = numa_nodes_parsed; 3978c2ecf20Sopenharmony_ci unsigned long n; 3988c2ecf20Sopenharmony_ci int nid = 0; 3998c2ecf20Sopenharmony_ci 4008c2ecf20Sopenharmony_ci n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); 4018c2ecf20Sopenharmony_ci ret = -1; 4028c2ecf20Sopenharmony_ci for_each_node_mask(i, physnode_mask) { 4038c2ecf20Sopenharmony_ci /* 4048c2ecf20Sopenharmony_ci * The reason we pass in blk[0] is due to 4058c2ecf20Sopenharmony_ci * numa_remove_memblk_from() called by 4068c2ecf20Sopenharmony_ci * emu_setup_memblk() will delete entry 0 4078c2ecf20Sopenharmony_ci * and then move everything else up in the pi.blk 4088c2ecf20Sopenharmony_ci * array. Therefore we should always be looking 4098c2ecf20Sopenharmony_ci * at blk[0]. 4108c2ecf20Sopenharmony_ci */ 4118c2ecf20Sopenharmony_ci ret = split_nodes_size_interleave_uniform(&ei, &pi, 4128c2ecf20Sopenharmony_ci pi.blk[0].start, pi.blk[0].end, 0, 4138c2ecf20Sopenharmony_ci n, &pi.blk[0], nid); 4148c2ecf20Sopenharmony_ci if (ret < 0) 4158c2ecf20Sopenharmony_ci break; 4168c2ecf20Sopenharmony_ci if (ret < n) { 4178c2ecf20Sopenharmony_ci pr_info("%s: phys: %d only got %d of %ld nodes, failing\n", 4188c2ecf20Sopenharmony_ci __func__, i, ret, n); 4198c2ecf20Sopenharmony_ci ret = -1; 4208c2ecf20Sopenharmony_ci break; 4218c2ecf20Sopenharmony_ci } 4228c2ecf20Sopenharmony_ci nid = ret; 4238c2ecf20Sopenharmony_ci } 4248c2ecf20Sopenharmony_ci } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { 4258c2ecf20Sopenharmony_ci u64 size; 4268c2ecf20Sopenharmony_ci 4278c2ecf20Sopenharmony_ci size = memparse(emu_cmdline, &emu_cmdline); 4288c2ecf20Sopenharmony_ci ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); 4298c2ecf20Sopenharmony_ci } else { 4308c2ecf20Sopenharmony_ci unsigned long n; 4318c2ecf20Sopenharmony_ci 4328c2ecf20Sopenharmony_ci n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); 4338c2ecf20Sopenharmony_ci ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); 4348c2ecf20Sopenharmony_ci } 4358c2ecf20Sopenharmony_ci if (*emu_cmdline == ':') 4368c2ecf20Sopenharmony_ci emu_cmdline++; 4378c2ecf20Sopenharmony_ci 4388c2ecf20Sopenharmony_ci if (ret < 0) 4398c2ecf20Sopenharmony_ci goto no_emu; 4408c2ecf20Sopenharmony_ci 4418c2ecf20Sopenharmony_ci if (numa_cleanup_meminfo(&ei) < 0) { 4428c2ecf20Sopenharmony_ci pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); 4438c2ecf20Sopenharmony_ci goto no_emu; 4448c2ecf20Sopenharmony_ci } 4458c2ecf20Sopenharmony_ci 4468c2ecf20Sopenharmony_ci /* copy the physical distance table */ 4478c2ecf20Sopenharmony_ci if (numa_dist_cnt) { 4488c2ecf20Sopenharmony_ci u64 phys; 4498c2ecf20Sopenharmony_ci 4508c2ecf20Sopenharmony_ci phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), 4518c2ecf20Sopenharmony_ci phys_size, PAGE_SIZE); 4528c2ecf20Sopenharmony_ci if (!phys) { 4538c2ecf20Sopenharmony_ci pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); 4548c2ecf20Sopenharmony_ci goto no_emu; 4558c2ecf20Sopenharmony_ci } 4568c2ecf20Sopenharmony_ci memblock_reserve(phys, phys_size); 4578c2ecf20Sopenharmony_ci phys_dist = __va(phys); 4588c2ecf20Sopenharmony_ci 4598c2ecf20Sopenharmony_ci for (i = 0; i < numa_dist_cnt; i++) 4608c2ecf20Sopenharmony_ci for (j = 0; j < numa_dist_cnt; j++) 4618c2ecf20Sopenharmony_ci phys_dist[i * numa_dist_cnt + j] = 4628c2ecf20Sopenharmony_ci node_distance(i, j); 4638c2ecf20Sopenharmony_ci } 4648c2ecf20Sopenharmony_ci 4658c2ecf20Sopenharmony_ci /* 4668c2ecf20Sopenharmony_ci * Determine the max emulated nid and the default phys nid to use 4678c2ecf20Sopenharmony_ci * for unmapped nodes. 4688c2ecf20Sopenharmony_ci */ 4698c2ecf20Sopenharmony_ci max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); 4708c2ecf20Sopenharmony_ci 4718c2ecf20Sopenharmony_ci /* commit */ 4728c2ecf20Sopenharmony_ci *numa_meminfo = ei; 4738c2ecf20Sopenharmony_ci 4748c2ecf20Sopenharmony_ci /* Make sure numa_nodes_parsed only contains emulated nodes */ 4758c2ecf20Sopenharmony_ci nodes_clear(numa_nodes_parsed); 4768c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(ei.blk); i++) 4778c2ecf20Sopenharmony_ci if (ei.blk[i].start != ei.blk[i].end && 4788c2ecf20Sopenharmony_ci ei.blk[i].nid != NUMA_NO_NODE) 4798c2ecf20Sopenharmony_ci node_set(ei.blk[i].nid, numa_nodes_parsed); 4808c2ecf20Sopenharmony_ci 4818c2ecf20Sopenharmony_ci /* 4828c2ecf20Sopenharmony_ci * Transform __apicid_to_node table to use emulated nids by 4838c2ecf20Sopenharmony_ci * reverse-mapping phys_nid. The maps should always exist but fall 4848c2ecf20Sopenharmony_ci * back to zero just in case. 4858c2ecf20Sopenharmony_ci */ 4868c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { 4878c2ecf20Sopenharmony_ci if (__apicid_to_node[i] == NUMA_NO_NODE) 4888c2ecf20Sopenharmony_ci continue; 4898c2ecf20Sopenharmony_ci for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) 4908c2ecf20Sopenharmony_ci if (__apicid_to_node[i] == emu_nid_to_phys[j]) 4918c2ecf20Sopenharmony_ci break; 4928c2ecf20Sopenharmony_ci __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; 4938c2ecf20Sopenharmony_ci } 4948c2ecf20Sopenharmony_ci 4958c2ecf20Sopenharmony_ci /* make sure all emulated nodes are mapped to a physical node */ 4968c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) 4978c2ecf20Sopenharmony_ci if (emu_nid_to_phys[i] == NUMA_NO_NODE) 4988c2ecf20Sopenharmony_ci emu_nid_to_phys[i] = dfl_phys_nid; 4998c2ecf20Sopenharmony_ci 5008c2ecf20Sopenharmony_ci /* transform distance table */ 5018c2ecf20Sopenharmony_ci numa_reset_distance(); 5028c2ecf20Sopenharmony_ci for (i = 0; i < max_emu_nid + 1; i++) { 5038c2ecf20Sopenharmony_ci for (j = 0; j < max_emu_nid + 1; j++) { 5048c2ecf20Sopenharmony_ci int physi = emu_nid_to_phys[i]; 5058c2ecf20Sopenharmony_ci int physj = emu_nid_to_phys[j]; 5068c2ecf20Sopenharmony_ci int dist; 5078c2ecf20Sopenharmony_ci 5088c2ecf20Sopenharmony_ci if (get_option(&emu_cmdline, &dist) == 2) 5098c2ecf20Sopenharmony_ci ; 5108c2ecf20Sopenharmony_ci else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) 5118c2ecf20Sopenharmony_ci dist = physi == physj ? 5128c2ecf20Sopenharmony_ci LOCAL_DISTANCE : REMOTE_DISTANCE; 5138c2ecf20Sopenharmony_ci else 5148c2ecf20Sopenharmony_ci dist = phys_dist[physi * numa_dist_cnt + physj]; 5158c2ecf20Sopenharmony_ci 5168c2ecf20Sopenharmony_ci numa_set_distance(i, j, dist); 5178c2ecf20Sopenharmony_ci } 5188c2ecf20Sopenharmony_ci } 5198c2ecf20Sopenharmony_ci 5208c2ecf20Sopenharmony_ci /* free the copied physical distance table */ 5218c2ecf20Sopenharmony_ci if (phys_dist) 5228c2ecf20Sopenharmony_ci memblock_free(__pa(phys_dist), phys_size); 5238c2ecf20Sopenharmony_ci return; 5248c2ecf20Sopenharmony_ci 5258c2ecf20Sopenharmony_cino_emu: 5268c2ecf20Sopenharmony_ci /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ 5278c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) 5288c2ecf20Sopenharmony_ci emu_nid_to_phys[i] = i; 5298c2ecf20Sopenharmony_ci} 5308c2ecf20Sopenharmony_ci 5318c2ecf20Sopenharmony_ci#ifndef CONFIG_DEBUG_PER_CPU_MAPS 5328c2ecf20Sopenharmony_civoid numa_add_cpu(int cpu) 5338c2ecf20Sopenharmony_ci{ 5348c2ecf20Sopenharmony_ci int physnid, nid; 5358c2ecf20Sopenharmony_ci 5368c2ecf20Sopenharmony_ci nid = early_cpu_to_node(cpu); 5378c2ecf20Sopenharmony_ci BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); 5388c2ecf20Sopenharmony_ci 5398c2ecf20Sopenharmony_ci physnid = emu_nid_to_phys[nid]; 5408c2ecf20Sopenharmony_ci 5418c2ecf20Sopenharmony_ci /* 5428c2ecf20Sopenharmony_ci * Map the cpu to each emulated node that is allocated on the physical 5438c2ecf20Sopenharmony_ci * node of the cpu's apic id. 5448c2ecf20Sopenharmony_ci */ 5458c2ecf20Sopenharmony_ci for_each_online_node(nid) 5468c2ecf20Sopenharmony_ci if (emu_nid_to_phys[nid] == physnid) 5478c2ecf20Sopenharmony_ci cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); 5488c2ecf20Sopenharmony_ci} 5498c2ecf20Sopenharmony_ci 5508c2ecf20Sopenharmony_civoid numa_remove_cpu(int cpu) 5518c2ecf20Sopenharmony_ci{ 5528c2ecf20Sopenharmony_ci int i; 5538c2ecf20Sopenharmony_ci 5548c2ecf20Sopenharmony_ci for_each_online_node(i) 5558c2ecf20Sopenharmony_ci cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); 5568c2ecf20Sopenharmony_ci} 5578c2ecf20Sopenharmony_ci#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ 5588c2ecf20Sopenharmony_cistatic void numa_set_cpumask(int cpu, bool enable) 5598c2ecf20Sopenharmony_ci{ 5608c2ecf20Sopenharmony_ci int nid, physnid; 5618c2ecf20Sopenharmony_ci 5628c2ecf20Sopenharmony_ci nid = early_cpu_to_node(cpu); 5638c2ecf20Sopenharmony_ci if (nid == NUMA_NO_NODE) { 5648c2ecf20Sopenharmony_ci /* early_cpu_to_node() already emits a warning and trace */ 5658c2ecf20Sopenharmony_ci return; 5668c2ecf20Sopenharmony_ci } 5678c2ecf20Sopenharmony_ci 5688c2ecf20Sopenharmony_ci physnid = emu_nid_to_phys[nid]; 5698c2ecf20Sopenharmony_ci 5708c2ecf20Sopenharmony_ci for_each_online_node(nid) { 5718c2ecf20Sopenharmony_ci if (emu_nid_to_phys[nid] != physnid) 5728c2ecf20Sopenharmony_ci continue; 5738c2ecf20Sopenharmony_ci 5748c2ecf20Sopenharmony_ci debug_cpumask_set_cpu(cpu, nid, enable); 5758c2ecf20Sopenharmony_ci } 5768c2ecf20Sopenharmony_ci} 5778c2ecf20Sopenharmony_ci 5788c2ecf20Sopenharmony_civoid numa_add_cpu(int cpu) 5798c2ecf20Sopenharmony_ci{ 5808c2ecf20Sopenharmony_ci numa_set_cpumask(cpu, true); 5818c2ecf20Sopenharmony_ci} 5828c2ecf20Sopenharmony_ci 5838c2ecf20Sopenharmony_civoid numa_remove_cpu(int cpu) 5848c2ecf20Sopenharmony_ci{ 5858c2ecf20Sopenharmony_ci numa_set_cpumask(cpu, false); 5868c2ecf20Sopenharmony_ci} 5878c2ecf20Sopenharmony_ci#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 588