18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * NUMA emulation
48c2ecf20Sopenharmony_ci */
58c2ecf20Sopenharmony_ci#include <linux/kernel.h>
68c2ecf20Sopenharmony_ci#include <linux/errno.h>
78c2ecf20Sopenharmony_ci#include <linux/topology.h>
88c2ecf20Sopenharmony_ci#include <linux/memblock.h>
98c2ecf20Sopenharmony_ci#include <asm/dma.h>
108c2ecf20Sopenharmony_ci
118c2ecf20Sopenharmony_ci#include "numa_internal.h"
128c2ecf20Sopenharmony_ci
138c2ecf20Sopenharmony_cistatic int emu_nid_to_phys[MAX_NUMNODES];
148c2ecf20Sopenharmony_cistatic char *emu_cmdline __initdata;
158c2ecf20Sopenharmony_ci
168c2ecf20Sopenharmony_ciint __init numa_emu_cmdline(char *str)
178c2ecf20Sopenharmony_ci{
188c2ecf20Sopenharmony_ci	emu_cmdline = str;
198c2ecf20Sopenharmony_ci	return 0;
208c2ecf20Sopenharmony_ci}
218c2ecf20Sopenharmony_ci
228c2ecf20Sopenharmony_cistatic int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
238c2ecf20Sopenharmony_ci{
248c2ecf20Sopenharmony_ci	int i;
258c2ecf20Sopenharmony_ci
268c2ecf20Sopenharmony_ci	for (i = 0; i < mi->nr_blks; i++)
278c2ecf20Sopenharmony_ci		if (mi->blk[i].nid == nid)
288c2ecf20Sopenharmony_ci			return i;
298c2ecf20Sopenharmony_ci	return -ENOENT;
308c2ecf20Sopenharmony_ci}
318c2ecf20Sopenharmony_ci
328c2ecf20Sopenharmony_cistatic u64 __init mem_hole_size(u64 start, u64 end)
338c2ecf20Sopenharmony_ci{
348c2ecf20Sopenharmony_ci	unsigned long start_pfn = PFN_UP(start);
358c2ecf20Sopenharmony_ci	unsigned long end_pfn = PFN_DOWN(end);
368c2ecf20Sopenharmony_ci
378c2ecf20Sopenharmony_ci	if (start_pfn < end_pfn)
388c2ecf20Sopenharmony_ci		return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
398c2ecf20Sopenharmony_ci	return 0;
408c2ecf20Sopenharmony_ci}
418c2ecf20Sopenharmony_ci
428c2ecf20Sopenharmony_ci/*
438c2ecf20Sopenharmony_ci * Sets up nid to range from @start to @end.  The return value is -errno if
448c2ecf20Sopenharmony_ci * something went wrong, 0 otherwise.
458c2ecf20Sopenharmony_ci */
468c2ecf20Sopenharmony_cistatic int __init emu_setup_memblk(struct numa_meminfo *ei,
478c2ecf20Sopenharmony_ci				   struct numa_meminfo *pi,
488c2ecf20Sopenharmony_ci				   int nid, int phys_blk, u64 size)
498c2ecf20Sopenharmony_ci{
508c2ecf20Sopenharmony_ci	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
518c2ecf20Sopenharmony_ci	struct numa_memblk *pb = &pi->blk[phys_blk];
528c2ecf20Sopenharmony_ci
538c2ecf20Sopenharmony_ci	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
548c2ecf20Sopenharmony_ci		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
558c2ecf20Sopenharmony_ci		return -EINVAL;
568c2ecf20Sopenharmony_ci	}
578c2ecf20Sopenharmony_ci
588c2ecf20Sopenharmony_ci	ei->nr_blks++;
598c2ecf20Sopenharmony_ci	eb->start = pb->start;
608c2ecf20Sopenharmony_ci	eb->end = pb->start + size;
618c2ecf20Sopenharmony_ci	eb->nid = nid;
628c2ecf20Sopenharmony_ci
638c2ecf20Sopenharmony_ci	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
648c2ecf20Sopenharmony_ci		emu_nid_to_phys[nid] = pb->nid;
658c2ecf20Sopenharmony_ci
668c2ecf20Sopenharmony_ci	pb->start += size;
678c2ecf20Sopenharmony_ci	if (pb->start >= pb->end) {
688c2ecf20Sopenharmony_ci		WARN_ON_ONCE(pb->start > pb->end);
698c2ecf20Sopenharmony_ci		numa_remove_memblk_from(phys_blk, pi);
708c2ecf20Sopenharmony_ci	}
718c2ecf20Sopenharmony_ci
728c2ecf20Sopenharmony_ci	printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
738c2ecf20Sopenharmony_ci	       nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
748c2ecf20Sopenharmony_ci	return 0;
758c2ecf20Sopenharmony_ci}
768c2ecf20Sopenharmony_ci
778c2ecf20Sopenharmony_ci/*
788c2ecf20Sopenharmony_ci * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
798c2ecf20Sopenharmony_ci * to max_addr.
808c2ecf20Sopenharmony_ci *
818c2ecf20Sopenharmony_ci * Returns zero on success or negative on error.
828c2ecf20Sopenharmony_ci */
838c2ecf20Sopenharmony_cistatic int __init split_nodes_interleave(struct numa_meminfo *ei,
848c2ecf20Sopenharmony_ci					 struct numa_meminfo *pi,
858c2ecf20Sopenharmony_ci					 u64 addr, u64 max_addr, int nr_nodes)
868c2ecf20Sopenharmony_ci{
878c2ecf20Sopenharmony_ci	nodemask_t physnode_mask = numa_nodes_parsed;
888c2ecf20Sopenharmony_ci	u64 size;
898c2ecf20Sopenharmony_ci	int big;
908c2ecf20Sopenharmony_ci	int nid = 0;
918c2ecf20Sopenharmony_ci	int i, ret;
928c2ecf20Sopenharmony_ci
938c2ecf20Sopenharmony_ci	if (nr_nodes <= 0)
948c2ecf20Sopenharmony_ci		return -1;
958c2ecf20Sopenharmony_ci	if (nr_nodes > MAX_NUMNODES) {
968c2ecf20Sopenharmony_ci		pr_info("numa=fake=%d too large, reducing to %d\n",
978c2ecf20Sopenharmony_ci			nr_nodes, MAX_NUMNODES);
988c2ecf20Sopenharmony_ci		nr_nodes = MAX_NUMNODES;
998c2ecf20Sopenharmony_ci	}
1008c2ecf20Sopenharmony_ci
1018c2ecf20Sopenharmony_ci	/*
1028c2ecf20Sopenharmony_ci	 * Calculate target node size.  x86_32 freaks on __udivdi3() so do
1038c2ecf20Sopenharmony_ci	 * the division in ulong number of pages and convert back.
1048c2ecf20Sopenharmony_ci	 */
1058c2ecf20Sopenharmony_ci	size = max_addr - addr - mem_hole_size(addr, max_addr);
1068c2ecf20Sopenharmony_ci	size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
1078c2ecf20Sopenharmony_ci
1088c2ecf20Sopenharmony_ci	/*
1098c2ecf20Sopenharmony_ci	 * Calculate the number of big nodes that can be allocated as a result
1108c2ecf20Sopenharmony_ci	 * of consolidating the remainder.
1118c2ecf20Sopenharmony_ci	 */
1128c2ecf20Sopenharmony_ci	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
1138c2ecf20Sopenharmony_ci		FAKE_NODE_MIN_SIZE;
1148c2ecf20Sopenharmony_ci
1158c2ecf20Sopenharmony_ci	size &= FAKE_NODE_MIN_HASH_MASK;
1168c2ecf20Sopenharmony_ci	if (!size) {
1178c2ecf20Sopenharmony_ci		pr_err("Not enough memory for each node.  "
1188c2ecf20Sopenharmony_ci			"NUMA emulation disabled.\n");
1198c2ecf20Sopenharmony_ci		return -1;
1208c2ecf20Sopenharmony_ci	}
1218c2ecf20Sopenharmony_ci
1228c2ecf20Sopenharmony_ci	/*
1238c2ecf20Sopenharmony_ci	 * Continue to fill physical nodes with fake nodes until there is no
1248c2ecf20Sopenharmony_ci	 * memory left on any of them.
1258c2ecf20Sopenharmony_ci	 */
1268c2ecf20Sopenharmony_ci	while (nodes_weight(physnode_mask)) {
1278c2ecf20Sopenharmony_ci		for_each_node_mask(i, physnode_mask) {
1288c2ecf20Sopenharmony_ci			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
1298c2ecf20Sopenharmony_ci			u64 start, limit, end;
1308c2ecf20Sopenharmony_ci			int phys_blk;
1318c2ecf20Sopenharmony_ci
1328c2ecf20Sopenharmony_ci			phys_blk = emu_find_memblk_by_nid(i, pi);
1338c2ecf20Sopenharmony_ci			if (phys_blk < 0) {
1348c2ecf20Sopenharmony_ci				node_clear(i, physnode_mask);
1358c2ecf20Sopenharmony_ci				continue;
1368c2ecf20Sopenharmony_ci			}
1378c2ecf20Sopenharmony_ci			start = pi->blk[phys_blk].start;
1388c2ecf20Sopenharmony_ci			limit = pi->blk[phys_blk].end;
1398c2ecf20Sopenharmony_ci			end = start + size;
1408c2ecf20Sopenharmony_ci
1418c2ecf20Sopenharmony_ci			if (nid < big)
1428c2ecf20Sopenharmony_ci				end += FAKE_NODE_MIN_SIZE;
1438c2ecf20Sopenharmony_ci
1448c2ecf20Sopenharmony_ci			/*
1458c2ecf20Sopenharmony_ci			 * Continue to add memory to this fake node if its
1468c2ecf20Sopenharmony_ci			 * non-reserved memory is less than the per-node size.
1478c2ecf20Sopenharmony_ci			 */
1488c2ecf20Sopenharmony_ci			while (end - start - mem_hole_size(start, end) < size) {
1498c2ecf20Sopenharmony_ci				end += FAKE_NODE_MIN_SIZE;
1508c2ecf20Sopenharmony_ci				if (end > limit) {
1518c2ecf20Sopenharmony_ci					end = limit;
1528c2ecf20Sopenharmony_ci					break;
1538c2ecf20Sopenharmony_ci				}
1548c2ecf20Sopenharmony_ci			}
1558c2ecf20Sopenharmony_ci
1568c2ecf20Sopenharmony_ci			/*
1578c2ecf20Sopenharmony_ci			 * If there won't be at least FAKE_NODE_MIN_SIZE of
1588c2ecf20Sopenharmony_ci			 * non-reserved memory in ZONE_DMA32 for the next node,
1598c2ecf20Sopenharmony_ci			 * this one must extend to the boundary.
1608c2ecf20Sopenharmony_ci			 */
1618c2ecf20Sopenharmony_ci			if (end < dma32_end && dma32_end - end -
1628c2ecf20Sopenharmony_ci			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
1638c2ecf20Sopenharmony_ci				end = dma32_end;
1648c2ecf20Sopenharmony_ci
1658c2ecf20Sopenharmony_ci			/*
1668c2ecf20Sopenharmony_ci			 * If there won't be enough non-reserved memory for the
1678c2ecf20Sopenharmony_ci			 * next node, this one must extend to the end of the
1688c2ecf20Sopenharmony_ci			 * physical node.
1698c2ecf20Sopenharmony_ci			 */
1708c2ecf20Sopenharmony_ci			if (limit - end - mem_hole_size(end, limit) < size)
1718c2ecf20Sopenharmony_ci				end = limit;
1728c2ecf20Sopenharmony_ci
1738c2ecf20Sopenharmony_ci			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
1748c2ecf20Sopenharmony_ci					       phys_blk,
1758c2ecf20Sopenharmony_ci					       min(end, limit) - start);
1768c2ecf20Sopenharmony_ci			if (ret < 0)
1778c2ecf20Sopenharmony_ci				return ret;
1788c2ecf20Sopenharmony_ci		}
1798c2ecf20Sopenharmony_ci	}
1808c2ecf20Sopenharmony_ci	return 0;
1818c2ecf20Sopenharmony_ci}
1828c2ecf20Sopenharmony_ci
1838c2ecf20Sopenharmony_ci/*
1848c2ecf20Sopenharmony_ci * Returns the end address of a node so that there is at least `size' amount of
1858c2ecf20Sopenharmony_ci * non-reserved memory or `max_addr' is reached.
1868c2ecf20Sopenharmony_ci */
1878c2ecf20Sopenharmony_cistatic u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
1888c2ecf20Sopenharmony_ci{
1898c2ecf20Sopenharmony_ci	u64 end = start + size;
1908c2ecf20Sopenharmony_ci
1918c2ecf20Sopenharmony_ci	while (end - start - mem_hole_size(start, end) < size) {
1928c2ecf20Sopenharmony_ci		end += FAKE_NODE_MIN_SIZE;
1938c2ecf20Sopenharmony_ci		if (end > max_addr) {
1948c2ecf20Sopenharmony_ci			end = max_addr;
1958c2ecf20Sopenharmony_ci			break;
1968c2ecf20Sopenharmony_ci		}
1978c2ecf20Sopenharmony_ci	}
1988c2ecf20Sopenharmony_ci	return end;
1998c2ecf20Sopenharmony_ci}
2008c2ecf20Sopenharmony_ci
2018c2ecf20Sopenharmony_cistatic u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
2028c2ecf20Sopenharmony_ci{
2038c2ecf20Sopenharmony_ci	unsigned long max_pfn = PHYS_PFN(max_addr);
2048c2ecf20Sopenharmony_ci	unsigned long base_pfn = PHYS_PFN(base);
2058c2ecf20Sopenharmony_ci	unsigned long hole_pfns = PHYS_PFN(hole);
2068c2ecf20Sopenharmony_ci
2078c2ecf20Sopenharmony_ci	return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
2088c2ecf20Sopenharmony_ci}
2098c2ecf20Sopenharmony_ci
2108c2ecf20Sopenharmony_ci/*
2118c2ecf20Sopenharmony_ci * Sets up fake nodes of `size' interleaved over physical nodes ranging from
2128c2ecf20Sopenharmony_ci * `addr' to `max_addr'.
2138c2ecf20Sopenharmony_ci *
2148c2ecf20Sopenharmony_ci * Returns zero on success or negative on error.
2158c2ecf20Sopenharmony_ci */
2168c2ecf20Sopenharmony_cistatic int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
2178c2ecf20Sopenharmony_ci					      struct numa_meminfo *pi,
2188c2ecf20Sopenharmony_ci					      u64 addr, u64 max_addr, u64 size,
2198c2ecf20Sopenharmony_ci					      int nr_nodes, struct numa_memblk *pblk,
2208c2ecf20Sopenharmony_ci					      int nid)
2218c2ecf20Sopenharmony_ci{
2228c2ecf20Sopenharmony_ci	nodemask_t physnode_mask = numa_nodes_parsed;
2238c2ecf20Sopenharmony_ci	int i, ret, uniform = 0;
2248c2ecf20Sopenharmony_ci	u64 min_size;
2258c2ecf20Sopenharmony_ci
2268c2ecf20Sopenharmony_ci	if ((!size && !nr_nodes) || (nr_nodes && !pblk))
2278c2ecf20Sopenharmony_ci		return -1;
2288c2ecf20Sopenharmony_ci
2298c2ecf20Sopenharmony_ci	/*
2308c2ecf20Sopenharmony_ci	 * In the 'uniform' case split the passed in physical node by
2318c2ecf20Sopenharmony_ci	 * nr_nodes, in the non-uniform case, ignore the passed in
2328c2ecf20Sopenharmony_ci	 * physical block and try to create nodes of at least size
2338c2ecf20Sopenharmony_ci	 * @size.
2348c2ecf20Sopenharmony_ci	 *
2358c2ecf20Sopenharmony_ci	 * In the uniform case, split the nodes strictly by physical
2368c2ecf20Sopenharmony_ci	 * capacity, i.e. ignore holes. In the non-uniform case account
2378c2ecf20Sopenharmony_ci	 * for holes and treat @size as a minimum floor.
2388c2ecf20Sopenharmony_ci	 */
2398c2ecf20Sopenharmony_ci	if (!nr_nodes)
2408c2ecf20Sopenharmony_ci		nr_nodes = MAX_NUMNODES;
2418c2ecf20Sopenharmony_ci	else {
2428c2ecf20Sopenharmony_ci		nodes_clear(physnode_mask);
2438c2ecf20Sopenharmony_ci		node_set(pblk->nid, physnode_mask);
2448c2ecf20Sopenharmony_ci		uniform = 1;
2458c2ecf20Sopenharmony_ci	}
2468c2ecf20Sopenharmony_ci
2478c2ecf20Sopenharmony_ci	if (uniform) {
2488c2ecf20Sopenharmony_ci		min_size = uniform_size(max_addr, addr, 0, nr_nodes);
2498c2ecf20Sopenharmony_ci		size = min_size;
2508c2ecf20Sopenharmony_ci	} else {
2518c2ecf20Sopenharmony_ci		/*
2528c2ecf20Sopenharmony_ci		 * The limit on emulated nodes is MAX_NUMNODES, so the
2538c2ecf20Sopenharmony_ci		 * size per node is increased accordingly if the
2548c2ecf20Sopenharmony_ci		 * requested size is too small.  This creates a uniform
2558c2ecf20Sopenharmony_ci		 * distribution of node sizes across the entire machine
2568c2ecf20Sopenharmony_ci		 * (but not necessarily over physical nodes).
2578c2ecf20Sopenharmony_ci		 */
2588c2ecf20Sopenharmony_ci		min_size = uniform_size(max_addr, addr,
2598c2ecf20Sopenharmony_ci				mem_hole_size(addr, max_addr), nr_nodes);
2608c2ecf20Sopenharmony_ci	}
2618c2ecf20Sopenharmony_ci	min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
2628c2ecf20Sopenharmony_ci	if (size < min_size) {
2638c2ecf20Sopenharmony_ci		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
2648c2ecf20Sopenharmony_ci			size >> 20, min_size >> 20);
2658c2ecf20Sopenharmony_ci		size = min_size;
2668c2ecf20Sopenharmony_ci	}
2678c2ecf20Sopenharmony_ci	size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
2688c2ecf20Sopenharmony_ci
2698c2ecf20Sopenharmony_ci	/*
2708c2ecf20Sopenharmony_ci	 * Fill physical nodes with fake nodes of size until there is no memory
2718c2ecf20Sopenharmony_ci	 * left on any of them.
2728c2ecf20Sopenharmony_ci	 */
2738c2ecf20Sopenharmony_ci	while (nodes_weight(physnode_mask)) {
2748c2ecf20Sopenharmony_ci		for_each_node_mask(i, physnode_mask) {
2758c2ecf20Sopenharmony_ci			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
2768c2ecf20Sopenharmony_ci			u64 start, limit, end;
2778c2ecf20Sopenharmony_ci			int phys_blk;
2788c2ecf20Sopenharmony_ci
2798c2ecf20Sopenharmony_ci			phys_blk = emu_find_memblk_by_nid(i, pi);
2808c2ecf20Sopenharmony_ci			if (phys_blk < 0) {
2818c2ecf20Sopenharmony_ci				node_clear(i, physnode_mask);
2828c2ecf20Sopenharmony_ci				continue;
2838c2ecf20Sopenharmony_ci			}
2848c2ecf20Sopenharmony_ci
2858c2ecf20Sopenharmony_ci			start = pi->blk[phys_blk].start;
2868c2ecf20Sopenharmony_ci			limit = pi->blk[phys_blk].end;
2878c2ecf20Sopenharmony_ci
2888c2ecf20Sopenharmony_ci			if (uniform)
2898c2ecf20Sopenharmony_ci				end = start + size;
2908c2ecf20Sopenharmony_ci			else
2918c2ecf20Sopenharmony_ci				end = find_end_of_node(start, limit, size);
2928c2ecf20Sopenharmony_ci			/*
2938c2ecf20Sopenharmony_ci			 * If there won't be at least FAKE_NODE_MIN_SIZE of
2948c2ecf20Sopenharmony_ci			 * non-reserved memory in ZONE_DMA32 for the next node,
2958c2ecf20Sopenharmony_ci			 * this one must extend to the boundary.
2968c2ecf20Sopenharmony_ci			 */
2978c2ecf20Sopenharmony_ci			if (end < dma32_end && dma32_end - end -
2988c2ecf20Sopenharmony_ci			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
2998c2ecf20Sopenharmony_ci				end = dma32_end;
3008c2ecf20Sopenharmony_ci
3018c2ecf20Sopenharmony_ci			/*
3028c2ecf20Sopenharmony_ci			 * If there won't be enough non-reserved memory for the
3038c2ecf20Sopenharmony_ci			 * next node, this one must extend to the end of the
3048c2ecf20Sopenharmony_ci			 * physical node.
3058c2ecf20Sopenharmony_ci			 */
3068c2ecf20Sopenharmony_ci			if ((limit - end - mem_hole_size(end, limit) < size)
3078c2ecf20Sopenharmony_ci					&& !uniform)
3088c2ecf20Sopenharmony_ci				end = limit;
3098c2ecf20Sopenharmony_ci
3108c2ecf20Sopenharmony_ci			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
3118c2ecf20Sopenharmony_ci					       phys_blk,
3128c2ecf20Sopenharmony_ci					       min(end, limit) - start);
3138c2ecf20Sopenharmony_ci			if (ret < 0)
3148c2ecf20Sopenharmony_ci				return ret;
3158c2ecf20Sopenharmony_ci		}
3168c2ecf20Sopenharmony_ci	}
3178c2ecf20Sopenharmony_ci	return nid;
3188c2ecf20Sopenharmony_ci}
3198c2ecf20Sopenharmony_ci
3208c2ecf20Sopenharmony_cistatic int __init split_nodes_size_interleave(struct numa_meminfo *ei,
3218c2ecf20Sopenharmony_ci					      struct numa_meminfo *pi,
3228c2ecf20Sopenharmony_ci					      u64 addr, u64 max_addr, u64 size)
3238c2ecf20Sopenharmony_ci{
3248c2ecf20Sopenharmony_ci	return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
3258c2ecf20Sopenharmony_ci			0, NULL, 0);
3268c2ecf20Sopenharmony_ci}
3278c2ecf20Sopenharmony_ci
3288c2ecf20Sopenharmony_cistatic int __init setup_emu2phys_nid(int *dfl_phys_nid)
3298c2ecf20Sopenharmony_ci{
3308c2ecf20Sopenharmony_ci	int i, max_emu_nid = 0;
3318c2ecf20Sopenharmony_ci
3328c2ecf20Sopenharmony_ci	*dfl_phys_nid = NUMA_NO_NODE;
3338c2ecf20Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
3348c2ecf20Sopenharmony_ci		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
3358c2ecf20Sopenharmony_ci			max_emu_nid = i;
3368c2ecf20Sopenharmony_ci			if (*dfl_phys_nid == NUMA_NO_NODE)
3378c2ecf20Sopenharmony_ci				*dfl_phys_nid = emu_nid_to_phys[i];
3388c2ecf20Sopenharmony_ci		}
3398c2ecf20Sopenharmony_ci	}
3408c2ecf20Sopenharmony_ci
3418c2ecf20Sopenharmony_ci	return max_emu_nid;
3428c2ecf20Sopenharmony_ci}
3438c2ecf20Sopenharmony_ci
3448c2ecf20Sopenharmony_ci/**
3458c2ecf20Sopenharmony_ci * numa_emulation - Emulate NUMA nodes
3468c2ecf20Sopenharmony_ci * @numa_meminfo: NUMA configuration to massage
3478c2ecf20Sopenharmony_ci * @numa_dist_cnt: The size of the physical NUMA distance table
3488c2ecf20Sopenharmony_ci *
3498c2ecf20Sopenharmony_ci * Emulate NUMA nodes according to the numa=fake kernel parameter.
3508c2ecf20Sopenharmony_ci * @numa_meminfo contains the physical memory configuration and is modified
3518c2ecf20Sopenharmony_ci * to reflect the emulated configuration on success.  @numa_dist_cnt is
3528c2ecf20Sopenharmony_ci * used to determine the size of the physical distance table.
3538c2ecf20Sopenharmony_ci *
3548c2ecf20Sopenharmony_ci * On success, the following modifications are made.
3558c2ecf20Sopenharmony_ci *
3568c2ecf20Sopenharmony_ci * - @numa_meminfo is updated to reflect the emulated nodes.
3578c2ecf20Sopenharmony_ci *
3588c2ecf20Sopenharmony_ci * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
3598c2ecf20Sopenharmony_ci *   emulated nodes.
3608c2ecf20Sopenharmony_ci *
3618c2ecf20Sopenharmony_ci * - NUMA distance table is rebuilt to represent distances between emulated
3628c2ecf20Sopenharmony_ci *   nodes.  The distances are determined considering how emulated nodes
3638c2ecf20Sopenharmony_ci *   are mapped to physical nodes and match the actual distances.
3648c2ecf20Sopenharmony_ci *
3658c2ecf20Sopenharmony_ci * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
3668c2ecf20Sopenharmony_ci *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
3678c2ecf20Sopenharmony_ci *
3688c2ecf20Sopenharmony_ci * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
3698c2ecf20Sopenharmony_ci * identity mapping and no other modification is made.
3708c2ecf20Sopenharmony_ci */
3718c2ecf20Sopenharmony_civoid __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
3728c2ecf20Sopenharmony_ci{
3738c2ecf20Sopenharmony_ci	static struct numa_meminfo ei __initdata;
3748c2ecf20Sopenharmony_ci	static struct numa_meminfo pi __initdata;
3758c2ecf20Sopenharmony_ci	const u64 max_addr = PFN_PHYS(max_pfn);
3768c2ecf20Sopenharmony_ci	u8 *phys_dist = NULL;
3778c2ecf20Sopenharmony_ci	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
3788c2ecf20Sopenharmony_ci	int max_emu_nid, dfl_phys_nid;
3798c2ecf20Sopenharmony_ci	int i, j, ret;
3808c2ecf20Sopenharmony_ci
3818c2ecf20Sopenharmony_ci	if (!emu_cmdline)
3828c2ecf20Sopenharmony_ci		goto no_emu;
3838c2ecf20Sopenharmony_ci
3848c2ecf20Sopenharmony_ci	memset(&ei, 0, sizeof(ei));
3858c2ecf20Sopenharmony_ci	pi = *numa_meminfo;
3868c2ecf20Sopenharmony_ci
3878c2ecf20Sopenharmony_ci	for (i = 0; i < MAX_NUMNODES; i++)
3888c2ecf20Sopenharmony_ci		emu_nid_to_phys[i] = NUMA_NO_NODE;
3898c2ecf20Sopenharmony_ci
3908c2ecf20Sopenharmony_ci	/*
3918c2ecf20Sopenharmony_ci	 * If the numa=fake command-line contains a 'M' or 'G', it represents
3928c2ecf20Sopenharmony_ci	 * the fixed node size.  Otherwise, if it is just a single number N,
3938c2ecf20Sopenharmony_ci	 * split the system RAM into N fake nodes.
3948c2ecf20Sopenharmony_ci	 */
3958c2ecf20Sopenharmony_ci	if (strchr(emu_cmdline, 'U')) {
3968c2ecf20Sopenharmony_ci		nodemask_t physnode_mask = numa_nodes_parsed;
3978c2ecf20Sopenharmony_ci		unsigned long n;
3988c2ecf20Sopenharmony_ci		int nid = 0;
3998c2ecf20Sopenharmony_ci
4008c2ecf20Sopenharmony_ci		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
4018c2ecf20Sopenharmony_ci		ret = -1;
4028c2ecf20Sopenharmony_ci		for_each_node_mask(i, physnode_mask) {
4038c2ecf20Sopenharmony_ci			/*
4048c2ecf20Sopenharmony_ci			 * The reason we pass in blk[0] is due to
4058c2ecf20Sopenharmony_ci			 * numa_remove_memblk_from() called by
4068c2ecf20Sopenharmony_ci			 * emu_setup_memblk() will delete entry 0
4078c2ecf20Sopenharmony_ci			 * and then move everything else up in the pi.blk
4088c2ecf20Sopenharmony_ci			 * array. Therefore we should always be looking
4098c2ecf20Sopenharmony_ci			 * at blk[0].
4108c2ecf20Sopenharmony_ci			 */
4118c2ecf20Sopenharmony_ci			ret = split_nodes_size_interleave_uniform(&ei, &pi,
4128c2ecf20Sopenharmony_ci					pi.blk[0].start, pi.blk[0].end, 0,
4138c2ecf20Sopenharmony_ci					n, &pi.blk[0], nid);
4148c2ecf20Sopenharmony_ci			if (ret < 0)
4158c2ecf20Sopenharmony_ci				break;
4168c2ecf20Sopenharmony_ci			if (ret < n) {
4178c2ecf20Sopenharmony_ci				pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
4188c2ecf20Sopenharmony_ci						__func__, i, ret, n);
4198c2ecf20Sopenharmony_ci				ret = -1;
4208c2ecf20Sopenharmony_ci				break;
4218c2ecf20Sopenharmony_ci			}
4228c2ecf20Sopenharmony_ci			nid = ret;
4238c2ecf20Sopenharmony_ci		}
4248c2ecf20Sopenharmony_ci	} else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
4258c2ecf20Sopenharmony_ci		u64 size;
4268c2ecf20Sopenharmony_ci
4278c2ecf20Sopenharmony_ci		size = memparse(emu_cmdline, &emu_cmdline);
4288c2ecf20Sopenharmony_ci		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
4298c2ecf20Sopenharmony_ci	} else {
4308c2ecf20Sopenharmony_ci		unsigned long n;
4318c2ecf20Sopenharmony_ci
4328c2ecf20Sopenharmony_ci		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
4338c2ecf20Sopenharmony_ci		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
4348c2ecf20Sopenharmony_ci	}
4358c2ecf20Sopenharmony_ci	if (*emu_cmdline == ':')
4368c2ecf20Sopenharmony_ci		emu_cmdline++;
4378c2ecf20Sopenharmony_ci
4388c2ecf20Sopenharmony_ci	if (ret < 0)
4398c2ecf20Sopenharmony_ci		goto no_emu;
4408c2ecf20Sopenharmony_ci
4418c2ecf20Sopenharmony_ci	if (numa_cleanup_meminfo(&ei) < 0) {
4428c2ecf20Sopenharmony_ci		pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
4438c2ecf20Sopenharmony_ci		goto no_emu;
4448c2ecf20Sopenharmony_ci	}
4458c2ecf20Sopenharmony_ci
4468c2ecf20Sopenharmony_ci	/* copy the physical distance table */
4478c2ecf20Sopenharmony_ci	if (numa_dist_cnt) {
4488c2ecf20Sopenharmony_ci		u64 phys;
4498c2ecf20Sopenharmony_ci
4508c2ecf20Sopenharmony_ci		phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
4518c2ecf20Sopenharmony_ci					      phys_size, PAGE_SIZE);
4528c2ecf20Sopenharmony_ci		if (!phys) {
4538c2ecf20Sopenharmony_ci			pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
4548c2ecf20Sopenharmony_ci			goto no_emu;
4558c2ecf20Sopenharmony_ci		}
4568c2ecf20Sopenharmony_ci		memblock_reserve(phys, phys_size);
4578c2ecf20Sopenharmony_ci		phys_dist = __va(phys);
4588c2ecf20Sopenharmony_ci
4598c2ecf20Sopenharmony_ci		for (i = 0; i < numa_dist_cnt; i++)
4608c2ecf20Sopenharmony_ci			for (j = 0; j < numa_dist_cnt; j++)
4618c2ecf20Sopenharmony_ci				phys_dist[i * numa_dist_cnt + j] =
4628c2ecf20Sopenharmony_ci					node_distance(i, j);
4638c2ecf20Sopenharmony_ci	}
4648c2ecf20Sopenharmony_ci
4658c2ecf20Sopenharmony_ci	/*
4668c2ecf20Sopenharmony_ci	 * Determine the max emulated nid and the default phys nid to use
4678c2ecf20Sopenharmony_ci	 * for unmapped nodes.
4688c2ecf20Sopenharmony_ci	 */
4698c2ecf20Sopenharmony_ci	max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
4708c2ecf20Sopenharmony_ci
4718c2ecf20Sopenharmony_ci	/* commit */
4728c2ecf20Sopenharmony_ci	*numa_meminfo = ei;
4738c2ecf20Sopenharmony_ci
4748c2ecf20Sopenharmony_ci	/* Make sure numa_nodes_parsed only contains emulated nodes */
4758c2ecf20Sopenharmony_ci	nodes_clear(numa_nodes_parsed);
4768c2ecf20Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
4778c2ecf20Sopenharmony_ci		if (ei.blk[i].start != ei.blk[i].end &&
4788c2ecf20Sopenharmony_ci		    ei.blk[i].nid != NUMA_NO_NODE)
4798c2ecf20Sopenharmony_ci			node_set(ei.blk[i].nid, numa_nodes_parsed);
4808c2ecf20Sopenharmony_ci
4818c2ecf20Sopenharmony_ci	/*
4828c2ecf20Sopenharmony_ci	 * Transform __apicid_to_node table to use emulated nids by
4838c2ecf20Sopenharmony_ci	 * reverse-mapping phys_nid.  The maps should always exist but fall
4848c2ecf20Sopenharmony_ci	 * back to zero just in case.
4858c2ecf20Sopenharmony_ci	 */
4868c2ecf20Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
4878c2ecf20Sopenharmony_ci		if (__apicid_to_node[i] == NUMA_NO_NODE)
4888c2ecf20Sopenharmony_ci			continue;
4898c2ecf20Sopenharmony_ci		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
4908c2ecf20Sopenharmony_ci			if (__apicid_to_node[i] == emu_nid_to_phys[j])
4918c2ecf20Sopenharmony_ci				break;
4928c2ecf20Sopenharmony_ci		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
4938c2ecf20Sopenharmony_ci	}
4948c2ecf20Sopenharmony_ci
4958c2ecf20Sopenharmony_ci	/* make sure all emulated nodes are mapped to a physical node */
4968c2ecf20Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
4978c2ecf20Sopenharmony_ci		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
4988c2ecf20Sopenharmony_ci			emu_nid_to_phys[i] = dfl_phys_nid;
4998c2ecf20Sopenharmony_ci
5008c2ecf20Sopenharmony_ci	/* transform distance table */
5018c2ecf20Sopenharmony_ci	numa_reset_distance();
5028c2ecf20Sopenharmony_ci	for (i = 0; i < max_emu_nid + 1; i++) {
5038c2ecf20Sopenharmony_ci		for (j = 0; j < max_emu_nid + 1; j++) {
5048c2ecf20Sopenharmony_ci			int physi = emu_nid_to_phys[i];
5058c2ecf20Sopenharmony_ci			int physj = emu_nid_to_phys[j];
5068c2ecf20Sopenharmony_ci			int dist;
5078c2ecf20Sopenharmony_ci
5088c2ecf20Sopenharmony_ci			if (get_option(&emu_cmdline, &dist) == 2)
5098c2ecf20Sopenharmony_ci				;
5108c2ecf20Sopenharmony_ci			else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
5118c2ecf20Sopenharmony_ci				dist = physi == physj ?
5128c2ecf20Sopenharmony_ci					LOCAL_DISTANCE : REMOTE_DISTANCE;
5138c2ecf20Sopenharmony_ci			else
5148c2ecf20Sopenharmony_ci				dist = phys_dist[physi * numa_dist_cnt + physj];
5158c2ecf20Sopenharmony_ci
5168c2ecf20Sopenharmony_ci			numa_set_distance(i, j, dist);
5178c2ecf20Sopenharmony_ci		}
5188c2ecf20Sopenharmony_ci	}
5198c2ecf20Sopenharmony_ci
5208c2ecf20Sopenharmony_ci	/* free the copied physical distance table */
5218c2ecf20Sopenharmony_ci	if (phys_dist)
5228c2ecf20Sopenharmony_ci		memblock_free(__pa(phys_dist), phys_size);
5238c2ecf20Sopenharmony_ci	return;
5248c2ecf20Sopenharmony_ci
5258c2ecf20Sopenharmony_cino_emu:
5268c2ecf20Sopenharmony_ci	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
5278c2ecf20Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
5288c2ecf20Sopenharmony_ci		emu_nid_to_phys[i] = i;
5298c2ecf20Sopenharmony_ci}
5308c2ecf20Sopenharmony_ci
5318c2ecf20Sopenharmony_ci#ifndef CONFIG_DEBUG_PER_CPU_MAPS
5328c2ecf20Sopenharmony_civoid numa_add_cpu(int cpu)
5338c2ecf20Sopenharmony_ci{
5348c2ecf20Sopenharmony_ci	int physnid, nid;
5358c2ecf20Sopenharmony_ci
5368c2ecf20Sopenharmony_ci	nid = early_cpu_to_node(cpu);
5378c2ecf20Sopenharmony_ci	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
5388c2ecf20Sopenharmony_ci
5398c2ecf20Sopenharmony_ci	physnid = emu_nid_to_phys[nid];
5408c2ecf20Sopenharmony_ci
5418c2ecf20Sopenharmony_ci	/*
5428c2ecf20Sopenharmony_ci	 * Map the cpu to each emulated node that is allocated on the physical
5438c2ecf20Sopenharmony_ci	 * node of the cpu's apic id.
5448c2ecf20Sopenharmony_ci	 */
5458c2ecf20Sopenharmony_ci	for_each_online_node(nid)
5468c2ecf20Sopenharmony_ci		if (emu_nid_to_phys[nid] == physnid)
5478c2ecf20Sopenharmony_ci			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
5488c2ecf20Sopenharmony_ci}
5498c2ecf20Sopenharmony_ci
5508c2ecf20Sopenharmony_civoid numa_remove_cpu(int cpu)
5518c2ecf20Sopenharmony_ci{
5528c2ecf20Sopenharmony_ci	int i;
5538c2ecf20Sopenharmony_ci
5548c2ecf20Sopenharmony_ci	for_each_online_node(i)
5558c2ecf20Sopenharmony_ci		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
5568c2ecf20Sopenharmony_ci}
5578c2ecf20Sopenharmony_ci#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
5588c2ecf20Sopenharmony_cistatic void numa_set_cpumask(int cpu, bool enable)
5598c2ecf20Sopenharmony_ci{
5608c2ecf20Sopenharmony_ci	int nid, physnid;
5618c2ecf20Sopenharmony_ci
5628c2ecf20Sopenharmony_ci	nid = early_cpu_to_node(cpu);
5638c2ecf20Sopenharmony_ci	if (nid == NUMA_NO_NODE) {
5648c2ecf20Sopenharmony_ci		/* early_cpu_to_node() already emits a warning and trace */
5658c2ecf20Sopenharmony_ci		return;
5668c2ecf20Sopenharmony_ci	}
5678c2ecf20Sopenharmony_ci
5688c2ecf20Sopenharmony_ci	physnid = emu_nid_to_phys[nid];
5698c2ecf20Sopenharmony_ci
5708c2ecf20Sopenharmony_ci	for_each_online_node(nid) {
5718c2ecf20Sopenharmony_ci		if (emu_nid_to_phys[nid] != physnid)
5728c2ecf20Sopenharmony_ci			continue;
5738c2ecf20Sopenharmony_ci
5748c2ecf20Sopenharmony_ci		debug_cpumask_set_cpu(cpu, nid, enable);
5758c2ecf20Sopenharmony_ci	}
5768c2ecf20Sopenharmony_ci}
5778c2ecf20Sopenharmony_ci
5788c2ecf20Sopenharmony_civoid numa_add_cpu(int cpu)
5798c2ecf20Sopenharmony_ci{
5808c2ecf20Sopenharmony_ci	numa_set_cpumask(cpu, true);
5818c2ecf20Sopenharmony_ci}
5828c2ecf20Sopenharmony_ci
5838c2ecf20Sopenharmony_civoid numa_remove_cpu(int cpu)
5848c2ecf20Sopenharmony_ci{
5858c2ecf20Sopenharmony_ci	numa_set_cpumask(cpu, false);
5868c2ecf20Sopenharmony_ci}
5878c2ecf20Sopenharmony_ci#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
588