18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
28c2ecf20Sopenharmony_ci/* Common code for 32 and 64-bit NUMA */
38c2ecf20Sopenharmony_ci#include <linux/acpi.h>
48c2ecf20Sopenharmony_ci#include <linux/kernel.h>
58c2ecf20Sopenharmony_ci#include <linux/mm.h>
68c2ecf20Sopenharmony_ci#include <linux/string.h>
78c2ecf20Sopenharmony_ci#include <linux/init.h>
88c2ecf20Sopenharmony_ci#include <linux/memblock.h>
98c2ecf20Sopenharmony_ci#include <linux/mmzone.h>
108c2ecf20Sopenharmony_ci#include <linux/ctype.h>
118c2ecf20Sopenharmony_ci#include <linux/nodemask.h>
128c2ecf20Sopenharmony_ci#include <linux/sched.h>
138c2ecf20Sopenharmony_ci#include <linux/topology.h>
148c2ecf20Sopenharmony_ci
158c2ecf20Sopenharmony_ci#include <asm/e820/api.h>
168c2ecf20Sopenharmony_ci#include <asm/proto.h>
178c2ecf20Sopenharmony_ci#include <asm/dma.h>
188c2ecf20Sopenharmony_ci#include <asm/amd_nb.h>
198c2ecf20Sopenharmony_ci
208c2ecf20Sopenharmony_ci#include "numa_internal.h"
218c2ecf20Sopenharmony_ci
/* Set by the "numa=off" boot parameter; non-zero disables NUMA handling. */
int numa_off;
/* Nodes discovered while parsing firmware tables; filled in by init_func. */
nodemask_t numa_nodes_parsed __initdata;

/* Per-node pglist_data pointers, indexed by node id; set in alloc_node_data(). */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

/*
 * Boot-time memory layout per node.  numa_reserved_meminfo collects
 * ranges moved out of numa_meminfo by numa_cleanup_meminfo() (ranges
 * not backed by memblock.memory or above max_pfn).
 */
static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

/* Square node-distance table: numa_distance_cnt^2 u8 entries, row-major. */
static int numa_distance_cnt;
static u8 *numa_distance;
338c2ecf20Sopenharmony_ci
348c2ecf20Sopenharmony_cistatic __init int numa_setup(char *opt)
358c2ecf20Sopenharmony_ci{
368c2ecf20Sopenharmony_ci	if (!opt)
378c2ecf20Sopenharmony_ci		return -EINVAL;
388c2ecf20Sopenharmony_ci	if (!strncmp(opt, "off", 3))
398c2ecf20Sopenharmony_ci		numa_off = 1;
408c2ecf20Sopenharmony_ci	if (!strncmp(opt, "fake=", 5))
418c2ecf20Sopenharmony_ci		return numa_emu_cmdline(opt + 5);
428c2ecf20Sopenharmony_ci	if (!strncmp(opt, "noacpi", 6))
438c2ecf20Sopenharmony_ci		disable_srat();
448c2ecf20Sopenharmony_ci	if (!strncmp(opt, "nohmat", 6))
458c2ecf20Sopenharmony_ci		disable_hmat();
468c2ecf20Sopenharmony_ci	return 0;
478c2ecf20Sopenharmony_ci}
488c2ecf20Sopenharmony_ciearly_param("numa", numa_setup);
498c2ecf20Sopenharmony_ci
/*
 * apicid, cpu, node mappings
 *
 * Local APIC id -> node id; every entry starts out as NUMA_NO_NODE
 * and is filled in as firmware tables are parsed.
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
568c2ecf20Sopenharmony_ci
578c2ecf20Sopenharmony_ciint numa_cpu_node(int cpu)
588c2ecf20Sopenharmony_ci{
598c2ecf20Sopenharmony_ci	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
608c2ecf20Sopenharmony_ci
618c2ecf20Sopenharmony_ci	if (apicid != BAD_APICID)
628c2ecf20Sopenharmony_ci		return __apicid_to_node[apicid];
638c2ecf20Sopenharmony_ci	return NUMA_NO_NODE;
648c2ecf20Sopenharmony_ci}
658c2ecf20Sopenharmony_ci
/* Per-node cpumasks; allocated in setup_node_to_cpumask_map(). */
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
748c2ecf20Sopenharmony_ci
758c2ecf20Sopenharmony_civoid numa_set_node(int cpu, int node)
768c2ecf20Sopenharmony_ci{
778c2ecf20Sopenharmony_ci	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
788c2ecf20Sopenharmony_ci
798c2ecf20Sopenharmony_ci	/* early setting, no percpu area yet */
808c2ecf20Sopenharmony_ci	if (cpu_to_node_map) {
818c2ecf20Sopenharmony_ci		cpu_to_node_map[cpu] = node;
828c2ecf20Sopenharmony_ci		return;
838c2ecf20Sopenharmony_ci	}
848c2ecf20Sopenharmony_ci
858c2ecf20Sopenharmony_ci#ifdef CONFIG_DEBUG_PER_CPU_MAPS
868c2ecf20Sopenharmony_ci	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
878c2ecf20Sopenharmony_ci		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
888c2ecf20Sopenharmony_ci		dump_stack();
898c2ecf20Sopenharmony_ci		return;
908c2ecf20Sopenharmony_ci	}
918c2ecf20Sopenharmony_ci#endif
928c2ecf20Sopenharmony_ci	per_cpu(x86_cpu_to_node_map, cpu) = node;
938c2ecf20Sopenharmony_ci
948c2ecf20Sopenharmony_ci	set_cpu_numa_node(cpu, node);
958c2ecf20Sopenharmony_ci}
968c2ecf20Sopenharmony_ci
/* Forget the node association of @cpu. */
void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}
1018c2ecf20Sopenharmony_ci
1028c2ecf20Sopenharmony_ci/*
1038c2ecf20Sopenharmony_ci * Allocate node_to_cpumask_map based on number of available nodes
1048c2ecf20Sopenharmony_ci * Requires node_possible_map to be valid.
1058c2ecf20Sopenharmony_ci *
1068c2ecf20Sopenharmony_ci * Note: cpumask_of_node() is not valid until after this is done.
1078c2ecf20Sopenharmony_ci * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
1088c2ecf20Sopenharmony_ci */
1098c2ecf20Sopenharmony_civoid __init setup_node_to_cpumask_map(void)
1108c2ecf20Sopenharmony_ci{
1118c2ecf20Sopenharmony_ci	unsigned int node;
1128c2ecf20Sopenharmony_ci
1138c2ecf20Sopenharmony_ci	/* setup nr_node_ids if not done yet */
1148c2ecf20Sopenharmony_ci	if (nr_node_ids == MAX_NUMNODES)
1158c2ecf20Sopenharmony_ci		setup_nr_node_ids();
1168c2ecf20Sopenharmony_ci
1178c2ecf20Sopenharmony_ci	/* allocate the map */
1188c2ecf20Sopenharmony_ci	for (node = 0; node < nr_node_ids; node++)
1198c2ecf20Sopenharmony_ci		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
1208c2ecf20Sopenharmony_ci
1218c2ecf20Sopenharmony_ci	/* cpumask_of_node() will now work */
1228c2ecf20Sopenharmony_ci	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
1238c2ecf20Sopenharmony_ci}
1248c2ecf20Sopenharmony_ci
1258c2ecf20Sopenharmony_cistatic int __init numa_add_memblk_to(int nid, u64 start, u64 end,
1268c2ecf20Sopenharmony_ci				     struct numa_meminfo *mi)
1278c2ecf20Sopenharmony_ci{
1288c2ecf20Sopenharmony_ci	/* ignore zero length blks */
1298c2ecf20Sopenharmony_ci	if (start == end)
1308c2ecf20Sopenharmony_ci		return 0;
1318c2ecf20Sopenharmony_ci
1328c2ecf20Sopenharmony_ci	/* whine about and ignore invalid blks */
1338c2ecf20Sopenharmony_ci	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
1348c2ecf20Sopenharmony_ci		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
1358c2ecf20Sopenharmony_ci			nid, start, end - 1);
1368c2ecf20Sopenharmony_ci		return 0;
1378c2ecf20Sopenharmony_ci	}
1388c2ecf20Sopenharmony_ci
1398c2ecf20Sopenharmony_ci	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
1408c2ecf20Sopenharmony_ci		pr_err("too many memblk ranges\n");
1418c2ecf20Sopenharmony_ci		return -EINVAL;
1428c2ecf20Sopenharmony_ci	}
1438c2ecf20Sopenharmony_ci
1448c2ecf20Sopenharmony_ci	mi->blk[mi->nr_blks].start = start;
1458c2ecf20Sopenharmony_ci	mi->blk[mi->nr_blks].end = end;
1468c2ecf20Sopenharmony_ci	mi->blk[mi->nr_blks].nid = nid;
1478c2ecf20Sopenharmony_ci	mi->nr_blks++;
1488c2ecf20Sopenharmony_ci	return 0;
1498c2ecf20Sopenharmony_ci}
1508c2ecf20Sopenharmony_ci
1518c2ecf20Sopenharmony_ci/**
1528c2ecf20Sopenharmony_ci * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
1538c2ecf20Sopenharmony_ci * @idx: Index of memblk to remove
1548c2ecf20Sopenharmony_ci * @mi: numa_meminfo to remove memblk from
1558c2ecf20Sopenharmony_ci *
1568c2ecf20Sopenharmony_ci * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
1578c2ecf20Sopenharmony_ci * decrementing @mi->nr_blks.
1588c2ecf20Sopenharmony_ci */
1598c2ecf20Sopenharmony_civoid __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
1608c2ecf20Sopenharmony_ci{
1618c2ecf20Sopenharmony_ci	mi->nr_blks--;
1628c2ecf20Sopenharmony_ci	memmove(&mi->blk[idx], &mi->blk[idx + 1],
1638c2ecf20Sopenharmony_ci		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
1648c2ecf20Sopenharmony_ci}
1658c2ecf20Sopenharmony_ci
1668c2ecf20Sopenharmony_ci/**
1678c2ecf20Sopenharmony_ci * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
1688c2ecf20Sopenharmony_ci * @dst: numa_meminfo to append block to
1698c2ecf20Sopenharmony_ci * @idx: Index of memblk to remove
1708c2ecf20Sopenharmony_ci * @src: numa_meminfo to remove memblk from
1718c2ecf20Sopenharmony_ci */
1728c2ecf20Sopenharmony_cistatic void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
1738c2ecf20Sopenharmony_ci					 struct numa_meminfo *src)
1748c2ecf20Sopenharmony_ci{
1758c2ecf20Sopenharmony_ci	dst->blk[dst->nr_blks++] = src->blk[idx];
1768c2ecf20Sopenharmony_ci	numa_remove_memblk_from(idx, src);
1778c2ecf20Sopenharmony_ci}
1788c2ecf20Sopenharmony_ci
/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	/* thin wrapper; validation is done in numa_add_memblk_to() */
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}
1948c2ecf20Sopenharmony_ci
/* Allocate NODE_DATA for a node on the local memory */
static void __init alloc_node_data(int nid)
{
	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	u64 nd_pa;
	void *nd;
	int tnid;

	/*
	 * Allocate node data.  Try node-local memory and then any node.
	 * Never allocate in DMA zone.
	 */
	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa) {
		/* node stays offline; callers must tolerate a NULL node_data[nid] */
		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
		       nd_size, nid);
		return;
	}
	nd = __va(nd_pa);

	/* report and initialize */
	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
	       nd_pa, nd_pa + nd_size - 1);
	/* the node that actually provided the memory may differ from @nid */
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));

	node_set_online(nid);
}
2278c2ecf20Sopenharmony_ci
/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks.  Also check for
 * conflicts and clear unused memblks.
 *
 * Ranges not backed by memblock.memory, and the portions of ranges above
 * max_pfn, are moved into numa_reserved_meminfo rather than dropped.
 *
 * RETURNS:
 * 0 on success, -errno on failure (overlap between different nodes).
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = 0;
	const u64 high = PFN_PHYS(max_pfn);
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					bi->start, bi->end - bi->start)) {
			/* i-- so the entry shifted into slot i is revisited */
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and there's no empty block */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks.  Whine
			 * about but allow overlaps of the same nid.  They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			/* reject the merge if the joined span would cover another node */
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
			       bi->nid, bi->start, bi->end - 1, bj->start,
			       bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			/* j-- so the entry shifted into slot j is revisited */
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}
3318c2ecf20Sopenharmony_ci
3328c2ecf20Sopenharmony_ci/*
3338c2ecf20Sopenharmony_ci * Set nodes, which have memory in @mi, in *@nodemask.
3348c2ecf20Sopenharmony_ci */
3358c2ecf20Sopenharmony_cistatic void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
3368c2ecf20Sopenharmony_ci					      const struct numa_meminfo *mi)
3378c2ecf20Sopenharmony_ci{
3388c2ecf20Sopenharmony_ci	int i;
3398c2ecf20Sopenharmony_ci
3408c2ecf20Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
3418c2ecf20Sopenharmony_ci		if (mi->blk[i].start != mi->blk[i].end &&
3428c2ecf20Sopenharmony_ci		    mi->blk[i].nid != NUMA_NO_NODE)
3438c2ecf20Sopenharmony_ci			node_set(mi->blk[i].nid, *nodemask);
3448c2ecf20Sopenharmony_ci}
3458c2ecf20Sopenharmony_ci
3468c2ecf20Sopenharmony_ci/**
3478c2ecf20Sopenharmony_ci * numa_reset_distance - Reset NUMA distance table
3488c2ecf20Sopenharmony_ci *
3498c2ecf20Sopenharmony_ci * The current table is freed.  The next numa_set_distance() call will
3508c2ecf20Sopenharmony_ci * create a new one.
3518c2ecf20Sopenharmony_ci */
3528c2ecf20Sopenharmony_civoid __init numa_reset_distance(void)
3538c2ecf20Sopenharmony_ci{
3548c2ecf20Sopenharmony_ci	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
3558c2ecf20Sopenharmony_ci
3568c2ecf20Sopenharmony_ci	/* numa_distance could be 1LU marking allocation failure, test cnt */
3578c2ecf20Sopenharmony_ci	if (numa_distance_cnt)
3588c2ecf20Sopenharmony_ci		memblock_free(__pa(numa_distance), size);
3598c2ecf20Sopenharmony_ci	numa_distance_cnt = 0;
3608c2ecf20Sopenharmony_ci	numa_distance = NULL;	/* enable table creation */
3618c2ecf20Sopenharmony_ci}
3628c2ecf20Sopenharmony_ci
3638c2ecf20Sopenharmony_cistatic int __init numa_alloc_distance(void)
3648c2ecf20Sopenharmony_ci{
3658c2ecf20Sopenharmony_ci	nodemask_t nodes_parsed;
3668c2ecf20Sopenharmony_ci	size_t size;
3678c2ecf20Sopenharmony_ci	int i, j, cnt = 0;
3688c2ecf20Sopenharmony_ci	u64 phys;
3698c2ecf20Sopenharmony_ci
3708c2ecf20Sopenharmony_ci	/* size the new table and allocate it */
3718c2ecf20Sopenharmony_ci	nodes_parsed = numa_nodes_parsed;
3728c2ecf20Sopenharmony_ci	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
3738c2ecf20Sopenharmony_ci
3748c2ecf20Sopenharmony_ci	for_each_node_mask(i, nodes_parsed)
3758c2ecf20Sopenharmony_ci		cnt = i;
3768c2ecf20Sopenharmony_ci	cnt++;
3778c2ecf20Sopenharmony_ci	size = cnt * cnt * sizeof(numa_distance[0]);
3788c2ecf20Sopenharmony_ci
3798c2ecf20Sopenharmony_ci	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
3808c2ecf20Sopenharmony_ci				      size, PAGE_SIZE);
3818c2ecf20Sopenharmony_ci	if (!phys) {
3828c2ecf20Sopenharmony_ci		pr_warn("Warning: can't allocate distance table!\n");
3838c2ecf20Sopenharmony_ci		/* don't retry until explicitly reset */
3848c2ecf20Sopenharmony_ci		numa_distance = (void *)1LU;
3858c2ecf20Sopenharmony_ci		return -ENOMEM;
3868c2ecf20Sopenharmony_ci	}
3878c2ecf20Sopenharmony_ci	memblock_reserve(phys, size);
3888c2ecf20Sopenharmony_ci
3898c2ecf20Sopenharmony_ci	numa_distance = __va(phys);
3908c2ecf20Sopenharmony_ci	numa_distance_cnt = cnt;
3918c2ecf20Sopenharmony_ci
3928c2ecf20Sopenharmony_ci	/* fill with the default distances */
3938c2ecf20Sopenharmony_ci	for (i = 0; i < cnt; i++)
3948c2ecf20Sopenharmony_ci		for (j = 0; j < cnt; j++)
3958c2ecf20Sopenharmony_ci			numa_distance[i * cnt + j] = i == j ?
3968c2ecf20Sopenharmony_ci				LOCAL_DISTANCE : REMOTE_DISTANCE;
3978c2ecf20Sopenharmony_ci	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
3988c2ecf20Sopenharmony_ci
3998c2ecf20Sopenharmony_ci	return 0;
4008c2ecf20Sopenharmony_ci}
4018c2ecf20Sopenharmony_ci
4028c2ecf20Sopenharmony_ci/**
4038c2ecf20Sopenharmony_ci * numa_set_distance - Set NUMA distance from one NUMA to another
4048c2ecf20Sopenharmony_ci * @from: the 'from' node to set distance
4058c2ecf20Sopenharmony_ci * @to: the 'to'  node to set distance
4068c2ecf20Sopenharmony_ci * @distance: NUMA distance
4078c2ecf20Sopenharmony_ci *
4088c2ecf20Sopenharmony_ci * Set the distance from node @from to @to to @distance.  If distance table
4098c2ecf20Sopenharmony_ci * doesn't exist, one which is large enough to accommodate all the currently
4108c2ecf20Sopenharmony_ci * known nodes will be created.
4118c2ecf20Sopenharmony_ci *
4128c2ecf20Sopenharmony_ci * If such table cannot be allocated, a warning is printed and further
4138c2ecf20Sopenharmony_ci * calls are ignored until the distance table is reset with
4148c2ecf20Sopenharmony_ci * numa_reset_distance().
4158c2ecf20Sopenharmony_ci *
4168c2ecf20Sopenharmony_ci * If @from or @to is higher than the highest known node or lower than zero
4178c2ecf20Sopenharmony_ci * at the time of table creation or @distance doesn't make sense, the call
4188c2ecf20Sopenharmony_ci * is ignored.
4198c2ecf20Sopenharmony_ci * This is to allow simplification of specific NUMA config implementations.
4208c2ecf20Sopenharmony_ci */
4218c2ecf20Sopenharmony_civoid __init numa_set_distance(int from, int to, int distance)
4228c2ecf20Sopenharmony_ci{
4238c2ecf20Sopenharmony_ci	if (!numa_distance && numa_alloc_distance() < 0)
4248c2ecf20Sopenharmony_ci		return;
4258c2ecf20Sopenharmony_ci
4268c2ecf20Sopenharmony_ci	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
4278c2ecf20Sopenharmony_ci			from < 0 || to < 0) {
4288c2ecf20Sopenharmony_ci		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
4298c2ecf20Sopenharmony_ci			     from, to, distance);
4308c2ecf20Sopenharmony_ci		return;
4318c2ecf20Sopenharmony_ci	}
4328c2ecf20Sopenharmony_ci
4338c2ecf20Sopenharmony_ci	if ((u8)distance != distance ||
4348c2ecf20Sopenharmony_ci	    (from == to && distance != LOCAL_DISTANCE)) {
4358c2ecf20Sopenharmony_ci		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
4368c2ecf20Sopenharmony_ci			     from, to, distance);
4378c2ecf20Sopenharmony_ci		return;
4388c2ecf20Sopenharmony_ci	}
4398c2ecf20Sopenharmony_ci
4408c2ecf20Sopenharmony_ci	numa_distance[from * numa_distance_cnt + to] = distance;
4418c2ecf20Sopenharmony_ci}
4428c2ecf20Sopenharmony_ci
4438c2ecf20Sopenharmony_ciint __node_distance(int from, int to)
4448c2ecf20Sopenharmony_ci{
4458c2ecf20Sopenharmony_ci	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
4468c2ecf20Sopenharmony_ci		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
4478c2ecf20Sopenharmony_ci	return numa_distance[from * numa_distance_cnt + to];
4488c2ecf20Sopenharmony_ci}
4498c2ecf20Sopenharmony_ciEXPORT_SYMBOL(__node_distance);
4508c2ecf20Sopenharmony_ci
/*
 * Sanity check to catch more bad NUMA configurations (they are amazingly
 * common).  Make sure the nodes cover all memory.
 *
 * Returns true when the pages covered by @mi are within 1MB of the
 * e820-reported RAM, false (reject the config) otherwise.
 */
static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
{
	u64 numaram, e820ram;
	int i;

	numaram = 0;
	for (i = 0; i < mi->nr_blks; i++) {
		u64 s = mi->blk[i].start >> PAGE_SHIFT;
		u64 e = mi->blk[i].end >> PAGE_SHIFT;
		numaram += e - s;
		numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
		/* subtraction may wrap the unsigned total; clamp to zero */
		if ((s64)numaram < 0)
			numaram = 0;
	}

	e820ram = max_pfn - absent_pages_in_range(0, max_pfn);

	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
	if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
		printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
		       (numaram << PAGE_SHIFT) >> 20,
		       (e820ram << PAGE_SHIFT) >> 20);
		return false;
	}
	return true;
}
4818c2ecf20Sopenharmony_ci
/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as hot-unswappable.
 *
 * Works in three passes: (1) push node ids from numa_meminfo into
 * memblock.reserved, (2) collect the set of nodes holding reserved
 * (kernel) memory, (3) clear MEMBLOCK_HOTPLUG on all memory of those
 * nodes.
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
	nodemask_t reserved_nodemask = NODE_MASK_NONE;
	struct memblock_region *mb_region;
	int i;

	/*
	 * We have to do some preprocessing of memblock regions, to
	 * make them suitable for reservation.
	 *
	 * At this time, all memory regions reserved by memblock are
	 * used by the kernel, but those regions are not split up
	 * along node boundaries yet, and don't necessarily have their
	 * node ID set yet either.
	 *
	 * So iterate over all memory known to the x86 architecture,
	 * and use those ranges to set the nid in memblock.reserved.
	 * This will split up the memblock regions along node
	 * boundaries and will set the node IDs as well.
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;
		int ret;

		ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
		WARN_ON_ONCE(ret);
	}

	/*
	 * Now go over all reserved memblock regions, to construct a
	 * node mask of all kernel reserved memory areas.
	 *
	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
	 *   numa_meminfo might not include all memblock.reserved
	 *   memory ranges, because quirks such as trim_snb_memory()
	 *   reserve specific pages for Sandy Bridge graphics. ]
	 */
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		/* MAX_NUMNODES means the region never got a node id assigned */
		if (nid != MAX_NUMNODES)
			node_set(nid, reserved_nodemask);
	}

	/*
	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
	 * belonging to the reserved node mask.
	 *
	 * Note that this will include memory regions that reside
	 * on nodes that contain kernel memory - entire nodes
	 * become hot-unpluggable:
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (!node_isset(mb->nid, reserved_nodemask))
			continue;

		memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}
}
5478c2ecf20Sopenharmony_ci
/*
 * Validate @mi, push its node ids into memblock, and allocate NODE_DATA
 * for every node that ends up with memory.  Returns 0 on success or
 * -EINVAL when the NUMA config is rejected.
 */
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
	int i, nid;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *mb = &mi->blk[i];
		memblock_set_node(mb->start, mb->end - mb->start,
				  &memblock.memory, mb->nid);
	}

	/*
	 * At very early time, the kernel have to use some memory such as
	 * loading the kernel image. We cannot prevent this anyway. So any
	 * node the kernel resides in should be un-hotpluggable.
	 *
	 * And when we come here, alloc node data won't fail.
	 */
	numa_clear_kernel_node_hotplug();

	/*
	 * If sections array is gonna be used for pfn -> nid mapping, check
	 * whether its granularity is fine enough.
	 *
	 * NOTE(review): IS_ENABLED() is meant for CONFIG_* macros defined
	 * to 1; verify NODE_NOT_IN_PAGE_FLAGS is defined with value 1 in
	 * this tree, otherwise this branch is silently never taken.
	 */
	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
		unsigned long pfn_align = node_map_pfn_alignment();

		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
			pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
				PFN_PHYS(pfn_align) >> 20,
				PFN_PHYS(PAGES_PER_SECTION) >> 20);
			return -EINVAL;
		}
	}
	if (!numa_meminfo_cover_memory(mi))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		u64 start = PFN_PHYS(max_pfn);
		u64 end = 0;

		/* compute the span of this node's memory */
		for (i = 0; i < mi->nr_blks; i++) {
			if (nid != mi->blk[i].nid)
				continue;
			start = min(mi->blk[i].start, start);
			end = max(mi->blk[i].end, end);
		}

		/* nodes with cpus but no memory get no NODE_DATA here */
		if (start >= end)
			continue;

		alloc_node_data(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}
6128c2ecf20Sopenharmony_ci
6138c2ecf20Sopenharmony_ci/*
6148c2ecf20Sopenharmony_ci * There are unfortunately some poorly designed mainboards around that
6158c2ecf20Sopenharmony_ci * only connect memory to a single CPU. This breaks the 1:1 cpu->node
6168c2ecf20Sopenharmony_ci * mapping. To avoid this fill in the mapping for all possible CPUs,
6178c2ecf20Sopenharmony_ci * as the number of CPUs is not known yet. We round robin the existing
6188c2ecf20Sopenharmony_ci * nodes.
6198c2ecf20Sopenharmony_ci */
6208c2ecf20Sopenharmony_cistatic void __init numa_init_array(void)
6218c2ecf20Sopenharmony_ci{
6228c2ecf20Sopenharmony_ci	int rr, i;
6238c2ecf20Sopenharmony_ci
6248c2ecf20Sopenharmony_ci	rr = first_node(node_online_map);
6258c2ecf20Sopenharmony_ci	for (i = 0; i < nr_cpu_ids; i++) {
6268c2ecf20Sopenharmony_ci		if (early_cpu_to_node(i) != NUMA_NO_NODE)
6278c2ecf20Sopenharmony_ci			continue;
6288c2ecf20Sopenharmony_ci		numa_set_node(i, rr);
6298c2ecf20Sopenharmony_ci		rr = next_node_in(rr, node_online_map);
6308c2ecf20Sopenharmony_ci	}
6318c2ecf20Sopenharmony_ci}
6328c2ecf20Sopenharmony_ci
/*
 * numa_init - Run one NUMA initialization method
 * @init_func: parser that populates numa_nodes_parsed and numa_meminfo
 *             (x86_acpi_numa_init, amd_numa_init or dummy_numa_init;
 *             see x86_numa_init()).
 *
 * Resets all NUMA state, runs @init_func, then cleans up and registers
 * the resulting memory configuration.  Returns 0 on success or a
 * negative error code, in which case the caller may try another method.
 */
static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	/* Invalidate the APIC-id -> node table before (re)parsing. */
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	/*
	 * Clear every piece of NUMA state so that nothing stale survives
	 * from a previously failed init method.
	 */
	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
				  MAX_NUMNODES));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
				  MAX_NUMNODES));
	/* In case parsing SRAT failed on an earlier attempt. */
	WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * We reset memblock back to the top-down direction
	 * here because if we configured ACPI_NUMA, we have
	 * parsed SRAT in init_func(). It is ok to have the
	 * reset here even if we didn't configure ACPI_NUMA
	 * or acpi numa init fails and falls back to dummy
	 * numa init.
	 */
	memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	ret = numa_register_memblks(&numa_meminfo);
	if (ret < 0)
		return ret;

	/* Drop CPU mappings that point at nodes which did not come online. */
	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}
6898c2ecf20Sopenharmony_ci
6908c2ecf20Sopenharmony_ci/**
6918c2ecf20Sopenharmony_ci * dummy_numa_init - Fallback dummy NUMA init
6928c2ecf20Sopenharmony_ci *
6938c2ecf20Sopenharmony_ci * Used if there's no underlying NUMA architecture, NUMA initialization
6948c2ecf20Sopenharmony_ci * fails, or NUMA is disabled on the command line.
6958c2ecf20Sopenharmony_ci *
6968c2ecf20Sopenharmony_ci * Must online at least one node and add memory blocks that cover all
6978c2ecf20Sopenharmony_ci * allowed memory.  This function must not fail.
6988c2ecf20Sopenharmony_ci */
6998c2ecf20Sopenharmony_cistatic int __init dummy_numa_init(void)
7008c2ecf20Sopenharmony_ci{
7018c2ecf20Sopenharmony_ci	printk(KERN_INFO "%s\n",
7028c2ecf20Sopenharmony_ci	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
7038c2ecf20Sopenharmony_ci	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
7048c2ecf20Sopenharmony_ci	       0LLU, PFN_PHYS(max_pfn) - 1);
7058c2ecf20Sopenharmony_ci
7068c2ecf20Sopenharmony_ci	node_set(0, numa_nodes_parsed);
7078c2ecf20Sopenharmony_ci	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
7088c2ecf20Sopenharmony_ci
7098c2ecf20Sopenharmony_ci	return 0;
7108c2ecf20Sopenharmony_ci}
7118c2ecf20Sopenharmony_ci
/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds.  The
 * last fallback is dummy single node config encompassing whole memory and
 * never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		/* Firmware-described topology (ACPI tables). */
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		/* AMD-specific detection (see amd_numa_init()). */
		if (!numa_init(amd_numa_init))
			return;
#endif
	}

	/* Single-node fallback; dummy_numa_init() cannot fail. */
	numa_init(dummy_numa_init);
}
7348c2ecf20Sopenharmony_ci
/*
 * Bring a memoryless node online: allocate its pglist_data and run the
 * memoryless variant of free_area_init so the node is usable before
 * zonelists are built.
 */
static void __init init_memory_less_node(int nid)
{
	/* Allocate and initialize node data. Memory-less node is now online.*/
	alloc_node_data(nid);
	free_area_init_memoryless_node(nid);

	/*
	 * All zonelists will be built later in start_kernel() after per cpu
	 * areas are initialized.
	 */
}
7468c2ecf20Sopenharmony_ci
7478c2ecf20Sopenharmony_ci/*
7488c2ecf20Sopenharmony_ci * A node may exist which has one or more Generic Initiators but no CPUs and no
7498c2ecf20Sopenharmony_ci * memory.
7508c2ecf20Sopenharmony_ci *
7518c2ecf20Sopenharmony_ci * This function must be called after init_cpu_to_node(), to ensure that any
7528c2ecf20Sopenharmony_ci * memoryless CPU nodes have already been brought online, and before the
7538c2ecf20Sopenharmony_ci * node_data[nid] is needed for zone list setup in build_all_zonelists().
7548c2ecf20Sopenharmony_ci *
7558c2ecf20Sopenharmony_ci * When this function is called, any nodes containing either memory and/or CPUs
7568c2ecf20Sopenharmony_ci * will already be online and there is no need to do anything extra, even if
7578c2ecf20Sopenharmony_ci * they also contain one or more Generic Initiators.
7588c2ecf20Sopenharmony_ci */
7598c2ecf20Sopenharmony_civoid __init init_gi_nodes(void)
7608c2ecf20Sopenharmony_ci{
7618c2ecf20Sopenharmony_ci	int nid;
7628c2ecf20Sopenharmony_ci
7638c2ecf20Sopenharmony_ci	for_each_node_state(nid, N_GENERIC_INITIATOR)
7648c2ecf20Sopenharmony_ci		if (!node_online(nid))
7658c2ecf20Sopenharmony_ci			init_memory_less_node(nid);
7668c2ecf20Sopenharmony_ci}
7678c2ecf20Sopenharmony_ci
/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
 * and apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and faking node case (when running a kernel compiled
 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round robin manner at numa_init_array,
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 *
 * Called before the per_cpu areas are setup.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	/* The early CPU -> APIC id table must exist at this point. */
	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		/* No valid mapping for this CPU — see comment above. */
		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * A CPU may live on a memoryless node that
		 * numa_register_memblks() never onlined; set it up now so
		 * the node is usable before numa_set_node() records it.
		 */
		if (!node_online(node))
			init_memory_less_node(node);

		numa_set_node(cpu, node);
	}
}
8018c2ecf20Sopenharmony_ci
8028c2ecf20Sopenharmony_ci#ifndef CONFIG_DEBUG_PER_CPU_MAPS
8038c2ecf20Sopenharmony_ci
8048c2ecf20Sopenharmony_ci# ifndef CONFIG_NUMA_EMU
8058c2ecf20Sopenharmony_civoid numa_add_cpu(int cpu)
8068c2ecf20Sopenharmony_ci{
8078c2ecf20Sopenharmony_ci	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
8088c2ecf20Sopenharmony_ci}
8098c2ecf20Sopenharmony_ci
8108c2ecf20Sopenharmony_civoid numa_remove_cpu(int cpu)
8118c2ecf20Sopenharmony_ci{
8128c2ecf20Sopenharmony_ci	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
8138c2ecf20Sopenharmony_ci}
8148c2ecf20Sopenharmony_ci# endif	/* !CONFIG_NUMA_EMU */
8158c2ecf20Sopenharmony_ci
8168c2ecf20Sopenharmony_ci#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
8178c2ecf20Sopenharmony_ci
8188c2ecf20Sopenharmony_ciint __cpu_to_node(int cpu)
8198c2ecf20Sopenharmony_ci{
8208c2ecf20Sopenharmony_ci	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
8218c2ecf20Sopenharmony_ci		printk(KERN_WARNING
8228c2ecf20Sopenharmony_ci			"cpu_to_node(%d): usage too early!\n", cpu);
8238c2ecf20Sopenharmony_ci		dump_stack();
8248c2ecf20Sopenharmony_ci		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
8258c2ecf20Sopenharmony_ci	}
8268c2ecf20Sopenharmony_ci	return per_cpu(x86_cpu_to_node_map, cpu);
8278c2ecf20Sopenharmony_ci}
8288c2ecf20Sopenharmony_ciEXPORT_SYMBOL(__cpu_to_node);
8298c2ecf20Sopenharmony_ci
8308c2ecf20Sopenharmony_ci/*
8318c2ecf20Sopenharmony_ci * Same function as cpu_to_node() but used if called before the
8328c2ecf20Sopenharmony_ci * per_cpu areas are setup.
8338c2ecf20Sopenharmony_ci */
8348c2ecf20Sopenharmony_ciint early_cpu_to_node(int cpu)
8358c2ecf20Sopenharmony_ci{
8368c2ecf20Sopenharmony_ci	if (early_per_cpu_ptr(x86_cpu_to_node_map))
8378c2ecf20Sopenharmony_ci		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
8388c2ecf20Sopenharmony_ci
8398c2ecf20Sopenharmony_ci	if (!cpu_possible(cpu)) {
8408c2ecf20Sopenharmony_ci		printk(KERN_WARNING
8418c2ecf20Sopenharmony_ci			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
8428c2ecf20Sopenharmony_ci		dump_stack();
8438c2ecf20Sopenharmony_ci		return NUMA_NO_NODE;
8448c2ecf20Sopenharmony_ci	}
8458c2ecf20Sopenharmony_ci	return per_cpu(x86_cpu_to_node_map, cpu);
8468c2ecf20Sopenharmony_ci}
8478c2ecf20Sopenharmony_ci
8488c2ecf20Sopenharmony_civoid debug_cpumask_set_cpu(int cpu, int node, bool enable)
8498c2ecf20Sopenharmony_ci{
8508c2ecf20Sopenharmony_ci	struct cpumask *mask;
8518c2ecf20Sopenharmony_ci
8528c2ecf20Sopenharmony_ci	if (node == NUMA_NO_NODE) {
8538c2ecf20Sopenharmony_ci		/* early_cpu_to_node() already emits a warning and trace */
8548c2ecf20Sopenharmony_ci		return;
8558c2ecf20Sopenharmony_ci	}
8568c2ecf20Sopenharmony_ci	mask = node_to_cpumask_map[node];
8578c2ecf20Sopenharmony_ci	if (!cpumask_available(mask)) {
8588c2ecf20Sopenharmony_ci		pr_err("node_to_cpumask_map[%i] NULL\n", node);
8598c2ecf20Sopenharmony_ci		dump_stack();
8608c2ecf20Sopenharmony_ci		return;
8618c2ecf20Sopenharmony_ci	}
8628c2ecf20Sopenharmony_ci
8638c2ecf20Sopenharmony_ci	if (enable)
8648c2ecf20Sopenharmony_ci		cpumask_set_cpu(cpu, mask);
8658c2ecf20Sopenharmony_ci	else
8668c2ecf20Sopenharmony_ci		cpumask_clear_cpu(cpu, mask);
8678c2ecf20Sopenharmony_ci
8688c2ecf20Sopenharmony_ci	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
8698c2ecf20Sopenharmony_ci		enable ? "numa_add_cpu" : "numa_remove_cpu",
8708c2ecf20Sopenharmony_ci		cpu, node, cpumask_pr_args(mask));
8718c2ecf20Sopenharmony_ci	return;
8728c2ecf20Sopenharmony_ci}
8738c2ecf20Sopenharmony_ci
8748c2ecf20Sopenharmony_ci# ifndef CONFIG_NUMA_EMU
/*
 * Route node cpumask updates through debug_cpumask_set_cpu() so that bad
 * node IDs and missing cpumasks are reported rather than silently
 * corrupting memory.
 */
static void numa_set_cpumask(int cpu, bool enable)
{
	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

/* Add @cpu to its node's cpumask (checked variant). */
void numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, true);
}

/* Remove @cpu from its node's cpumask (checked variant). */
void numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, false);
}
8898c2ecf20Sopenharmony_ci# endif	/* !CONFIG_NUMA_EMU */
8908c2ecf20Sopenharmony_ci
8918c2ecf20Sopenharmony_ci/*
8928c2ecf20Sopenharmony_ci * Returns a pointer to the bitmask of CPUs on Node 'node'.
8938c2ecf20Sopenharmony_ci */
8948c2ecf20Sopenharmony_ciconst struct cpumask *cpumask_of_node(int node)
8958c2ecf20Sopenharmony_ci{
8968c2ecf20Sopenharmony_ci	if ((unsigned)node >= nr_node_ids) {
8978c2ecf20Sopenharmony_ci		printk(KERN_WARNING
8988c2ecf20Sopenharmony_ci			"cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
8998c2ecf20Sopenharmony_ci			node, nr_node_ids);
9008c2ecf20Sopenharmony_ci		dump_stack();
9018c2ecf20Sopenharmony_ci		return cpu_none_mask;
9028c2ecf20Sopenharmony_ci	}
9038c2ecf20Sopenharmony_ci	if (!cpumask_available(node_to_cpumask_map[node])) {
9048c2ecf20Sopenharmony_ci		printk(KERN_WARNING
9058c2ecf20Sopenharmony_ci			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
9068c2ecf20Sopenharmony_ci			node);
9078c2ecf20Sopenharmony_ci		dump_stack();
9088c2ecf20Sopenharmony_ci		return cpu_online_mask;
9098c2ecf20Sopenharmony_ci	}
9108c2ecf20Sopenharmony_ci	return node_to_cpumask_map[node];
9118c2ecf20Sopenharmony_ci}
9128c2ecf20Sopenharmony_ciEXPORT_SYMBOL(cpumask_of_node);
9138c2ecf20Sopenharmony_ci
9148c2ecf20Sopenharmony_ci#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
9158c2ecf20Sopenharmony_ci
9168c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA_KEEP_MEMINFO
9178c2ecf20Sopenharmony_cistatic int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
9188c2ecf20Sopenharmony_ci{
9198c2ecf20Sopenharmony_ci	int i;
9208c2ecf20Sopenharmony_ci
9218c2ecf20Sopenharmony_ci	for (i = 0; i < mi->nr_blks; i++)
9228c2ecf20Sopenharmony_ci		if (mi->blk[i].start <= start && mi->blk[i].end > start)
9238c2ecf20Sopenharmony_ci			return mi->blk[i].nid;
9248c2ecf20Sopenharmony_ci	return NUMA_NO_NODE;
9258c2ecf20Sopenharmony_ci}
9268c2ecf20Sopenharmony_ci
9278c2ecf20Sopenharmony_ciint phys_to_target_node(phys_addr_t start)
9288c2ecf20Sopenharmony_ci{
9298c2ecf20Sopenharmony_ci	int nid = meminfo_to_nid(&numa_meminfo, start);
9308c2ecf20Sopenharmony_ci
9318c2ecf20Sopenharmony_ci	/*
9328c2ecf20Sopenharmony_ci	 * Prefer online nodes, but if reserved memory might be
9338c2ecf20Sopenharmony_ci	 * hot-added continue the search with reserved ranges.
9348c2ecf20Sopenharmony_ci	 */
9358c2ecf20Sopenharmony_ci	if (nid != NUMA_NO_NODE)
9368c2ecf20Sopenharmony_ci		return nid;
9378c2ecf20Sopenharmony_ci
9388c2ecf20Sopenharmony_ci	return meminfo_to_nid(&numa_reserved_meminfo, start);
9398c2ecf20Sopenharmony_ci}
9408c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(phys_to_target_node);
9418c2ecf20Sopenharmony_ci
9428c2ecf20Sopenharmony_ciint memory_add_physaddr_to_nid(u64 start)
9438c2ecf20Sopenharmony_ci{
9448c2ecf20Sopenharmony_ci	int nid = meminfo_to_nid(&numa_meminfo, start);
9458c2ecf20Sopenharmony_ci
9468c2ecf20Sopenharmony_ci	if (nid == NUMA_NO_NODE)
9478c2ecf20Sopenharmony_ci		nid = numa_meminfo.blk[0].nid;
9488c2ecf20Sopenharmony_ci	return nid;
9498c2ecf20Sopenharmony_ci}
9508c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
9518c2ecf20Sopenharmony_ci#endif
952