// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/amd_nb.h>

#include "numa_internal.h"

int numa_off;
nodemask_t numa_nodes_parsed __initdata;

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

/* Parsed memory layout; reserved ranges are split off into the second table. */
static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

static int numa_distance_cnt;
static u8 *numa_distance;

/*
 * Parse the "numa=" early boot parameter.  Recognized options:
 * "off", "fake=<spec>", "noacpi" and "nohmat"; unknown strings are
 * silently ignored (return 0).
 */
static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);
	if (!strncmp(opt, "noacpi", 6))
		disable_srat();
	if (!strncmp(opt, "nohmat", 6))
		disable_hmat();
	return 0;
}
early_param("numa", numa_setup);

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

/* Return the NUMA node of @cpu via its APIC ID, or NUMA_NO_NODE. */
int numa_cpu_node(int cpu)
{
	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

/*
 * Record that @cpu belongs to @node in the cpu->node map.  Works both
 * before and after the percpu areas are set up.
 */
void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	set_cpu_numa_node(cpu, node);
}

/* Forget the cpu->node association for @cpu. */
void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

/*
 * Append the [@start, @end) range for node @nid to @mi.  Zero-length and
 * obviously invalid blocks are dropped (returning 0); only table overflow
 * is reported as an error.
 */
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
			nid, start, end - 1);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to remove
 * @src: numa_meminfo to remove memblk from
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
					 struct numa_meminfo *src)
{
	dst->blk[dst->nr_blks++] = src->blk[idx];
	numa_remove_memblk_from(idx, src);
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

/* Allocate NODE_DATA for a node on the local memory */
static void __init alloc_node_data(int nid)
{
	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	u64 nd_pa;
	void *nd;
	int tnid;

	/*
	 * Allocate node data.  Try node-local memory and then any node.
	 * Never allocate in DMA zone.
	 */
	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa) {
		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
		       nd_size, nid);
		return;
	}
	nd = __va(nd_pa);

	/* report and initialize */
	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
	       nd_pa, nd_pa + nd_size - 1);
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));

	node_set_online(nid);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks.  Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = 0;
	const u64 high = PFN_PHYS(max_pfn);
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					      bi->start, bi->end - bi->start)) {
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and there's no empty block */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks.  Whine
			 * about but allow overlaps of the same nid.  They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			/* reject the merge if the joined range would cover
			 * memory belonging to any other node */
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
			       bi->nid, bi->start, bi->end - 1, bj->start,
			       bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}

/*
 * Set nodes, which have memory in @mi, in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed.  The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_free(__pa(numa_distance), size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

/*
 * Allocate and initialize the node distance table, sized to cover the
 * highest node seen so far.  On allocation failure numa_distance is set
 * to the sentinel (void *)1LU so further attempts are suppressed until
 * numa_reset_distance().
 */
static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;
	u64 phys;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

	/* cnt ends up as highest set node id + 1 */
	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
				      size, PAGE_SIZE);
	if (!phys) {
		pr_warn("Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}
	memblock_reserve(phys, size);

	numa_distance = __va(phys);
	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one NUMA to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance.  If distance table
 * doesn't exist, one which is large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node or lower than zero
 * at the time of table creation or @distance doesn't make sense, the call
 * is ignored.
 * This is to allow simplification of specific NUMA config implementations.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
			from < 0 || to < 0) {
		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	/* distance must fit in u8 and self-distance must be LOCAL_DISTANCE */
	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}
4428c2ecf20Sopenharmony_ci 4438c2ecf20Sopenharmony_ciint __node_distance(int from, int to) 4448c2ecf20Sopenharmony_ci{ 4458c2ecf20Sopenharmony_ci if (from >= numa_distance_cnt || to >= numa_distance_cnt) 4468c2ecf20Sopenharmony_ci return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; 4478c2ecf20Sopenharmony_ci return numa_distance[from * numa_distance_cnt + to]; 4488c2ecf20Sopenharmony_ci} 4498c2ecf20Sopenharmony_ciEXPORT_SYMBOL(__node_distance); 4508c2ecf20Sopenharmony_ci 4518c2ecf20Sopenharmony_ci/* 4528c2ecf20Sopenharmony_ci * Sanity check to catch more bad NUMA configurations (they are amazingly 4538c2ecf20Sopenharmony_ci * common). Make sure the nodes cover all memory. 4548c2ecf20Sopenharmony_ci */ 4558c2ecf20Sopenharmony_cistatic bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) 4568c2ecf20Sopenharmony_ci{ 4578c2ecf20Sopenharmony_ci u64 numaram, e820ram; 4588c2ecf20Sopenharmony_ci int i; 4598c2ecf20Sopenharmony_ci 4608c2ecf20Sopenharmony_ci numaram = 0; 4618c2ecf20Sopenharmony_ci for (i = 0; i < mi->nr_blks; i++) { 4628c2ecf20Sopenharmony_ci u64 s = mi->blk[i].start >> PAGE_SHIFT; 4638c2ecf20Sopenharmony_ci u64 e = mi->blk[i].end >> PAGE_SHIFT; 4648c2ecf20Sopenharmony_ci numaram += e - s; 4658c2ecf20Sopenharmony_ci numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); 4668c2ecf20Sopenharmony_ci if ((s64)numaram < 0) 4678c2ecf20Sopenharmony_ci numaram = 0; 4688c2ecf20Sopenharmony_ci } 4698c2ecf20Sopenharmony_ci 4708c2ecf20Sopenharmony_ci e820ram = max_pfn - absent_pages_in_range(0, max_pfn); 4718c2ecf20Sopenharmony_ci 4728c2ecf20Sopenharmony_ci /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ 4738c2ecf20Sopenharmony_ci if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { 4748c2ecf20Sopenharmony_ci printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. 
Not used.\n", 4758c2ecf20Sopenharmony_ci (numaram << PAGE_SHIFT) >> 20, 4768c2ecf20Sopenharmony_ci (e820ram << PAGE_SHIFT) >> 20); 4778c2ecf20Sopenharmony_ci return false; 4788c2ecf20Sopenharmony_ci } 4798c2ecf20Sopenharmony_ci return true; 4808c2ecf20Sopenharmony_ci} 4818c2ecf20Sopenharmony_ci 4828c2ecf20Sopenharmony_ci/* 4838c2ecf20Sopenharmony_ci * Mark all currently memblock-reserved physical memory (which covers the 4848c2ecf20Sopenharmony_ci * kernel's own memory ranges) as hot-unswappable. 4858c2ecf20Sopenharmony_ci */ 4868c2ecf20Sopenharmony_cistatic void __init numa_clear_kernel_node_hotplug(void) 4878c2ecf20Sopenharmony_ci{ 4888c2ecf20Sopenharmony_ci nodemask_t reserved_nodemask = NODE_MASK_NONE; 4898c2ecf20Sopenharmony_ci struct memblock_region *mb_region; 4908c2ecf20Sopenharmony_ci int i; 4918c2ecf20Sopenharmony_ci 4928c2ecf20Sopenharmony_ci /* 4938c2ecf20Sopenharmony_ci * We have to do some preprocessing of memblock regions, to 4948c2ecf20Sopenharmony_ci * make them suitable for reservation. 4958c2ecf20Sopenharmony_ci * 4968c2ecf20Sopenharmony_ci * At this time, all memory regions reserved by memblock are 4978c2ecf20Sopenharmony_ci * used by the kernel, but those regions are not split up 4988c2ecf20Sopenharmony_ci * along node boundaries yet, and don't necessarily have their 4998c2ecf20Sopenharmony_ci * node ID set yet either. 5008c2ecf20Sopenharmony_ci * 5018c2ecf20Sopenharmony_ci * So iterate over all memory known to the x86 architecture, 5028c2ecf20Sopenharmony_ci * and use those ranges to set the nid in memblock.reserved. 5038c2ecf20Sopenharmony_ci * This will split up the memblock regions along node 5048c2ecf20Sopenharmony_ci * boundaries and will set the node IDs as well. 
5058c2ecf20Sopenharmony_ci */ 5068c2ecf20Sopenharmony_ci for (i = 0; i < numa_meminfo.nr_blks; i++) { 5078c2ecf20Sopenharmony_ci struct numa_memblk *mb = numa_meminfo.blk + i; 5088c2ecf20Sopenharmony_ci int ret; 5098c2ecf20Sopenharmony_ci 5108c2ecf20Sopenharmony_ci ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); 5118c2ecf20Sopenharmony_ci WARN_ON_ONCE(ret); 5128c2ecf20Sopenharmony_ci } 5138c2ecf20Sopenharmony_ci 5148c2ecf20Sopenharmony_ci /* 5158c2ecf20Sopenharmony_ci * Now go over all reserved memblock regions, to construct a 5168c2ecf20Sopenharmony_ci * node mask of all kernel reserved memory areas. 5178c2ecf20Sopenharmony_ci * 5188c2ecf20Sopenharmony_ci * [ Note, when booting with mem=nn[kMG] or in a kdump kernel, 5198c2ecf20Sopenharmony_ci * numa_meminfo might not include all memblock.reserved 5208c2ecf20Sopenharmony_ci * memory ranges, because quirks such as trim_snb_memory() 5218c2ecf20Sopenharmony_ci * reserve specific pages for Sandy Bridge graphics. ] 5228c2ecf20Sopenharmony_ci */ 5238c2ecf20Sopenharmony_ci for_each_reserved_mem_region(mb_region) { 5248c2ecf20Sopenharmony_ci int nid = memblock_get_region_node(mb_region); 5258c2ecf20Sopenharmony_ci 5268c2ecf20Sopenharmony_ci if (nid != MAX_NUMNODES) 5278c2ecf20Sopenharmony_ci node_set(nid, reserved_nodemask); 5288c2ecf20Sopenharmony_ci } 5298c2ecf20Sopenharmony_ci 5308c2ecf20Sopenharmony_ci /* 5318c2ecf20Sopenharmony_ci * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory 5328c2ecf20Sopenharmony_ci * belonging to the reserved node mask. 
5338c2ecf20Sopenharmony_ci * 5348c2ecf20Sopenharmony_ci * Note that this will include memory regions that reside 5358c2ecf20Sopenharmony_ci * on nodes that contain kernel memory - entire nodes 5368c2ecf20Sopenharmony_ci * become hot-unpluggable: 5378c2ecf20Sopenharmony_ci */ 5388c2ecf20Sopenharmony_ci for (i = 0; i < numa_meminfo.nr_blks; i++) { 5398c2ecf20Sopenharmony_ci struct numa_memblk *mb = numa_meminfo.blk + i; 5408c2ecf20Sopenharmony_ci 5418c2ecf20Sopenharmony_ci if (!node_isset(mb->nid, reserved_nodemask)) 5428c2ecf20Sopenharmony_ci continue; 5438c2ecf20Sopenharmony_ci 5448c2ecf20Sopenharmony_ci memblock_clear_hotplug(mb->start, mb->end - mb->start); 5458c2ecf20Sopenharmony_ci } 5468c2ecf20Sopenharmony_ci} 5478c2ecf20Sopenharmony_ci 5488c2ecf20Sopenharmony_cistatic int __init numa_register_memblks(struct numa_meminfo *mi) 5498c2ecf20Sopenharmony_ci{ 5508c2ecf20Sopenharmony_ci int i, nid; 5518c2ecf20Sopenharmony_ci 5528c2ecf20Sopenharmony_ci /* Account for nodes with cpus and no memory */ 5538c2ecf20Sopenharmony_ci node_possible_map = numa_nodes_parsed; 5548c2ecf20Sopenharmony_ci numa_nodemask_from_meminfo(&node_possible_map, mi); 5558c2ecf20Sopenharmony_ci if (WARN_ON(nodes_empty(node_possible_map))) 5568c2ecf20Sopenharmony_ci return -EINVAL; 5578c2ecf20Sopenharmony_ci 5588c2ecf20Sopenharmony_ci for (i = 0; i < mi->nr_blks; i++) { 5598c2ecf20Sopenharmony_ci struct numa_memblk *mb = &mi->blk[i]; 5608c2ecf20Sopenharmony_ci memblock_set_node(mb->start, mb->end - mb->start, 5618c2ecf20Sopenharmony_ci &memblock.memory, mb->nid); 5628c2ecf20Sopenharmony_ci } 5638c2ecf20Sopenharmony_ci 5648c2ecf20Sopenharmony_ci /* 5658c2ecf20Sopenharmony_ci * At very early time, the kernel have to use some memory such as 5668c2ecf20Sopenharmony_ci * loading the kernel image. We cannot prevent this anyway. So any 5678c2ecf20Sopenharmony_ci * node the kernel resides in should be un-hotpluggable. 
5688c2ecf20Sopenharmony_ci * 5698c2ecf20Sopenharmony_ci * And when we come here, alloc node data won't fail. 5708c2ecf20Sopenharmony_ci */ 5718c2ecf20Sopenharmony_ci numa_clear_kernel_node_hotplug(); 5728c2ecf20Sopenharmony_ci 5738c2ecf20Sopenharmony_ci /* 5748c2ecf20Sopenharmony_ci * If sections array is gonna be used for pfn -> nid mapping, check 5758c2ecf20Sopenharmony_ci * whether its granularity is fine enough. 5768c2ecf20Sopenharmony_ci */ 5778c2ecf20Sopenharmony_ci if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) { 5788c2ecf20Sopenharmony_ci unsigned long pfn_align = node_map_pfn_alignment(); 5798c2ecf20Sopenharmony_ci 5808c2ecf20Sopenharmony_ci if (pfn_align && pfn_align < PAGES_PER_SECTION) { 5818c2ecf20Sopenharmony_ci pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n", 5828c2ecf20Sopenharmony_ci PFN_PHYS(pfn_align) >> 20, 5838c2ecf20Sopenharmony_ci PFN_PHYS(PAGES_PER_SECTION) >> 20); 5848c2ecf20Sopenharmony_ci return -EINVAL; 5858c2ecf20Sopenharmony_ci } 5868c2ecf20Sopenharmony_ci } 5878c2ecf20Sopenharmony_ci if (!numa_meminfo_cover_memory(mi)) 5888c2ecf20Sopenharmony_ci return -EINVAL; 5898c2ecf20Sopenharmony_ci 5908c2ecf20Sopenharmony_ci /* Finally register nodes. 
*/ 5918c2ecf20Sopenharmony_ci for_each_node_mask(nid, node_possible_map) { 5928c2ecf20Sopenharmony_ci u64 start = PFN_PHYS(max_pfn); 5938c2ecf20Sopenharmony_ci u64 end = 0; 5948c2ecf20Sopenharmony_ci 5958c2ecf20Sopenharmony_ci for (i = 0; i < mi->nr_blks; i++) { 5968c2ecf20Sopenharmony_ci if (nid != mi->blk[i].nid) 5978c2ecf20Sopenharmony_ci continue; 5988c2ecf20Sopenharmony_ci start = min(mi->blk[i].start, start); 5998c2ecf20Sopenharmony_ci end = max(mi->blk[i].end, end); 6008c2ecf20Sopenharmony_ci } 6018c2ecf20Sopenharmony_ci 6028c2ecf20Sopenharmony_ci if (start >= end) 6038c2ecf20Sopenharmony_ci continue; 6048c2ecf20Sopenharmony_ci 6058c2ecf20Sopenharmony_ci alloc_node_data(nid); 6068c2ecf20Sopenharmony_ci } 6078c2ecf20Sopenharmony_ci 6088c2ecf20Sopenharmony_ci /* Dump memblock with node info and return. */ 6098c2ecf20Sopenharmony_ci memblock_dump_all(); 6108c2ecf20Sopenharmony_ci return 0; 6118c2ecf20Sopenharmony_ci} 6128c2ecf20Sopenharmony_ci 6138c2ecf20Sopenharmony_ci/* 6148c2ecf20Sopenharmony_ci * There are unfortunately some poorly designed mainboards around that 6158c2ecf20Sopenharmony_ci * only connect memory to a single CPU. This breaks the 1:1 cpu->node 6168c2ecf20Sopenharmony_ci * mapping. To avoid this fill in the mapping for all possible CPUs, 6178c2ecf20Sopenharmony_ci * as the number of CPUs is not known yet. We round robin the existing 6188c2ecf20Sopenharmony_ci * nodes. 
 */
/* Round-robin CPUs that still lack a node mapping over the online nodes. */
static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		/* Leave CPUs whose node is already known untouched. */
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node_in(rr, node_online_map);
	}
}

/*
 * Reset all NUMA state, run @init_func to repopulate it, then register
 * the resulting memblock/node configuration.  Returns 0 on success or a
 * negative error code if @init_func or any later step fails.
 */
static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	/* Wipe any apicid->node mappings left by a previous attempt. */
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	/* Detach all memblock ranges from any node before re-parsing. */
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
				  MAX_NUMNODES));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
				  MAX_NUMNODES));
	/* In case that parsing SRAT failed. */
	WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * We reset memblock back to the top-down direction
	 * here because if we configured ACPI_NUMA, we have
	 * parsed SRAT in init_func(). It is ok to have the
	 * reset here even if we didn't configure ACPI_NUMA
	 * or acpi numa init fails and falls back to dummy
	 * numa init.
	 */
	memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	ret = numa_register_memblks(&numa_meminfo);
	if (ret < 0)
		return ret;

	/* Drop CPU mappings to nodes that ended up without memory/online. */
	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
6938c2ecf20Sopenharmony_ci * Used if there's no underlying NUMA architecture, NUMA initialization 6948c2ecf20Sopenharmony_ci * fails, or NUMA is disabled on the command line. 6958c2ecf20Sopenharmony_ci * 6968c2ecf20Sopenharmony_ci * Must online at least one node and add memory blocks that cover all 6978c2ecf20Sopenharmony_ci * allowed memory. This function must not fail. 6988c2ecf20Sopenharmony_ci */ 6998c2ecf20Sopenharmony_cistatic int __init dummy_numa_init(void) 7008c2ecf20Sopenharmony_ci{ 7018c2ecf20Sopenharmony_ci printk(KERN_INFO "%s\n", 7028c2ecf20Sopenharmony_ci numa_off ? "NUMA turned off" : "No NUMA configuration found"); 7038c2ecf20Sopenharmony_ci printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n", 7048c2ecf20Sopenharmony_ci 0LLU, PFN_PHYS(max_pfn) - 1); 7058c2ecf20Sopenharmony_ci 7068c2ecf20Sopenharmony_ci node_set(0, numa_nodes_parsed); 7078c2ecf20Sopenharmony_ci numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); 7088c2ecf20Sopenharmony_ci 7098c2ecf20Sopenharmony_ci return 0; 7108c2ecf20Sopenharmony_ci} 7118c2ecf20Sopenharmony_ci 7128c2ecf20Sopenharmony_ci/** 7138c2ecf20Sopenharmony_ci * x86_numa_init - Initialize NUMA 7148c2ecf20Sopenharmony_ci * 7158c2ecf20Sopenharmony_ci * Try each configured NUMA initialization method until one succeeds. The 7168c2ecf20Sopenharmony_ci * last fallback is dummy single node config encompassing whole memory and 7178c2ecf20Sopenharmony_ci * never fails. 
7188c2ecf20Sopenharmony_ci */ 7198c2ecf20Sopenharmony_civoid __init x86_numa_init(void) 7208c2ecf20Sopenharmony_ci{ 7218c2ecf20Sopenharmony_ci if (!numa_off) { 7228c2ecf20Sopenharmony_ci#ifdef CONFIG_ACPI_NUMA 7238c2ecf20Sopenharmony_ci if (!numa_init(x86_acpi_numa_init)) 7248c2ecf20Sopenharmony_ci return; 7258c2ecf20Sopenharmony_ci#endif 7268c2ecf20Sopenharmony_ci#ifdef CONFIG_AMD_NUMA 7278c2ecf20Sopenharmony_ci if (!numa_init(amd_numa_init)) 7288c2ecf20Sopenharmony_ci return; 7298c2ecf20Sopenharmony_ci#endif 7308c2ecf20Sopenharmony_ci } 7318c2ecf20Sopenharmony_ci 7328c2ecf20Sopenharmony_ci numa_init(dummy_numa_init); 7338c2ecf20Sopenharmony_ci} 7348c2ecf20Sopenharmony_ci 7358c2ecf20Sopenharmony_cistatic void __init init_memory_less_node(int nid) 7368c2ecf20Sopenharmony_ci{ 7378c2ecf20Sopenharmony_ci /* Allocate and initialize node data. Memory-less node is now online.*/ 7388c2ecf20Sopenharmony_ci alloc_node_data(nid); 7398c2ecf20Sopenharmony_ci free_area_init_memoryless_node(nid); 7408c2ecf20Sopenharmony_ci 7418c2ecf20Sopenharmony_ci /* 7428c2ecf20Sopenharmony_ci * All zonelists will be built later in start_kernel() after per cpu 7438c2ecf20Sopenharmony_ci * areas are initialized. 7448c2ecf20Sopenharmony_ci */ 7458c2ecf20Sopenharmony_ci} 7468c2ecf20Sopenharmony_ci 7478c2ecf20Sopenharmony_ci/* 7488c2ecf20Sopenharmony_ci * A node may exist which has one or more Generic Initiators but no CPUs and no 7498c2ecf20Sopenharmony_ci * memory. 7508c2ecf20Sopenharmony_ci * 7518c2ecf20Sopenharmony_ci * This function must be called after init_cpu_to_node(), to ensure that any 7528c2ecf20Sopenharmony_ci * memoryless CPU nodes have already been brought online, and before the 7538c2ecf20Sopenharmony_ci * node_data[nid] is needed for zone list setup in build_all_zonelists(). 
7548c2ecf20Sopenharmony_ci * 7558c2ecf20Sopenharmony_ci * When this function is called, any nodes containing either memory and/or CPUs 7568c2ecf20Sopenharmony_ci * will already be online and there is no need to do anything extra, even if 7578c2ecf20Sopenharmony_ci * they also contain one or more Generic Initiators. 7588c2ecf20Sopenharmony_ci */ 7598c2ecf20Sopenharmony_civoid __init init_gi_nodes(void) 7608c2ecf20Sopenharmony_ci{ 7618c2ecf20Sopenharmony_ci int nid; 7628c2ecf20Sopenharmony_ci 7638c2ecf20Sopenharmony_ci for_each_node_state(nid, N_GENERIC_INITIATOR) 7648c2ecf20Sopenharmony_ci if (!node_online(nid)) 7658c2ecf20Sopenharmony_ci init_memory_less_node(nid); 7668c2ecf20Sopenharmony_ci} 7678c2ecf20Sopenharmony_ci 7688c2ecf20Sopenharmony_ci/* 7698c2ecf20Sopenharmony_ci * Setup early cpu_to_node. 7708c2ecf20Sopenharmony_ci * 7718c2ecf20Sopenharmony_ci * Populate cpu_to_node[] only if x86_cpu_to_apicid[], 7728c2ecf20Sopenharmony_ci * and apicid_to_node[] tables have valid entries for a CPU. 7738c2ecf20Sopenharmony_ci * This means we skip cpu_to_node[] initialisation for NUMA 7748c2ecf20Sopenharmony_ci * emulation and faking node case (when running a kernel compiled 7758c2ecf20Sopenharmony_ci * for NUMA on a non NUMA box), which is OK as cpu_to_node[] 7768c2ecf20Sopenharmony_ci * is already initialized in a round robin manner at numa_init_array, 7778c2ecf20Sopenharmony_ci * prior to this call, and this initialization is good enough 7788c2ecf20Sopenharmony_ci * for the fake NUMA cases. 7798c2ecf20Sopenharmony_ci * 7808c2ecf20Sopenharmony_ci * Called before the per_cpu areas are setup. 
7818c2ecf20Sopenharmony_ci */ 7828c2ecf20Sopenharmony_civoid __init init_cpu_to_node(void) 7838c2ecf20Sopenharmony_ci{ 7848c2ecf20Sopenharmony_ci int cpu; 7858c2ecf20Sopenharmony_ci u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 7868c2ecf20Sopenharmony_ci 7878c2ecf20Sopenharmony_ci BUG_ON(cpu_to_apicid == NULL); 7888c2ecf20Sopenharmony_ci 7898c2ecf20Sopenharmony_ci for_each_possible_cpu(cpu) { 7908c2ecf20Sopenharmony_ci int node = numa_cpu_node(cpu); 7918c2ecf20Sopenharmony_ci 7928c2ecf20Sopenharmony_ci if (node == NUMA_NO_NODE) 7938c2ecf20Sopenharmony_ci continue; 7948c2ecf20Sopenharmony_ci 7958c2ecf20Sopenharmony_ci if (!node_online(node)) 7968c2ecf20Sopenharmony_ci init_memory_less_node(node); 7978c2ecf20Sopenharmony_ci 7988c2ecf20Sopenharmony_ci numa_set_node(cpu, node); 7998c2ecf20Sopenharmony_ci } 8008c2ecf20Sopenharmony_ci} 8018c2ecf20Sopenharmony_ci 8028c2ecf20Sopenharmony_ci#ifndef CONFIG_DEBUG_PER_CPU_MAPS 8038c2ecf20Sopenharmony_ci 8048c2ecf20Sopenharmony_ci# ifndef CONFIG_NUMA_EMU 8058c2ecf20Sopenharmony_civoid numa_add_cpu(int cpu) 8068c2ecf20Sopenharmony_ci{ 8078c2ecf20Sopenharmony_ci cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 8088c2ecf20Sopenharmony_ci} 8098c2ecf20Sopenharmony_ci 8108c2ecf20Sopenharmony_civoid numa_remove_cpu(int cpu) 8118c2ecf20Sopenharmony_ci{ 8128c2ecf20Sopenharmony_ci cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 8138c2ecf20Sopenharmony_ci} 8148c2ecf20Sopenharmony_ci# endif /* !CONFIG_NUMA_EMU */ 8158c2ecf20Sopenharmony_ci 8168c2ecf20Sopenharmony_ci#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ 8178c2ecf20Sopenharmony_ci 8188c2ecf20Sopenharmony_ciint __cpu_to_node(int cpu) 8198c2ecf20Sopenharmony_ci{ 8208c2ecf20Sopenharmony_ci if (early_per_cpu_ptr(x86_cpu_to_node_map)) { 8218c2ecf20Sopenharmony_ci printk(KERN_WARNING 8228c2ecf20Sopenharmony_ci "cpu_to_node(%d): usage too early!\n", cpu); 8238c2ecf20Sopenharmony_ci dump_stack(); 8248c2ecf20Sopenharmony_ci return 
early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; 8258c2ecf20Sopenharmony_ci } 8268c2ecf20Sopenharmony_ci return per_cpu(x86_cpu_to_node_map, cpu); 8278c2ecf20Sopenharmony_ci} 8288c2ecf20Sopenharmony_ciEXPORT_SYMBOL(__cpu_to_node); 8298c2ecf20Sopenharmony_ci 8308c2ecf20Sopenharmony_ci/* 8318c2ecf20Sopenharmony_ci * Same function as cpu_to_node() but used if called before the 8328c2ecf20Sopenharmony_ci * per_cpu areas are setup. 8338c2ecf20Sopenharmony_ci */ 8348c2ecf20Sopenharmony_ciint early_cpu_to_node(int cpu) 8358c2ecf20Sopenharmony_ci{ 8368c2ecf20Sopenharmony_ci if (early_per_cpu_ptr(x86_cpu_to_node_map)) 8378c2ecf20Sopenharmony_ci return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; 8388c2ecf20Sopenharmony_ci 8398c2ecf20Sopenharmony_ci if (!cpu_possible(cpu)) { 8408c2ecf20Sopenharmony_ci printk(KERN_WARNING 8418c2ecf20Sopenharmony_ci "early_cpu_to_node(%d): no per_cpu area!\n", cpu); 8428c2ecf20Sopenharmony_ci dump_stack(); 8438c2ecf20Sopenharmony_ci return NUMA_NO_NODE; 8448c2ecf20Sopenharmony_ci } 8458c2ecf20Sopenharmony_ci return per_cpu(x86_cpu_to_node_map, cpu); 8468c2ecf20Sopenharmony_ci} 8478c2ecf20Sopenharmony_ci 8488c2ecf20Sopenharmony_civoid debug_cpumask_set_cpu(int cpu, int node, bool enable) 8498c2ecf20Sopenharmony_ci{ 8508c2ecf20Sopenharmony_ci struct cpumask *mask; 8518c2ecf20Sopenharmony_ci 8528c2ecf20Sopenharmony_ci if (node == NUMA_NO_NODE) { 8538c2ecf20Sopenharmony_ci /* early_cpu_to_node() already emits a warning and trace */ 8548c2ecf20Sopenharmony_ci return; 8558c2ecf20Sopenharmony_ci } 8568c2ecf20Sopenharmony_ci mask = node_to_cpumask_map[node]; 8578c2ecf20Sopenharmony_ci if (!cpumask_available(mask)) { 8588c2ecf20Sopenharmony_ci pr_err("node_to_cpumask_map[%i] NULL\n", node); 8598c2ecf20Sopenharmony_ci dump_stack(); 8608c2ecf20Sopenharmony_ci return; 8618c2ecf20Sopenharmony_ci } 8628c2ecf20Sopenharmony_ci 8638c2ecf20Sopenharmony_ci if (enable) 8648c2ecf20Sopenharmony_ci cpumask_set_cpu(cpu, mask); 8658c2ecf20Sopenharmony_ci else 
8668c2ecf20Sopenharmony_ci cpumask_clear_cpu(cpu, mask); 8678c2ecf20Sopenharmony_ci 8688c2ecf20Sopenharmony_ci printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n", 8698c2ecf20Sopenharmony_ci enable ? "numa_add_cpu" : "numa_remove_cpu", 8708c2ecf20Sopenharmony_ci cpu, node, cpumask_pr_args(mask)); 8718c2ecf20Sopenharmony_ci return; 8728c2ecf20Sopenharmony_ci} 8738c2ecf20Sopenharmony_ci 8748c2ecf20Sopenharmony_ci# ifndef CONFIG_NUMA_EMU 8758c2ecf20Sopenharmony_cistatic void numa_set_cpumask(int cpu, bool enable) 8768c2ecf20Sopenharmony_ci{ 8778c2ecf20Sopenharmony_ci debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); 8788c2ecf20Sopenharmony_ci} 8798c2ecf20Sopenharmony_ci 8808c2ecf20Sopenharmony_civoid numa_add_cpu(int cpu) 8818c2ecf20Sopenharmony_ci{ 8828c2ecf20Sopenharmony_ci numa_set_cpumask(cpu, true); 8838c2ecf20Sopenharmony_ci} 8848c2ecf20Sopenharmony_ci 8858c2ecf20Sopenharmony_civoid numa_remove_cpu(int cpu) 8868c2ecf20Sopenharmony_ci{ 8878c2ecf20Sopenharmony_ci numa_set_cpumask(cpu, false); 8888c2ecf20Sopenharmony_ci} 8898c2ecf20Sopenharmony_ci# endif /* !CONFIG_NUMA_EMU */ 8908c2ecf20Sopenharmony_ci 8918c2ecf20Sopenharmony_ci/* 8928c2ecf20Sopenharmony_ci * Returns a pointer to the bitmask of CPUs on Node 'node'. 
 */
const struct cpumask *cpumask_of_node(int node)
{
	/* Out-of-range node: warn and return an empty mask. */
	if ((unsigned)node >= nr_node_ids) {
		printk(KERN_WARNING
			"cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
			node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	/* Map not yet allocated: warn and fall back to all online CPUs. */
	if (!cpumask_available(node_to_cpumask_map[node])) {
		printk(KERN_WARNING
			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
			node);
		dump_stack();
		return cpu_online_mask;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_KEEP_MEMINFO
/*
 * Return the nid of the meminfo block containing @start (half-open check:
 * start inclusive, end exclusive), or NUMA_NO_NODE if no block covers it.
 */
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start <= start && mi->blk[i].end > start)
			return mi->blk[i].nid;
	return NUMA_NO_NODE;
}

int phys_to_target_node(phys_addr_t start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/*
	 * Prefer online nodes, but if reserved memory might be
	 * hot-added continue the search with reserved ranges.
	 */
	if (nid != NUMA_NO_NODE)
		return nid;

	return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);

int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/* No covering block: fall back to the first block's node. */
	if (nid == NUMA_NO_NODE)
		nid = numa_meminfo.blk[0].nid;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif