18c2ecf20Sopenharmony_ci#include <linux/gfp.h> 28c2ecf20Sopenharmony_ci#include <linux/initrd.h> 38c2ecf20Sopenharmony_ci#include <linux/ioport.h> 48c2ecf20Sopenharmony_ci#include <linux/swap.h> 58c2ecf20Sopenharmony_ci#include <linux/memblock.h> 68c2ecf20Sopenharmony_ci#include <linux/swapfile.h> 78c2ecf20Sopenharmony_ci#include <linux/swapops.h> 88c2ecf20Sopenharmony_ci#include <linux/kmemleak.h> 98c2ecf20Sopenharmony_ci#include <linux/sched/task.h> 108c2ecf20Sopenharmony_ci#include <linux/sched/mm.h> 118c2ecf20Sopenharmony_ci 128c2ecf20Sopenharmony_ci#include <asm/set_memory.h> 138c2ecf20Sopenharmony_ci#include <asm/cpu_device_id.h> 148c2ecf20Sopenharmony_ci#include <asm/e820/api.h> 158c2ecf20Sopenharmony_ci#include <asm/init.h> 168c2ecf20Sopenharmony_ci#include <asm/page.h> 178c2ecf20Sopenharmony_ci#include <asm/page_types.h> 188c2ecf20Sopenharmony_ci#include <asm/sections.h> 198c2ecf20Sopenharmony_ci#include <asm/setup.h> 208c2ecf20Sopenharmony_ci#include <asm/tlbflush.h> 218c2ecf20Sopenharmony_ci#include <asm/tlb.h> 228c2ecf20Sopenharmony_ci#include <asm/proto.h> 238c2ecf20Sopenharmony_ci#include <asm/dma.h> /* for MAX_DMA_PFN */ 248c2ecf20Sopenharmony_ci#include <asm/microcode.h> 258c2ecf20Sopenharmony_ci#include <asm/kaslr.h> 268c2ecf20Sopenharmony_ci#include <asm/hypervisor.h> 278c2ecf20Sopenharmony_ci#include <asm/cpufeature.h> 288c2ecf20Sopenharmony_ci#include <asm/pti.h> 298c2ecf20Sopenharmony_ci#include <asm/text-patching.h> 308c2ecf20Sopenharmony_ci#include <asm/memtype.h> 318c2ecf20Sopenharmony_ci#include <asm/paravirt.h> 328c2ecf20Sopenharmony_ci 338c2ecf20Sopenharmony_ci/* 348c2ecf20Sopenharmony_ci * We need to define the tracepoints somewhere, and tlb.c 358c2ecf20Sopenharmony_ci * is only compied when SMP=y. 
368c2ecf20Sopenharmony_ci */ 378c2ecf20Sopenharmony_ci#define CREATE_TRACE_POINTS 388c2ecf20Sopenharmony_ci#include <trace/events/tlb.h> 398c2ecf20Sopenharmony_ci 408c2ecf20Sopenharmony_ci#include "mm_internal.h" 418c2ecf20Sopenharmony_ci 428c2ecf20Sopenharmony_ci/* 438c2ecf20Sopenharmony_ci * Tables translating between page_cache_type_t and pte encoding. 448c2ecf20Sopenharmony_ci * 458c2ecf20Sopenharmony_ci * The default values are defined statically as minimal supported mode; 468c2ecf20Sopenharmony_ci * WC and WT fall back to UC-. pat_init() updates these values to support 478c2ecf20Sopenharmony_ci * more cache modes, WC and WT, when it is safe to do so. See pat_init() 488c2ecf20Sopenharmony_ci * for the details. Note, __early_ioremap() used during early boot-time 498c2ecf20Sopenharmony_ci * takes pgprot_t (pte encoding) and does not use these tables. 508c2ecf20Sopenharmony_ci * 518c2ecf20Sopenharmony_ci * Index into __cachemode2pte_tbl[] is the cachemode. 528c2ecf20Sopenharmony_ci * 538c2ecf20Sopenharmony_ci * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte 548c2ecf20Sopenharmony_ci * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. 
 */
static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
	[_PAGE_CACHE_MODE_WB      ]	= 0         | 0        ,
	[_PAGE_CACHE_MODE_WC      ]	= 0         | _PAGE_PCD,
	[_PAGE_CACHE_MODE_UC_MINUS]	= 0         | _PAGE_PCD,
	[_PAGE_CACHE_MODE_UC      ]	= _PAGE_PWT | _PAGE_PCD,
	[_PAGE_CACHE_MODE_WT      ]	= 0         | _PAGE_PCD,
	[_PAGE_CACHE_MODE_WP      ]	= 0         | _PAGE_PCD,
};

/*
 * cachemode2protval - translate a page_cache_mode into PTE cache bits.
 *
 * WB is encoded as 0, so the common case skips the table lookup entirely.
 */
unsigned long cachemode2protval(enum page_cache_mode pcm)
{
	if (likely(pcm == 0))
		return 0;
	return __cachemode2pte_tbl[pcm];
}
EXPORT_SYMBOL(cachemode2protval);

/*
 * Reverse table: PTE cache-attribute bits (_PAGE_PWT/_PAGE_PCD/_PAGE_PAT,
 * compressed to index bits 0..2 by __pte2cm_idx()) -> page_cache_mode.
 * Defaults describe the non-PAT minimal mode; pat_init() may rewrite them.
 */
static uint8_t __pte2cachemode_tbl[8] = {
	[__pte2cm_idx( 0        | 0         | 0        )] = _PAGE_CACHE_MODE_WB,
	[__pte2cm_idx(_PAGE_PWT | 0         | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
	[__pte2cm_idx( 0        | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC,
	[__pte2cm_idx( 0        | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
	[__pte2cm_idx(_PAGE_PWT | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
	[__pte2cm_idx(0         | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
};

/*
 * Check that the write-protect PAT entry is set for write-protect.
 * To do this without making assumptions how PAT has been set up (Xen has
 * another layout than the kernel), translate the _PAGE_CACHE_MODE_WP cache
 * mode via the __cachemode2pte_tbl[] into protection bits (those protection
 * bits will select a cache mode of WP or better), and then translate the
 * protection bits back into the cache mode using __pte2cm_idx() and the
 * __pte2cachemode_tbl[] array. This will return the really used cache mode.
 */
bool x86_has_pat_wp(void)
{
	uint16_t prot = __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP];

	/* Only a real WP PAT entry survives the table round-trip. */
	return __pte2cachemode_tbl[__pte2cm_idx(prot)] == _PAGE_CACHE_MODE_WP;
}

/*
 * pgprot2cachemode - translate pgprot_t cache bits into a page_cache_mode.
 *
 * A zero _PAGE_CACHE_MASK is the common WB case and short-circuits the
 * table lookup.
 */
enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
{
	unsigned long masked;

	masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK;
	if (likely(masked == 0))
		return 0;
	return __pte2cachemode_tbl[__pte2cm_idx(masked)];
}

/* Early page-table allocation window inside the brk area, in pfns. */
static unsigned long __initdata pgt_buf_start;
static unsigned long __initdata pgt_buf_end;
static unsigned long __initdata pgt_buf_top;

/* Lowest pfn already covered by the direct mapping (set by top-down mapping). */
static unsigned long min_pfn_mapped;

/* Cleared while mapping a range that overlaps the brk pgt buffer. */
static bool __initdata can_use_brk_pgt = true;

/*
 * Pages returned are already directly mapped.
 *
 * Changing that is likely to break Xen, see commit:
 *
 *    279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
 *
 * for detailed information.
 */
__ref void *alloc_low_pages(unsigned int num)
{
	unsigned long pfn;
	int i;

	/* Once the buddy allocator is up, just use it. */
	if (after_bootmem) {
		unsigned int order;

		order = get_order((unsigned long)num << PAGE_SHIFT);
		return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
	}

	/* brk buffer exhausted or off-limits: fall back to memblock. */
	if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
		unsigned long ret = 0;

		/* Only already direct-mapped memory is usable here. */
		if (min_pfn_mapped < max_pfn_mapped) {
			ret = memblock_find_in_range(
					min_pfn_mapped << PAGE_SHIFT,
					max_pfn_mapped << PAGE_SHIFT,
					PAGE_SIZE * num , PAGE_SIZE);
		}
		if (ret)
			memblock_reserve(ret, PAGE_SIZE * num);
		else if (can_use_brk_pgt)
			ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));

		if (!ret)
			panic("alloc_low_pages: can not alloc memory");

		pfn = ret >> PAGE_SHIFT;
	} else {
		/* Fast path: carve the pages out of the pre-reserved brk buffer. */
		pfn = pgt_buf_end;
		pgt_buf_end += num;
	}

	/* Callers expect zeroed page-table pages. */
	for (i = 0; i < num; i++) {
		void *adr;

		adr = __va((pfn + i) << PAGE_SHIFT);
		clear_page(adr);
	}

	return __va(pfn << PAGE_SHIFT);
}

/*
 * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS.
 * With KASLR memory randomization, depending on the machine e820 memory
 * and the PUD alignment. We may need twice more pages when KASLR memory
 * randomization is enabled.
 */
#ifndef CONFIG_RANDOMIZE_MEMORY
#define INIT_PGD_PAGE_COUNT      6
#else
#define INIT_PGD_PAGE_COUNT      12
#endif
#define INIT_PGT_BUF_SIZE	(INIT_PGD_PAGE_COUNT * PAGE_SIZE)
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);

/* Carve the early page-table buffer out of brk and record its pfn window. */
void  __init early_alloc_pgt_buf(void)
{
	unsigned long tables = INIT_PGT_BUF_SIZE;
	phys_addr_t base;

	base = __pa(extend_brk(tables, PAGE_SIZE));

	pgt_buf_start = base >> PAGE_SHIFT;
	pgt_buf_end = pgt_buf_start;
	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
}

int after_bootmem;

early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);

/* One contiguous physical range to map, plus the page sizes allowed for it. */
struct map_range {
	unsigned long start;
	unsigned long end;
	unsigned page_size_mask;
};

/* Global mask of page levels (2M/1G) usable for the direct mapping. */
static int page_size_mask;

/*
 * Save some of cr4 feature set we're using (e.g. Pentium 4MB
 * enable and PPro Global page enable), so that any CPU's that boot
 * up after us can get the correct flags. Invoked on the boot CPU.
 */
static inline void cr4_set_bits_and_update_boot(unsigned long mask)
{
	mmu_cr4_features |= mask;
	if (trampoline_cr4_features)
		*trampoline_cr4_features = mmu_cr4_features;
	cr4_set_bits(mask);
}

static void __init probe_page_size_mask(void)
{
	/*
	 * For pagealloc debugging, identity mapping will use small pages.
	 * This will simplify cpa(), which otherwise needs to support splitting
	 * large pages into small in interrupt context, etc.
	 */
	if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
		page_size_mask |= 1 << PG_LEVEL_2M;
	else
		direct_gbpages = 0;

	/* Enable PSE if available */
	if (boot_cpu_has(X86_FEATURE_PSE))
		cr4_set_bits_and_update_boot(X86_CR4_PSE);

	/* Enable PGE if available */
	__supported_pte_mask &= ~_PAGE_GLOBAL;
	if (boot_cpu_has(X86_FEATURE_PGE)) {
		cr4_set_bits_and_update_boot(X86_CR4_PGE);
		__supported_pte_mask |= _PAGE_GLOBAL;
	}

	/* By default, everything that is supported: */
	__default_kernel_pte_mask = __supported_pte_mask;
	/* Except when with PTI where the kernel is mostly non-Global: */
	if (cpu_feature_enabled(X86_FEATURE_PTI))
		__default_kernel_pte_mask &= ~_PAGE_GLOBAL;

	/* Enable 1 GB linear kernel mappings if available: */
	if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
		printk(KERN_INFO "Using GB pages for direct mapping\n");
		page_size_mask |= 1 << PG_LEVEL_1G;
	} else {
		direct_gbpages = 0;
	}
}

#define INTEL_MATCH(_model) { .vendor  = X86_VENDOR_INTEL,	\
			      .family  = 6,			\
			      .model = _model,			\
			    }
/*
 * INVLPG may not properly flush Global entries
 * on these CPUs when PCIDs are enabled.
 */
static const struct x86_cpu_id invlpg_miss_ids[] = {
	INTEL_MATCH(INTEL_FAM6_ALDERLAKE   ),
	INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ),
	INTEL_MATCH(INTEL_FAM6_ALDERLAKE_N ),
	INTEL_MATCH(INTEL_FAM6_RAPTORLAKE  ),
	INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P),
	INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S),
	{}
};

static void setup_pcid(void)
{
	/* PCID is only used by the 64-bit TLB code. */
	if (!IS_ENABLED(CONFIG_X86_64))
		return;

	if (!boot_cpu_has(X86_FEATURE_PCID))
		return;

	/* Erratum: INVLPG may skip Global entries when PCIDs are on. */
	if (x86_match_cpu(invlpg_miss_ids)) {
		pr_info("Incomplete global flushes, disabling PCID");
		setup_clear_cpu_cap(X86_FEATURE_PCID);
		return;
	}

	if (boot_cpu_has(X86_FEATURE_PGE)) {
		/*
		 * This can't be cr4_set_bits_and_update_boot() -- the
		 * trampoline code can't handle CR4.PCIDE and it wouldn't
		 * do any good anyway. Despite the name,
		 * cr4_set_bits_and_update_boot() doesn't actually cause
		 * the bits in question to remain set all the way through
		 * the secondary boot asm.
		 *
		 * Instead, we brute-force it and set CR4.PCIDE manually in
		 * start_secondary().
		 */
		cr4_set_bits(X86_CR4_PCIDE);

		/*
		 * INVPCID's single-context modes (2/3) only work if we set
		 * X86_CR4_PCIDE, *and* we have INVPCID support. It's unusable
		 * on systems that have X86_CR4_PCIDE clear, or that have
		 * no INVPCID support at all.
		 */
		if (boot_cpu_has(X86_FEATURE_INVPCID))
			setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
	} else {
		/*
		 * flush_tlb_all(), as currently implemented, won't work if
		 * PCID is on but PGE is not. Since that combination
		 * doesn't exist on real hardware, there's no reason to try
		 * to fully support it, but it's polite to avoid corrupting
		 * data if we're on an improperly configured VM.
		 */
		setup_clear_cpu_cap(X86_FEATURE_PCID);
	}
}

/* Maximum number of sub-ranges split_mem_range() may produce. */
#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
#define NR_RANGE_MR 5
#endif

/*
 * save_mr - append [start_pfn, end_pfn) with @page_size_mask to @mr.
 *
 * Empty ranges are silently ignored; overflowing NR_RANGE_MR is fatal.
 * Returns the updated range count.
 */
static int __meminit save_mr(struct map_range *mr, int nr_range,
			     unsigned long start_pfn, unsigned long end_pfn,
			     unsigned long page_size_mask)
{
	if (start_pfn < end_pfn) {
		if (nr_range >= NR_RANGE_MR)
			panic("run out of range for init_memory_mapping\n");
		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
		mr[nr_range].page_size_mask = page_size_mask;
		nr_range++;
	}

	return nr_range;
}

/*
 * adjust the page_size_mask for small range to go with
 * big page size instead small one if nearby are ram too.
 */
static void __ref adjust_range_page_size_mask(struct map_range *mr,
					      int nr_range)
{
	int i;

	for (i = 0; i < nr_range; i++) {
		/* Promote to 2M pages if the whole aligned span is RAM. */
		if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
		    !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
			unsigned long start = round_down(mr[i].start, PMD_SIZE);
			unsigned long end = round_up(mr[i].end, PMD_SIZE);

#ifdef CONFIG_X86_32
			/* Don't promote past the lowmem boundary on 32-bit. */
			if ((end >> PAGE_SHIFT) > max_low_pfn)
				continue;
#endif

			if (memblock_is_region_memory(start, end - start))
				mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
		}
		/* Likewise for 1G pages over the PUD-aligned span. */
		if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
		    !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
			unsigned long start = round_down(mr[i].start, PUD_SIZE);
			unsigned long end = round_up(mr[i].end, PUD_SIZE);

			if (memblock_is_region_memory(start, end - start))
				mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
		}
	}
}

/* Human-readable page size of a map_range, for the debug printout below. */
static const char *page_size_string(struct map_range *mr)
{
	static const char str_1g[] = "1G";
	static const char str_2m[] = "2M";
	static const char str_4m[] = "4M";
	static const char str_4k[] = "4k";

	if (mr->page_size_mask & (1<<PG_LEVEL_1G))
		return str_1g;
	/*
	 * 32-bit without PAE has a 4M large page size.
	 * PG_LEVEL_2M is misnamed, but we can at least
	 * print out the right size in the string.
	 */
	if (IS_ENABLED(CONFIG_X86_32) &&
	    !IS_ENABLED(CONFIG_X86_PAE) &&
	    mr->page_size_mask & (1<<PG_LEVEL_2M))
		return str_4m;

	if (mr->page_size_mask & (1<<PG_LEVEL_2M))
		return str_2m;

	return str_4k;
}

/*
 * split_mem_range - split [start, end) into alignment-friendly sub-ranges.
 *
 * Produces an unaligned head, a 2M-aligned middle (plus a 1G-aligned core
 * on 64-bit), and an unaligned tail; adjacent ranges with the same
 * page_size_mask are merged afterwards. Returns the resulting range count.
 */
static int __meminit split_mem_range(struct map_range *mr, int nr_range,
				     unsigned long start,
				     unsigned long end)
{
	unsigned long start_pfn, end_pfn, limit_pfn;
	unsigned long pfn;
	int i;

	limit_pfn = PFN_DOWN(end);

	/* head if not big page alignment ? */
	pfn = start_pfn = PFN_DOWN(start);
#ifdef CONFIG_X86_32
	/*
	 * Don't use a large page for the first 2/4MB of memory
	 * because there are often fixed size MTRRs in there
	 * and overlapping MTRRs into large pages can cause
	 * slowdowns.
	 */
	if (pfn == 0)
		end_pfn = PFN_DOWN(PMD_SIZE);
	else
		end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
	end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#endif
	if (end_pfn > limit_pfn)
		end_pfn = limit_pfn;
	if (start_pfn < end_pfn) {
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
		pfn = end_pfn;
	}

	/* big page (2M) range */
	start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#ifdef CONFIG_X86_32
	end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
	/* On 64-bit, stop the 2M run at the first 1G boundary. */
	end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
	if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
		end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#endif

	if (start_pfn < end_pfn) {
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask & (1<<PG_LEVEL_2M));
		pfn = end_pfn;
	}

#ifdef CONFIG_X86_64
	/* big page (1G) range */
	start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
	end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
	if (start_pfn < end_pfn) {
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask &
				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
		pfn = end_pfn;
	}

	/* tail is not big page (1G) alignment */
	start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
	end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
	if (start_pfn < end_pfn) {
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask & (1<<PG_LEVEL_2M));
		pfn = end_pfn;
	}
#endif

	/* tail is not big page (2M) alignment */
	start_pfn = pfn;
	end_pfn = limit_pfn;
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);

	if (!after_bootmem)
		adjust_range_page_size_mask(mr, nr_range);

	/* try to merge same page size and continuous */
	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
		unsigned long old_start;
		if (mr[i].end != mr[i+1].start ||
		    mr[i].page_size_mask != mr[i+1].page_size_mask)
			continue;
		/* move it */
		old_start = mr[i].start;
		memmove(&mr[i], &mr[i+1],
			(nr_range - 1 - i) * sizeof(struct map_range));
		mr[i--].start = old_start;
		nr_range--;
	}

	for (i = 0; i < nr_range; i++)
		pr_debug(" [mem %#010lx-%#010lx] page %s\n",
				mr[i].start, mr[i].end - 1,
				page_size_string(&mr[i]));

	return nr_range;
}

/* Pfn ranges already covered by the direct mapping. */
struct range pfn_mapped[E820_MAX_ENTRIES];
int nr_pfn_mapped;

/* Record [start_pfn, end_pfn) as mapped and update the max_*_mapped marks. */
static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
	nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_MAX_ENTRIES,
					     nr_pfn_mapped, start_pfn, end_pfn);
	nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES);

	max_pfn_mapped = max(max_pfn_mapped, end_pfn);

	/* Track the below-4G high-water mark separately. */
	if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
		max_low_pfn_mapped = max(max_low_pfn_mapped,
					 min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
}

/* True iff [start_pfn, end_pfn) lies entirely inside one mapped range. */
bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;

	for (i = 0; i < nr_pfn_mapped; i++)
		if ((start_pfn >= pfn_mapped[i].start) &&
		    (end_pfn <= pfn_mapped[i].end))
			return true;

	return false;
}

/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
5378c2ecf20Sopenharmony_ci * This runs before bootmem is initialized and gets pages directly from 5388c2ecf20Sopenharmony_ci * the physical memory. To access them they are temporarily mapped. 5398c2ecf20Sopenharmony_ci */ 5408c2ecf20Sopenharmony_ciunsigned long __ref init_memory_mapping(unsigned long start, 5418c2ecf20Sopenharmony_ci unsigned long end, pgprot_t prot) 5428c2ecf20Sopenharmony_ci{ 5438c2ecf20Sopenharmony_ci struct map_range mr[NR_RANGE_MR]; 5448c2ecf20Sopenharmony_ci unsigned long ret = 0; 5458c2ecf20Sopenharmony_ci int nr_range, i; 5468c2ecf20Sopenharmony_ci 5478c2ecf20Sopenharmony_ci pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n", 5488c2ecf20Sopenharmony_ci start, end - 1); 5498c2ecf20Sopenharmony_ci 5508c2ecf20Sopenharmony_ci memset(mr, 0, sizeof(mr)); 5518c2ecf20Sopenharmony_ci nr_range = split_mem_range(mr, 0, start, end); 5528c2ecf20Sopenharmony_ci 5538c2ecf20Sopenharmony_ci for (i = 0; i < nr_range; i++) 5548c2ecf20Sopenharmony_ci ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, 5558c2ecf20Sopenharmony_ci mr[i].page_size_mask, 5568c2ecf20Sopenharmony_ci prot); 5578c2ecf20Sopenharmony_ci 5588c2ecf20Sopenharmony_ci add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); 5598c2ecf20Sopenharmony_ci 5608c2ecf20Sopenharmony_ci return ret >> PAGE_SHIFT; 5618c2ecf20Sopenharmony_ci} 5628c2ecf20Sopenharmony_ci 5638c2ecf20Sopenharmony_ci/* 5648c2ecf20Sopenharmony_ci * We need to iterate through the E820 memory map and create direct mappings 5658c2ecf20Sopenharmony_ci * for only E820_TYPE_RAM and E820_KERN_RESERVED regions. We cannot simply 5668c2ecf20Sopenharmony_ci * create direct mappings for all pfns from [0 to max_low_pfn) and 5678c2ecf20Sopenharmony_ci * [4GB to max_pfn) because of possible memory holes in high addresses 5688c2ecf20Sopenharmony_ci * that cannot be marked as UC by fixed/variable range MTRRs. 
 * Depending on the alignment of E820 ranges, this may possibly result
 * in using smaller size (i.e. 4K instead of 2M or 1G) page tables.
 *
 * init_mem_mapping() calls init_range_memory_mapping() with big range.
 * That range would have hole in the middle or ends, and only ram parts
 * will be mapped in init_range_memory_mapping().
 */
static unsigned long __init init_range_memory_mapping(
					   unsigned long r_start,
					   unsigned long r_end)
{
	unsigned long start_pfn, end_pfn;
	unsigned long mapped_ram_size = 0;
	int i;

	/* Walk every memblock RAM range, clamped to [r_start, r_end). */
	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
		u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
		u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
		if (start >= end)
			continue;

		/*
		 * if it is overlapping with brk pgt, we need to
		 * alloc pgt buf from memblock instead.
		 */
		can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
				    min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
		init_memory_mapping(start, end, PAGE_KERNEL);
		mapped_ram_size += end - start;
		can_use_brk_pgt = true;
	}

	return mapped_ram_size;
}

/*
 * Grow the mapping chunk size by one less than a full page-table level
 * so the brk pgt buffer cannot be outgrown; see the comment below.
 */
static unsigned long __init get_new_step_size(unsigned long step_size)
{
	/*
	 * Initial mapped size is PMD_SIZE (2M).
	 * We can not set step_size to be PUD_SIZE (1G) yet.
	 * In worse case, when we cross the 1G boundary, and
	 * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k)
	 * to map 1G range with PTE. Hence we use one less than the
	 * difference of page table level shifts.
	 *
	 * Don't need to worry about overflow in the top-down case, on 32bit,
	 * when step_size is 0, round_down() returns 0 for start, and that
	 * turns it into 0x100000000ULL.
	 * In the bottom-up case, round_up(x, 0) returns 0 though too, which
	 * needs to be taken into consideration by the code below.
6198c2ecf20Sopenharmony_ci */ 6208c2ecf20Sopenharmony_ci return step_size << (PMD_SHIFT - PAGE_SHIFT - 1); 6218c2ecf20Sopenharmony_ci} 6228c2ecf20Sopenharmony_ci 6238c2ecf20Sopenharmony_ci/** 6248c2ecf20Sopenharmony_ci * memory_map_top_down - Map [map_start, map_end) top down 6258c2ecf20Sopenharmony_ci * @map_start: start address of the target memory range 6268c2ecf20Sopenharmony_ci * @map_end: end address of the target memory range 6278c2ecf20Sopenharmony_ci * 6288c2ecf20Sopenharmony_ci * This function will setup direct mapping for memory range 6298c2ecf20Sopenharmony_ci * [map_start, map_end) in top-down. That said, the page tables 6308c2ecf20Sopenharmony_ci * will be allocated at the end of the memory, and we map the 6318c2ecf20Sopenharmony_ci * memory in top-down. 6328c2ecf20Sopenharmony_ci */ 6338c2ecf20Sopenharmony_cistatic void __init memory_map_top_down(unsigned long map_start, 6348c2ecf20Sopenharmony_ci unsigned long map_end) 6358c2ecf20Sopenharmony_ci{ 6368c2ecf20Sopenharmony_ci unsigned long real_end, start, last_start; 6378c2ecf20Sopenharmony_ci unsigned long step_size; 6388c2ecf20Sopenharmony_ci unsigned long addr; 6398c2ecf20Sopenharmony_ci unsigned long mapped_ram_size = 0; 6408c2ecf20Sopenharmony_ci 6418c2ecf20Sopenharmony_ci /* xen has big range in reserved near end of ram, skip it at first.*/ 6428c2ecf20Sopenharmony_ci addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE); 6438c2ecf20Sopenharmony_ci real_end = addr + PMD_SIZE; 6448c2ecf20Sopenharmony_ci 6458c2ecf20Sopenharmony_ci /* step_size need to be small so pgt_buf from BRK could cover it */ 6468c2ecf20Sopenharmony_ci step_size = PMD_SIZE; 6478c2ecf20Sopenharmony_ci max_pfn_mapped = 0; /* will get exact value next */ 6488c2ecf20Sopenharmony_ci min_pfn_mapped = real_end >> PAGE_SHIFT; 6498c2ecf20Sopenharmony_ci last_start = start = real_end; 6508c2ecf20Sopenharmony_ci 6518c2ecf20Sopenharmony_ci /* 6528c2ecf20Sopenharmony_ci * We start from the top (end of memory) and go to the 
bottom. 6538c2ecf20Sopenharmony_ci * The memblock_find_in_range() gets us a block of RAM from the 6548c2ecf20Sopenharmony_ci * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages 6558c2ecf20Sopenharmony_ci * for page table. 6568c2ecf20Sopenharmony_ci */ 6578c2ecf20Sopenharmony_ci while (last_start > map_start) { 6588c2ecf20Sopenharmony_ci if (last_start > step_size) { 6598c2ecf20Sopenharmony_ci start = round_down(last_start - 1, step_size); 6608c2ecf20Sopenharmony_ci if (start < map_start) 6618c2ecf20Sopenharmony_ci start = map_start; 6628c2ecf20Sopenharmony_ci } else 6638c2ecf20Sopenharmony_ci start = map_start; 6648c2ecf20Sopenharmony_ci mapped_ram_size += init_range_memory_mapping(start, 6658c2ecf20Sopenharmony_ci last_start); 6668c2ecf20Sopenharmony_ci last_start = start; 6678c2ecf20Sopenharmony_ci min_pfn_mapped = last_start >> PAGE_SHIFT; 6688c2ecf20Sopenharmony_ci if (mapped_ram_size >= step_size) 6698c2ecf20Sopenharmony_ci step_size = get_new_step_size(step_size); 6708c2ecf20Sopenharmony_ci } 6718c2ecf20Sopenharmony_ci 6728c2ecf20Sopenharmony_ci if (real_end < map_end) 6738c2ecf20Sopenharmony_ci init_range_memory_mapping(real_end, map_end); 6748c2ecf20Sopenharmony_ci} 6758c2ecf20Sopenharmony_ci 6768c2ecf20Sopenharmony_ci/** 6778c2ecf20Sopenharmony_ci * memory_map_bottom_up - Map [map_start, map_end) bottom up 6788c2ecf20Sopenharmony_ci * @map_start: start address of the target memory range 6798c2ecf20Sopenharmony_ci * @map_end: end address of the target memory range 6808c2ecf20Sopenharmony_ci * 6818c2ecf20Sopenharmony_ci * This function will setup direct mapping for memory range 6828c2ecf20Sopenharmony_ci * [map_start, map_end) in bottom-up. Since we have limited the 6838c2ecf20Sopenharmony_ci * bottom-up allocation above the kernel, the page tables will 6848c2ecf20Sopenharmony_ci * be allocated just above the kernel and we map the memory 6858c2ecf20Sopenharmony_ci * in [map_start, map_end) in bottom-up. 
6868c2ecf20Sopenharmony_ci */ 6878c2ecf20Sopenharmony_cistatic void __init memory_map_bottom_up(unsigned long map_start, 6888c2ecf20Sopenharmony_ci unsigned long map_end) 6898c2ecf20Sopenharmony_ci{ 6908c2ecf20Sopenharmony_ci unsigned long next, start; 6918c2ecf20Sopenharmony_ci unsigned long mapped_ram_size = 0; 6928c2ecf20Sopenharmony_ci /* step_size need to be small so pgt_buf from BRK could cover it */ 6938c2ecf20Sopenharmony_ci unsigned long step_size = PMD_SIZE; 6948c2ecf20Sopenharmony_ci 6958c2ecf20Sopenharmony_ci start = map_start; 6968c2ecf20Sopenharmony_ci min_pfn_mapped = start >> PAGE_SHIFT; 6978c2ecf20Sopenharmony_ci 6988c2ecf20Sopenharmony_ci /* 6998c2ecf20Sopenharmony_ci * We start from the bottom (@map_start) and go to the top (@map_end). 7008c2ecf20Sopenharmony_ci * The memblock_find_in_range() gets us a block of RAM from the 7018c2ecf20Sopenharmony_ci * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages 7028c2ecf20Sopenharmony_ci * for page table. 7038c2ecf20Sopenharmony_ci */ 7048c2ecf20Sopenharmony_ci while (start < map_end) { 7058c2ecf20Sopenharmony_ci if (step_size && map_end - start > step_size) { 7068c2ecf20Sopenharmony_ci next = round_up(start + 1, step_size); 7078c2ecf20Sopenharmony_ci if (next > map_end) 7088c2ecf20Sopenharmony_ci next = map_end; 7098c2ecf20Sopenharmony_ci } else { 7108c2ecf20Sopenharmony_ci next = map_end; 7118c2ecf20Sopenharmony_ci } 7128c2ecf20Sopenharmony_ci 7138c2ecf20Sopenharmony_ci mapped_ram_size += init_range_memory_mapping(start, next); 7148c2ecf20Sopenharmony_ci start = next; 7158c2ecf20Sopenharmony_ci 7168c2ecf20Sopenharmony_ci if (mapped_ram_size >= step_size) 7178c2ecf20Sopenharmony_ci step_size = get_new_step_size(step_size); 7188c2ecf20Sopenharmony_ci } 7198c2ecf20Sopenharmony_ci} 7208c2ecf20Sopenharmony_ci 7218c2ecf20Sopenharmony_ci/* 7228c2ecf20Sopenharmony_ci * The real mode trampoline, which is required for bootstrapping CPUs 7238c2ecf20Sopenharmony_ci * occupies only a small area under 
the low 1MB. See reserve_real_mode()
 * for details.
 *
 * If KASLR is disabled the first PGD entry of the direct mapping is copied
 * to map the real mode trampoline.
 *
 * If KASLR is enabled, copy only the PUD which covers the low 1MB
 * area. This limits the randomization granularity to 1GB for both 4-level
 * and 5-level paging.
 */
static void __init init_trampoline(void)
{
#ifdef CONFIG_X86_64
	/*
	 * Without KASLR the direct-map PGD entry already covers the low
	 * 1MB, so it can simply be reused for the trampoline.
	 */
	if (!kaslr_memory_enabled())
		trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
	else
		init_trampoline_kaslr();
#endif
}

void __init init_mem_mapping(void)
{
	unsigned long end;

	pti_check_boottime_disable();
	probe_page_size_mask();
	setup_pcid();

#ifdef CONFIG_X86_64
	end = max_pfn << PAGE_SHIFT;
#else
	end = max_low_pfn << PAGE_SHIFT;
#endif

	/* the ISA range is always mapped regardless of memory holes */
	init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL);

	/* Init the trampoline, possibly with KASLR memory offset */
	init_trampoline();

	/*
	 * If the allocation is in bottom-up direction, we setup direct mapping
	 * in bottom-up, otherwise we setup direct mapping in top-down.
	 */
	if (memblock_bottom_up()) {
		unsigned long kernel_end = __pa_symbol(_end);

		/*
		 * we need two separate calls here. This is because we want to
		 * allocate page tables above the kernel. So we first map
		 * [kernel_end, end) to make memory above the kernel be mapped
		 * as soon as possible. And then use page tables allocated above
		 * the kernel to map [ISA_END_ADDRESS, kernel_end).
		 */
		memory_map_bottom_up(kernel_end, end);
		memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
	} else {
		memory_map_top_down(ISA_END_ADDRESS, end);
	}

#ifdef CONFIG_X86_64
	if (max_pfn > max_low_pfn) {
		/* can we preserve max_low_pfn ?*/
		max_low_pfn = max_pfn;
	}
#else
	early_ioremap_page_table_range_init();
#endif

	/* Switch to the final page tables and flush stale translations. */
	load_cr3(swapper_pg_dir);
	__flush_tlb_all();

	/* Give the hypervisor (if any) a chance to fix up the mappings. */
	x86_init.hyper.init_mem_mapping();

	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
}

/*
 * Initialize an mm_struct to be used during poking and a pointer to be used
 * during patching.
 */
void __init poking_init(void)
{
	spinlock_t *ptl;
	pte_t *ptep;

	poking_mm = mm_alloc();
	BUG_ON(!poking_mm);

	/* Xen PV guests need the PGD to be pinned. */
	paravirt_arch_dup_mmap(NULL, poking_mm);

	/*
	 * Randomize the poking address, but make sure that the following page
	 * will be mapped at the same PMD. We need 2 pages, so find space for 3,
	 * and adjust the address if the PMD ends after the first one.
	 */
	poking_addr = TASK_UNMAPPED_BASE;
	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
		poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
			(TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);

	/* If the second page would cross a PMD, shift both pages up by one. */
	if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
		poking_addr += PAGE_SIZE;

	/*
	 * We need to trigger the allocation of the page-tables that will be
	 * needed for poking now. Later, poking may be performed in an atomic
	 * section, which might cause allocation to fail.
	 */
	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
	BUG_ON(!ptep);
	pte_unmap_unlock(ptep, ptl);
}

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * Returns 1 to allow access, 0 to reject it, and 2 to map the page as
 * zero-filled instead of exposing its contents.
 *
 * On x86, access has to be given to the first megabyte of RAM because that
 * area traditionally contains BIOS code and data regions used by X, dosemu,
 * and similar apps. Since they map the entire memory range, the whole range
 * must be allowed (for mapping), but any areas that would otherwise be
 * disallowed are flagged as being "zero filled" instead of rejected.
 * Access has to be given to non-kernel-ram areas as well, these contain the
 * PCI mmio resources as well as potential bios/acpi data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (region_intersects(PFN_PHYS(pagenr), PAGE_SIZE,
				IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)
			!= REGION_DISJOINT) {
		/*
		 * For disallowed memory regions in the low 1MB range,
		 * request that the page be shown as all zeros.
		 */
		if (pagenr < 256)
			return 2;

		return 0;
	}

	/*
	 * This must follow RAM test, since System RAM is considered a
	 * restricted resource under CONFIG_STRICT_IOMEM.
	 */
	if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) {
		/* Low 1MB bypasses iomem restrictions. */
		if (pagenr < 256)
			return 1;

		return 0;
	}

	return 1;
}

void free_init_pages(const char *what, unsigned long begin, unsigned long end)
{
	unsigned long begin_aligned, end_aligned;

	/* Make sure boundaries are page aligned */
	begin_aligned = PAGE_ALIGN(begin);
	end_aligned   = end & PAGE_MASK;

	/* Warn on unaligned callers, then shrink to the aligned interior. */
	if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
		begin = begin_aligned;
		end   = end_aligned;
	}

	if (begin >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
	if (debug_pagealloc_enabled()) {
		pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n",
			begin, end - 1);
		/*
		 * Inform kmemleak about the hole in the memory since the
		 * corresponding pages will be unmapped.
		 */
		kmemleak_free_part((void *)begin, end - begin);
		set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
	} else {
		/*
		 * We just marked the kernel text read only above, now that
		 * we are going to free part of that, we need to make that
		 * writeable and non-executable first.
		 */
		set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
		set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);

		free_reserved_area((void *)begin, (void *)end,
				   POISON_FREE_INITMEM, what);
	}
}

/*
 * begin/end can be in the direct map or the "high kernel mapping"
 * used for the kernel image only. free_init_pages() will do the
 * right thing for either kind of address.
 */
void free_kernel_image_pages(const char *what, void *begin, void *end)
{
	unsigned long begin_ul = (unsigned long)begin;
	unsigned long end_ul = (unsigned long)end;
	unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT;

	free_init_pages(what, begin_ul, end_ul);

	/*
	 * PTI maps some of the kernel into userspace. For performance,
	 * this includes some kernel areas that do not contain secrets.
	 * Those areas might be adjacent to the parts of the kernel image
	 * being freed, which may contain secrets. Remove the "high kernel
	 * image mapping" for these freed areas, ensuring they are not even
	 * potentially vulnerable to Meltdown regardless of the specific
	 * optimizations PTI is currently using.
	 *
	 * The "noalias" prevents unmapping the direct map alias which is
	 * needed to access the freed pages.
	 *
	 * This is only valid for 64bit kernels. 32bit has only one mapping
	 * which can't be treated in this way for obvious reasons.
	 */
	if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI))
		set_memory_np_noalias(begin_ul, len_pages);
}

void __ref free_initmem(void)
{
	/* E820 tables live in initmem; move them out before freeing it. */
	e820__reallocate_tables();

	mem_encrypt_free_decrypted_mem();

	free_kernel_image_pages("unused kernel image (initmem)",
				&__init_begin, &__init_end);
}

#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
	/*
	 * end could be not aligned, and We can not align that,
	 * decompressor could be confused by aligned initrd_end
	 * We already reserve the end partial page before in
	 *    - i386_start_kernel()
	 *    - x86_64_start_kernel()
	 *    - relocate_initrd()
	 * So here We can do PAGE_ALIGN() safely to get partial page to be freed
	 */
	free_init_pages("initrd", start, PAGE_ALIGN(end));
}
#endif

/*
 * Calculate the precise size of the DMA zone (first 16 MB of RAM),
 * and pass it to the MM layer - to help it set zone watermarks more
 * accurately.
 *
 * Done on 64-bit systems only for the time being, although 32-bit systems
 * might benefit from this as well.
 */
void __init memblock_find_dma_reserve(void)
{
#ifdef CONFIG_X86_64
	u64 nr_pages = 0, nr_free_pages = 0;
	unsigned long start_pfn, end_pfn;
	phys_addr_t start_addr, end_addr;
	int i;
	u64 u;

	/*
	 * Iterate over all memory ranges (free and reserved ones alike),
	 * to calculate the total number of pages in the first 16 MB of RAM:
	 */
	nr_pages = 0;
	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
		start_pfn = min(start_pfn, MAX_DMA_PFN);
		end_pfn   = min(end_pfn,   MAX_DMA_PFN);

		nr_pages += end_pfn - start_pfn;
	}

	/*
	 * Iterate over free memory ranges to calculate the number of free
	 * pages in the DMA zone, while not counting potential partial
	 * pages at the beginning or the end of the range:
	 */
	nr_free_pages = 0;
	for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
		start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN);
		end_pfn   = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN);

		if (start_pfn < end_pfn)
			nr_free_pages += end_pfn - start_pfn;
	}

	/* Reserved pages in the DMA zone = total pages - free pages. */
	set_dma_reserve(nr_pages - nr_free_pages);
#endif
}

void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));

#ifdef CONFIG_ZONE_DMA
	max_zone_pfns[ZONE_DMA]		= min(MAX_DMA_PFN, max_low_pfn);
#endif
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32]	= min(MAX_DMA32_PFN, max_low_pfn);
#endif
	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM]	= max_pfn;
#endif

	free_area_init(max_zone_pfns);
}

__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
	.loaded_mm = &init_mm,
	.next_asid = 1,
	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
};

void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{
	/* entry 0 MUST be WB (hardwired to speed up translations) */
	BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB);

	/* Keep the forward and reverse translation tables in sync. */
	__cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
	__pte2cachemode_tbl[entry] = cache;
}

#ifdef CONFIG_SWAP
unsigned long max_swapfile_size(void)
{
	unsigned long pages;

	pages = generic_max_swapfile_size();

	if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) {
		/* Limit the swap file size to MAX_PA/2 for L1TF workaround */
		unsigned long long l1tf_limit = l1tf_pfn_limit();
		/*
		 * We encode swap offsets also with 3 bits below those for pfn
		 * which makes the usable limit higher.
		 */
#if CONFIG_PGTABLE_LEVELS > 2
		l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
#endif
		pages = min_t(unsigned long long, l1tf_limit, pages);
	}
	return pages;
}
#endif