162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * PowerPC version 462306a36Sopenharmony_ci * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) 562306a36Sopenharmony_ci * 662306a36Sopenharmony_ci * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) 762306a36Sopenharmony_ci * and Cort Dougan (PReP) (cort@cs.nmt.edu) 862306a36Sopenharmony_ci * Copyright (C) 1996 Paul Mackerras 962306a36Sopenharmony_ci * 1062306a36Sopenharmony_ci * Derived from "arch/i386/mm/init.c" 1162306a36Sopenharmony_ci * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 1262306a36Sopenharmony_ci * 1362306a36Sopenharmony_ci * Dave Engebretsen <engebret@us.ibm.com> 1462306a36Sopenharmony_ci * Rework for PPC64 port. 1562306a36Sopenharmony_ci */ 1662306a36Sopenharmony_ci 1762306a36Sopenharmony_ci#undef DEBUG 1862306a36Sopenharmony_ci 1962306a36Sopenharmony_ci#include <linux/signal.h> 2062306a36Sopenharmony_ci#include <linux/sched.h> 2162306a36Sopenharmony_ci#include <linux/kernel.h> 2262306a36Sopenharmony_ci#include <linux/errno.h> 2362306a36Sopenharmony_ci#include <linux/string.h> 2462306a36Sopenharmony_ci#include <linux/types.h> 2562306a36Sopenharmony_ci#include <linux/mman.h> 2662306a36Sopenharmony_ci#include <linux/mm.h> 2762306a36Sopenharmony_ci#include <linux/swap.h> 2862306a36Sopenharmony_ci#include <linux/stddef.h> 2962306a36Sopenharmony_ci#include <linux/vmalloc.h> 3062306a36Sopenharmony_ci#include <linux/init.h> 3162306a36Sopenharmony_ci#include <linux/delay.h> 3262306a36Sopenharmony_ci#include <linux/highmem.h> 3362306a36Sopenharmony_ci#include <linux/idr.h> 3462306a36Sopenharmony_ci#include <linux/nodemask.h> 3562306a36Sopenharmony_ci#include <linux/module.h> 3662306a36Sopenharmony_ci#include <linux/poison.h> 3762306a36Sopenharmony_ci#include <linux/memblock.h> 3862306a36Sopenharmony_ci#include <linux/hugetlb.h> 3962306a36Sopenharmony_ci#include <linux/slab.h> 4062306a36Sopenharmony_ci#include <linux/of_fdt.h> 4162306a36Sopenharmony_ci#include <linux/libfdt.h> 4262306a36Sopenharmony_ci#include <linux/memremap.h> 4362306a36Sopenharmony_ci#include <linux/memory.h> 4462306a36Sopenharmony_ci 4562306a36Sopenharmony_ci#include <asm/pgalloc.h> 4662306a36Sopenharmony_ci#include <asm/page.h> 4762306a36Sopenharmony_ci#include <asm/prom.h> 4862306a36Sopenharmony_ci#include <asm/rtas.h> 4962306a36Sopenharmony_ci#include <asm/io.h> 5062306a36Sopenharmony_ci#include <asm/mmu_context.h> 5162306a36Sopenharmony_ci#include <asm/mmu.h> 5262306a36Sopenharmony_ci#include <linux/uaccess.h> 5362306a36Sopenharmony_ci#include <asm/smp.h> 5462306a36Sopenharmony_ci#include <asm/machdep.h> 5562306a36Sopenharmony_ci#include <asm/tlb.h> 5662306a36Sopenharmony_ci#include <asm/eeh.h> 5762306a36Sopenharmony_ci#include <asm/processor.h> 5862306a36Sopenharmony_ci#include <asm/mmzone.h> 5962306a36Sopenharmony_ci#include <asm/cputable.h> 6062306a36Sopenharmony_ci#include <asm/sections.h> 6162306a36Sopenharmony_ci#include <asm/iommu.h> 6262306a36Sopenharmony_ci#include <asm/vdso.h> 6362306a36Sopenharmony_ci#include <asm/hugetlb.h> 6462306a36Sopenharmony_ci 6562306a36Sopenharmony_ci#include <mm/mmu_decl.h> 6662306a36Sopenharmony_ci 6762306a36Sopenharmony_ci#ifdef CONFIG_SPARSEMEM_VMEMMAP 6862306a36Sopenharmony_ci/* 6962306a36Sopenharmony_ci * Given an address within the vmemmap, determine the page that 7062306a36Sopenharmony_ci * represents the start of the subsection it is within. Note that we have to 7162306a36Sopenharmony_ci * do this by hand as the proffered address may not be correctly aligned. 7262306a36Sopenharmony_ci * Subtraction of non-aligned pointers produces undefined results. 7362306a36Sopenharmony_ci */ 7462306a36Sopenharmony_cistatic struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_addr) 7562306a36Sopenharmony_ci{ 7662306a36Sopenharmony_ci unsigned long start_pfn; 7762306a36Sopenharmony_ci unsigned long offset = vmemmap_addr - ((unsigned long)(vmemmap)); 7862306a36Sopenharmony_ci 7962306a36Sopenharmony_ci /* Return the pfn of the start of the section. */ 8062306a36Sopenharmony_ci start_pfn = (offset / sizeof(struct page)) & PAGE_SUBSECTION_MASK; 8162306a36Sopenharmony_ci return pfn_to_page(start_pfn); 8262306a36Sopenharmony_ci} 8362306a36Sopenharmony_ci 8462306a36Sopenharmony_ci/* 8562306a36Sopenharmony_ci * Since memory is added in sub-section chunks, before creating a new vmemmap 8662306a36Sopenharmony_ci * mapping, the kernel should check whether there is an existing memmap mapping 8762306a36Sopenharmony_ci * covering the new subsection added. This is needed because kernel can map 8862306a36Sopenharmony_ci * vmemmap area using 16MB pages which will cover a memory range of 16G. Such 8962306a36Sopenharmony_ci * a range covers multiple subsections (2M) 9062306a36Sopenharmony_ci * 9162306a36Sopenharmony_ci * If any subsection in the 16G range mapped by vmemmap is valid we consider the 9262306a36Sopenharmony_ci * vmemmap populated (There is a page table entry already present). We can't do 9362306a36Sopenharmony_ci * a page table lookup here because with the hash translation we don't keep 9462306a36Sopenharmony_ci * vmemmap details in linux page table. 9562306a36Sopenharmony_ci */ 9662306a36Sopenharmony_ciint __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size) 9762306a36Sopenharmony_ci{ 9862306a36Sopenharmony_ci struct page *start; 9962306a36Sopenharmony_ci unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size; 10062306a36Sopenharmony_ci start = vmemmap_subsection_start(vmemmap_addr); 10162306a36Sopenharmony_ci 10262306a36Sopenharmony_ci for (; (unsigned long)start < vmemmap_end; start += PAGES_PER_SUBSECTION) 10362306a36Sopenharmony_ci /* 10462306a36Sopenharmony_ci * pfn valid check here is intended to really check 10562306a36Sopenharmony_ci * whether we have any subsection already initialized 10662306a36Sopenharmony_ci * in this range. 10762306a36Sopenharmony_ci */ 10862306a36Sopenharmony_ci if (pfn_valid(page_to_pfn(start))) 10962306a36Sopenharmony_ci return 1; 11062306a36Sopenharmony_ci 11162306a36Sopenharmony_ci return 0; 11262306a36Sopenharmony_ci} 11362306a36Sopenharmony_ci 11462306a36Sopenharmony_ci/* 11562306a36Sopenharmony_ci * vmemmap virtual address space management does not have a traditional page 11662306a36Sopenharmony_ci * table to track which virtual struct pages are backed by physical mapping. 11762306a36Sopenharmony_ci * The virtual to physical mappings are tracked in a simple linked list 11862306a36Sopenharmony_ci * format. 'vmemmap_list' maintains the entire vmemmap physical mapping at 11962306a36Sopenharmony_ci * all times where as the 'next' list maintains the available 12062306a36Sopenharmony_ci * vmemmap_backing structures which have been deleted from the 12162306a36Sopenharmony_ci * 'vmemmap_global' list during system runtime (memory hotplug remove 12262306a36Sopenharmony_ci * operation). The freed 'vmemmap_backing' structures are reused later when 12362306a36Sopenharmony_ci * new requests come in without allocating fresh memory. This pointer also 12462306a36Sopenharmony_ci * tracks the allocated 'vmemmap_backing' structures as we allocate one 12562306a36Sopenharmony_ci * full page memory at a time when we dont have any. 12662306a36Sopenharmony_ci */ 12762306a36Sopenharmony_cistruct vmemmap_backing *vmemmap_list; 12862306a36Sopenharmony_cistatic struct vmemmap_backing *next; 12962306a36Sopenharmony_ci 13062306a36Sopenharmony_ci/* 13162306a36Sopenharmony_ci * The same pointer 'next' tracks individual chunks inside the allocated 13262306a36Sopenharmony_ci * full page during the boot time and again tracks the freed nodes during 13362306a36Sopenharmony_ci * runtime. It is racy but it does not happen as they are separated by the 13462306a36Sopenharmony_ci * boot process. Will create problem if some how we have memory hotplug 13562306a36Sopenharmony_ci * operation during boot !! 13662306a36Sopenharmony_ci */ 13762306a36Sopenharmony_cistatic int num_left; 13862306a36Sopenharmony_cistatic int num_freed; 13962306a36Sopenharmony_ci 14062306a36Sopenharmony_cistatic __meminit struct vmemmap_backing * vmemmap_list_alloc(int node) 14162306a36Sopenharmony_ci{ 14262306a36Sopenharmony_ci struct vmemmap_backing *vmem_back; 14362306a36Sopenharmony_ci /* get from freed entries first */ 14462306a36Sopenharmony_ci if (num_freed) { 14562306a36Sopenharmony_ci num_freed--; 14662306a36Sopenharmony_ci vmem_back = next; 14762306a36Sopenharmony_ci next = next->list; 14862306a36Sopenharmony_ci 14962306a36Sopenharmony_ci return vmem_back; 15062306a36Sopenharmony_ci } 15162306a36Sopenharmony_ci 15262306a36Sopenharmony_ci /* allocate a page when required and hand out chunks */ 15362306a36Sopenharmony_ci if (!num_left) { 15462306a36Sopenharmony_ci next = vmemmap_alloc_block(PAGE_SIZE, node); 15562306a36Sopenharmony_ci if (unlikely(!next)) { 15662306a36Sopenharmony_ci WARN_ON(1); 15762306a36Sopenharmony_ci return NULL; 15862306a36Sopenharmony_ci } 15962306a36Sopenharmony_ci num_left = PAGE_SIZE / sizeof(struct vmemmap_backing); 16062306a36Sopenharmony_ci } 16162306a36Sopenharmony_ci 16262306a36Sopenharmony_ci num_left--; 16362306a36Sopenharmony_ci 16462306a36Sopenharmony_ci return next++; 16562306a36Sopenharmony_ci} 16662306a36Sopenharmony_ci 16762306a36Sopenharmony_cistatic __meminit int vmemmap_list_populate(unsigned long phys, 16862306a36Sopenharmony_ci unsigned long start, 16962306a36Sopenharmony_ci int node) 17062306a36Sopenharmony_ci{ 17162306a36Sopenharmony_ci struct vmemmap_backing *vmem_back; 17262306a36Sopenharmony_ci 17362306a36Sopenharmony_ci vmem_back = vmemmap_list_alloc(node); 17462306a36Sopenharmony_ci if (unlikely(!vmem_back)) { 17562306a36Sopenharmony_ci pr_debug("vmemap list allocation failed\n"); 17662306a36Sopenharmony_ci return -ENOMEM; 17762306a36Sopenharmony_ci } 17862306a36Sopenharmony_ci 17962306a36Sopenharmony_ci vmem_back->phys = phys; 18062306a36Sopenharmony_ci vmem_back->virt_addr = start; 18162306a36Sopenharmony_ci vmem_back->list = vmemmap_list; 18262306a36Sopenharmony_ci 18362306a36Sopenharmony_ci vmemmap_list = vmem_back; 18462306a36Sopenharmony_ci return 0; 18562306a36Sopenharmony_ci} 18662306a36Sopenharmony_ci 18762306a36Sopenharmony_cibool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, 18862306a36Sopenharmony_ci unsigned long page_size) 18962306a36Sopenharmony_ci{ 19062306a36Sopenharmony_ci unsigned long nr_pfn = page_size / sizeof(struct page); 19162306a36Sopenharmony_ci unsigned long start_pfn = page_to_pfn((struct page *)start); 19262306a36Sopenharmony_ci 19362306a36Sopenharmony_ci if ((start_pfn + nr_pfn - 1) > altmap->end_pfn) 19462306a36Sopenharmony_ci return true; 19562306a36Sopenharmony_ci 19662306a36Sopenharmony_ci if (start_pfn < altmap->base_pfn) 19762306a36Sopenharmony_ci return true; 19862306a36Sopenharmony_ci 19962306a36Sopenharmony_ci return false; 20062306a36Sopenharmony_ci} 20162306a36Sopenharmony_ci 20262306a36Sopenharmony_cistatic int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node, 20362306a36Sopenharmony_ci struct vmem_altmap *altmap) 20462306a36Sopenharmony_ci{ 20562306a36Sopenharmony_ci bool altmap_alloc; 20662306a36Sopenharmony_ci unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; 20762306a36Sopenharmony_ci 20862306a36Sopenharmony_ci /* Align to the page size of the linear mapping. */ 20962306a36Sopenharmony_ci start = ALIGN_DOWN(start, page_size); 21062306a36Sopenharmony_ci 21162306a36Sopenharmony_ci pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node); 21262306a36Sopenharmony_ci 21362306a36Sopenharmony_ci for (; start < end; start += page_size) { 21462306a36Sopenharmony_ci void *p = NULL; 21562306a36Sopenharmony_ci int rc; 21662306a36Sopenharmony_ci 21762306a36Sopenharmony_ci /* 21862306a36Sopenharmony_ci * This vmemmap range is backing different subsections. If any 21962306a36Sopenharmony_ci * of that subsection is marked valid, that means we already 22062306a36Sopenharmony_ci * have initialized a page table covering this range and hence 22162306a36Sopenharmony_ci * the vmemmap range is populated. 22262306a36Sopenharmony_ci */ 22362306a36Sopenharmony_ci if (vmemmap_populated(start, page_size)) 22462306a36Sopenharmony_ci continue; 22562306a36Sopenharmony_ci 22662306a36Sopenharmony_ci /* 22762306a36Sopenharmony_ci * Allocate from the altmap first if we have one. This may 22862306a36Sopenharmony_ci * fail due to alignment issues when using 16MB hugepages, so 22962306a36Sopenharmony_ci * fall back to system memory if the altmap allocation fail. 23062306a36Sopenharmony_ci */ 23162306a36Sopenharmony_ci if (altmap && !altmap_cross_boundary(altmap, start, page_size)) { 23262306a36Sopenharmony_ci p = vmemmap_alloc_block_buf(page_size, node, altmap); 23362306a36Sopenharmony_ci if (!p) 23462306a36Sopenharmony_ci pr_debug("altmap block allocation failed, falling back to system memory"); 23562306a36Sopenharmony_ci else 23662306a36Sopenharmony_ci altmap_alloc = true; 23762306a36Sopenharmony_ci } 23862306a36Sopenharmony_ci if (!p) { 23962306a36Sopenharmony_ci p = vmemmap_alloc_block_buf(page_size, node, NULL); 24062306a36Sopenharmony_ci altmap_alloc = false; 24162306a36Sopenharmony_ci } 24262306a36Sopenharmony_ci if (!p) 24362306a36Sopenharmony_ci return -ENOMEM; 24462306a36Sopenharmony_ci 24562306a36Sopenharmony_ci if (vmemmap_list_populate(__pa(p), start, node)) { 24662306a36Sopenharmony_ci /* 24762306a36Sopenharmony_ci * If we don't populate vmemap list, we don't have 24862306a36Sopenharmony_ci * the ability to free the allocated vmemmap 24962306a36Sopenharmony_ci * pages in section_deactivate. Hence free them 25062306a36Sopenharmony_ci * here. 25162306a36Sopenharmony_ci */ 25262306a36Sopenharmony_ci int nr_pfns = page_size >> PAGE_SHIFT; 25362306a36Sopenharmony_ci unsigned long page_order = get_order(page_size); 25462306a36Sopenharmony_ci 25562306a36Sopenharmony_ci if (altmap_alloc) 25662306a36Sopenharmony_ci vmem_altmap_free(altmap, nr_pfns); 25762306a36Sopenharmony_ci else 25862306a36Sopenharmony_ci free_pages((unsigned long)p, page_order); 25962306a36Sopenharmony_ci return -ENOMEM; 26062306a36Sopenharmony_ci } 26162306a36Sopenharmony_ci 26262306a36Sopenharmony_ci pr_debug(" * %016lx..%016lx allocated at %p\n", 26362306a36Sopenharmony_ci start, start + page_size, p); 26462306a36Sopenharmony_ci 26562306a36Sopenharmony_ci rc = vmemmap_create_mapping(start, page_size, __pa(p)); 26662306a36Sopenharmony_ci if (rc < 0) { 26762306a36Sopenharmony_ci pr_warn("%s: Unable to create vmemmap mapping: %d\n", 26862306a36Sopenharmony_ci __func__, rc); 26962306a36Sopenharmony_ci return -EFAULT; 27062306a36Sopenharmony_ci } 27162306a36Sopenharmony_ci } 27262306a36Sopenharmony_ci 27362306a36Sopenharmony_ci return 0; 27462306a36Sopenharmony_ci} 27562306a36Sopenharmony_ci 27662306a36Sopenharmony_ciint __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, 27762306a36Sopenharmony_ci struct vmem_altmap *altmap) 27862306a36Sopenharmony_ci{ 27962306a36Sopenharmony_ci 28062306a36Sopenharmony_ci#ifdef CONFIG_PPC_BOOK3S_64 28162306a36Sopenharmony_ci if (radix_enabled()) 28262306a36Sopenharmony_ci return radix__vmemmap_populate(start, end, node, altmap); 28362306a36Sopenharmony_ci#endif 28462306a36Sopenharmony_ci 28562306a36Sopenharmony_ci return __vmemmap_populate(start, end, node, altmap); 28662306a36Sopenharmony_ci} 28762306a36Sopenharmony_ci 28862306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_HOTPLUG 28962306a36Sopenharmony_cistatic unsigned long vmemmap_list_free(unsigned long start) 29062306a36Sopenharmony_ci{ 29162306a36Sopenharmony_ci struct vmemmap_backing *vmem_back, *vmem_back_prev; 29262306a36Sopenharmony_ci 29362306a36Sopenharmony_ci vmem_back_prev = vmem_back = vmemmap_list; 29462306a36Sopenharmony_ci 29562306a36Sopenharmony_ci /* look for it with prev pointer recorded */ 29662306a36Sopenharmony_ci for (; vmem_back; vmem_back = vmem_back->list) { 29762306a36Sopenharmony_ci if (vmem_back->virt_addr == start) 29862306a36Sopenharmony_ci break; 29962306a36Sopenharmony_ci vmem_back_prev = vmem_back; 30062306a36Sopenharmony_ci } 30162306a36Sopenharmony_ci 30262306a36Sopenharmony_ci if (unlikely(!vmem_back)) 30362306a36Sopenharmony_ci return 0; 30462306a36Sopenharmony_ci 30562306a36Sopenharmony_ci /* remove it from vmemmap_list */ 30662306a36Sopenharmony_ci if (vmem_back == vmemmap_list) /* remove head */ 30762306a36Sopenharmony_ci vmemmap_list = vmem_back->list; 30862306a36Sopenharmony_ci else 30962306a36Sopenharmony_ci vmem_back_prev->list = vmem_back->list; 31062306a36Sopenharmony_ci 31162306a36Sopenharmony_ci /* next point to this freed entry */ 31262306a36Sopenharmony_ci vmem_back->list = next; 31362306a36Sopenharmony_ci next = vmem_back; 31462306a36Sopenharmony_ci num_freed++; 31562306a36Sopenharmony_ci 31662306a36Sopenharmony_ci return vmem_back->phys; 31762306a36Sopenharmony_ci} 31862306a36Sopenharmony_ci 31962306a36Sopenharmony_cistatic void __ref __vmemmap_free(unsigned long start, unsigned long end, 32062306a36Sopenharmony_ci struct vmem_altmap *altmap) 32162306a36Sopenharmony_ci{ 32262306a36Sopenharmony_ci unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; 32362306a36Sopenharmony_ci unsigned long page_order = get_order(page_size); 32462306a36Sopenharmony_ci unsigned long alt_start = ~0, alt_end = ~0; 32562306a36Sopenharmony_ci unsigned long base_pfn; 32662306a36Sopenharmony_ci 32762306a36Sopenharmony_ci start = ALIGN_DOWN(start, page_size); 32862306a36Sopenharmony_ci if (altmap) { 32962306a36Sopenharmony_ci alt_start = altmap->base_pfn; 33062306a36Sopenharmony_ci alt_end = altmap->base_pfn + altmap->reserve + altmap->free; 33162306a36Sopenharmony_ci } 33262306a36Sopenharmony_ci 33362306a36Sopenharmony_ci pr_debug("vmemmap_free %lx...%lx\n", start, end); 33462306a36Sopenharmony_ci 33562306a36Sopenharmony_ci for (; start < end; start += page_size) { 33662306a36Sopenharmony_ci unsigned long nr_pages, addr; 33762306a36Sopenharmony_ci struct page *page; 33862306a36Sopenharmony_ci 33962306a36Sopenharmony_ci /* 34062306a36Sopenharmony_ci * We have already marked the subsection we are trying to remove 34162306a36Sopenharmony_ci * invalid. So if we want to remove the vmemmap range, we 34262306a36Sopenharmony_ci * need to make sure there is no subsection marked valid 34362306a36Sopenharmony_ci * in this range. 34462306a36Sopenharmony_ci */ 34562306a36Sopenharmony_ci if (vmemmap_populated(start, page_size)) 34662306a36Sopenharmony_ci continue; 34762306a36Sopenharmony_ci 34862306a36Sopenharmony_ci addr = vmemmap_list_free(start); 34962306a36Sopenharmony_ci if (!addr) 35062306a36Sopenharmony_ci continue; 35162306a36Sopenharmony_ci 35262306a36Sopenharmony_ci page = pfn_to_page(addr >> PAGE_SHIFT); 35362306a36Sopenharmony_ci nr_pages = 1 << page_order; 35462306a36Sopenharmony_ci base_pfn = PHYS_PFN(addr); 35562306a36Sopenharmony_ci 35662306a36Sopenharmony_ci if (base_pfn >= alt_start && base_pfn < alt_end) { 35762306a36Sopenharmony_ci vmem_altmap_free(altmap, nr_pages); 35862306a36Sopenharmony_ci } else if (PageReserved(page)) { 35962306a36Sopenharmony_ci /* allocated from bootmem */ 36062306a36Sopenharmony_ci if (page_size < PAGE_SIZE) { 36162306a36Sopenharmony_ci /* 36262306a36Sopenharmony_ci * this shouldn't happen, but if it is 36362306a36Sopenharmony_ci * the case, leave the memory there 36462306a36Sopenharmony_ci */ 36562306a36Sopenharmony_ci WARN_ON_ONCE(1); 36662306a36Sopenharmony_ci } else { 36762306a36Sopenharmony_ci while (nr_pages--) 36862306a36Sopenharmony_ci free_reserved_page(page++); 36962306a36Sopenharmony_ci } 37062306a36Sopenharmony_ci } else { 37162306a36Sopenharmony_ci free_pages((unsigned long)(__va(addr)), page_order); 37262306a36Sopenharmony_ci } 37362306a36Sopenharmony_ci 37462306a36Sopenharmony_ci vmemmap_remove_mapping(start, page_size); 37562306a36Sopenharmony_ci } 37662306a36Sopenharmony_ci} 37762306a36Sopenharmony_ci 37862306a36Sopenharmony_civoid __ref vmemmap_free(unsigned long start, unsigned long end, 37962306a36Sopenharmony_ci struct vmem_altmap *altmap) 38062306a36Sopenharmony_ci{ 38162306a36Sopenharmony_ci#ifdef CONFIG_PPC_BOOK3S_64 38262306a36Sopenharmony_ci if (radix_enabled()) 38362306a36Sopenharmony_ci return radix__vmemmap_free(start, end, altmap); 38462306a36Sopenharmony_ci#endif 38562306a36Sopenharmony_ci return __vmemmap_free(start, end, altmap); 38662306a36Sopenharmony_ci} 38762306a36Sopenharmony_ci 38862306a36Sopenharmony_ci#endif 38962306a36Sopenharmony_civoid register_page_bootmem_memmap(unsigned long section_nr, 39062306a36Sopenharmony_ci struct page *start_page, unsigned long size) 39162306a36Sopenharmony_ci{ 39262306a36Sopenharmony_ci} 39362306a36Sopenharmony_ci 39462306a36Sopenharmony_ci#endif /* CONFIG_SPARSEMEM_VMEMMAP */ 39562306a36Sopenharmony_ci 39662306a36Sopenharmony_ci#ifdef CONFIG_PPC_BOOK3S_64 39762306a36Sopenharmony_ciunsigned int mmu_lpid_bits; 39862306a36Sopenharmony_ci#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 39962306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(mmu_lpid_bits); 40062306a36Sopenharmony_ci#endif 40162306a36Sopenharmony_ciunsigned int mmu_pid_bits; 40262306a36Sopenharmony_ci 40362306a36Sopenharmony_cistatic bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); 40462306a36Sopenharmony_ci 40562306a36Sopenharmony_cistatic int __init parse_disable_radix(char *p) 40662306a36Sopenharmony_ci{ 40762306a36Sopenharmony_ci bool val; 40862306a36Sopenharmony_ci 40962306a36Sopenharmony_ci if (!p) 41062306a36Sopenharmony_ci val = true; 41162306a36Sopenharmony_ci else if (kstrtobool(p, &val)) 41262306a36Sopenharmony_ci return -EINVAL; 41362306a36Sopenharmony_ci 41462306a36Sopenharmony_ci disable_radix = val; 41562306a36Sopenharmony_ci 41662306a36Sopenharmony_ci return 0; 41762306a36Sopenharmony_ci} 41862306a36Sopenharmony_ciearly_param("disable_radix", parse_disable_radix); 41962306a36Sopenharmony_ci 42062306a36Sopenharmony_ci/* 42162306a36Sopenharmony_ci * If we're running under a hypervisor, we need to check the contents of 42262306a36Sopenharmony_ci * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do 42362306a36Sopenharmony_ci * radix. If not, we clear the radix feature bit so we fall back to hash. 42462306a36Sopenharmony_ci */ 42562306a36Sopenharmony_cistatic void __init early_check_vec5(void) 42662306a36Sopenharmony_ci{ 42762306a36Sopenharmony_ci unsigned long root, chosen; 42862306a36Sopenharmony_ci int size; 42962306a36Sopenharmony_ci const u8 *vec5; 43062306a36Sopenharmony_ci u8 mmu_supported; 43162306a36Sopenharmony_ci 43262306a36Sopenharmony_ci root = of_get_flat_dt_root(); 43362306a36Sopenharmony_ci chosen = of_get_flat_dt_subnode_by_name(root, "chosen"); 43462306a36Sopenharmony_ci if (chosen == -FDT_ERR_NOTFOUND) { 43562306a36Sopenharmony_ci cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; 43662306a36Sopenharmony_ci return; 43762306a36Sopenharmony_ci } 43862306a36Sopenharmony_ci vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size); 43962306a36Sopenharmony_ci if (!vec5) { 44062306a36Sopenharmony_ci cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; 44162306a36Sopenharmony_ci return; 44262306a36Sopenharmony_ci } 44362306a36Sopenharmony_ci if (size <= OV5_INDX(OV5_MMU_SUPPORT)) { 44462306a36Sopenharmony_ci cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; 44562306a36Sopenharmony_ci return; 44662306a36Sopenharmony_ci } 44762306a36Sopenharmony_ci 44862306a36Sopenharmony_ci /* Check for supported configuration */ 44962306a36Sopenharmony_ci mmu_supported = vec5[OV5_INDX(OV5_MMU_SUPPORT)] & 45062306a36Sopenharmony_ci OV5_FEAT(OV5_MMU_SUPPORT); 45162306a36Sopenharmony_ci if (mmu_supported == OV5_FEAT(OV5_MMU_RADIX)) { 45262306a36Sopenharmony_ci /* Hypervisor only supports radix - check enabled && GTSE */ 45362306a36Sopenharmony_ci if (!early_radix_enabled()) { 45462306a36Sopenharmony_ci pr_warn("WARNING: Ignoring cmdline option disable_radix\n"); 45562306a36Sopenharmony_ci } 45662306a36Sopenharmony_ci if (!(vec5[OV5_INDX(OV5_RADIX_GTSE)] & 45762306a36Sopenharmony_ci OV5_FEAT(OV5_RADIX_GTSE))) { 45862306a36Sopenharmony_ci cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE; 45962306a36Sopenharmony_ci } else 46062306a36Sopenharmony_ci cur_cpu_spec->mmu_features |= MMU_FTR_GTSE; 46162306a36Sopenharmony_ci /* Do radix anyway - the hypervisor said we had to */ 46262306a36Sopenharmony_ci cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX; 46362306a36Sopenharmony_ci } else if (mmu_supported == OV5_FEAT(OV5_MMU_HASH)) { 46462306a36Sopenharmony_ci /* Hypervisor only supports hash - disable radix */ 46562306a36Sopenharmony_ci cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; 46662306a36Sopenharmony_ci cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE; 46762306a36Sopenharmony_ci } 46862306a36Sopenharmony_ci} 46962306a36Sopenharmony_ci 47062306a36Sopenharmony_cistatic int __init dt_scan_mmu_pid_width(unsigned long node, 47162306a36Sopenharmony_ci const char *uname, int depth, 47262306a36Sopenharmony_ci void *data) 47362306a36Sopenharmony_ci{ 47462306a36Sopenharmony_ci int size = 0; 47562306a36Sopenharmony_ci const __be32 *prop; 47662306a36Sopenharmony_ci const char *type = of_get_flat_dt_prop(node, "device_type", NULL); 47762306a36Sopenharmony_ci 47862306a36Sopenharmony_ci /* We are scanning "cpu" nodes only */ 47962306a36Sopenharmony_ci if (type == NULL || strcmp(type, "cpu") != 0) 48062306a36Sopenharmony_ci return 0; 48162306a36Sopenharmony_ci 48262306a36Sopenharmony_ci /* Find MMU LPID, PID register size */ 48362306a36Sopenharmony_ci prop = of_get_flat_dt_prop(node, "ibm,mmu-lpid-bits", &size); 48462306a36Sopenharmony_ci if (prop && size == 4) 48562306a36Sopenharmony_ci mmu_lpid_bits = be32_to_cpup(prop); 48662306a36Sopenharmony_ci 48762306a36Sopenharmony_ci prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); 48862306a36Sopenharmony_ci if (prop && size == 4) 48962306a36Sopenharmony_ci mmu_pid_bits = be32_to_cpup(prop); 49062306a36Sopenharmony_ci 49162306a36Sopenharmony_ci if (!mmu_pid_bits && !mmu_lpid_bits) 49262306a36Sopenharmony_ci return 0; 49362306a36Sopenharmony_ci 49462306a36Sopenharmony_ci return 1; 49562306a36Sopenharmony_ci} 49662306a36Sopenharmony_ci 49762306a36Sopenharmony_ci/* 49862306a36Sopenharmony_ci * Outside hotplug the kernel uses this value to map the kernel direct map 49962306a36Sopenharmony_ci * with radix. To be compatible with older kernels, let's keep this value 50062306a36Sopenharmony_ci * as 16M which is also SECTION_SIZE with SPARSEMEM. We can ideally map 50162306a36Sopenharmony_ci * things with 1GB size in the case where we don't support hotplug. 50262306a36Sopenharmony_ci */ 50362306a36Sopenharmony_ci#ifndef CONFIG_MEMORY_HOTPLUG 50462306a36Sopenharmony_ci#define DEFAULT_MEMORY_BLOCK_SIZE SZ_16M 50562306a36Sopenharmony_ci#else 50662306a36Sopenharmony_ci#define DEFAULT_MEMORY_BLOCK_SIZE MIN_MEMORY_BLOCK_SIZE 50762306a36Sopenharmony_ci#endif 50862306a36Sopenharmony_ci 50962306a36Sopenharmony_cistatic void update_memory_block_size(unsigned long *block_size, unsigned long mem_size) 51062306a36Sopenharmony_ci{ 51162306a36Sopenharmony_ci unsigned long min_memory_block_size = DEFAULT_MEMORY_BLOCK_SIZE; 51262306a36Sopenharmony_ci 51362306a36Sopenharmony_ci for (; *block_size > min_memory_block_size; *block_size >>= 2) { 51462306a36Sopenharmony_ci if ((mem_size & *block_size) == 0) 51562306a36Sopenharmony_ci break; 51662306a36Sopenharmony_ci } 51762306a36Sopenharmony_ci} 51862306a36Sopenharmony_ci 51962306a36Sopenharmony_cistatic int __init probe_memory_block_size(unsigned long node, const char *uname, int 52062306a36Sopenharmony_ci depth, void *data) 52162306a36Sopenharmony_ci{ 52262306a36Sopenharmony_ci const char *type; 52362306a36Sopenharmony_ci unsigned long *block_size = (unsigned long *)data; 52462306a36Sopenharmony_ci const __be32 *reg, *endp; 52562306a36Sopenharmony_ci int l; 52662306a36Sopenharmony_ci 52762306a36Sopenharmony_ci if (depth != 1) 52862306a36Sopenharmony_ci return 0; 52962306a36Sopenharmony_ci /* 53062306a36Sopenharmony_ci * If we have dynamic-reconfiguration-memory node, use the 53162306a36Sopenharmony_ci * lmb value. 53262306a36Sopenharmony_ci */ 53362306a36Sopenharmony_ci if (strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) { 53462306a36Sopenharmony_ci 53562306a36Sopenharmony_ci const __be32 *prop; 53662306a36Sopenharmony_ci 53762306a36Sopenharmony_ci prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &l); 53862306a36Sopenharmony_ci 53962306a36Sopenharmony_ci if (!prop || l < dt_root_size_cells * sizeof(__be32)) 54062306a36Sopenharmony_ci /* 54162306a36Sopenharmony_ci * Nothing in the device tree 54262306a36Sopenharmony_ci */ 54362306a36Sopenharmony_ci *block_size = DEFAULT_MEMORY_BLOCK_SIZE; 54462306a36Sopenharmony_ci else 54562306a36Sopenharmony_ci *block_size = of_read_number(prop, dt_root_size_cells); 54662306a36Sopenharmony_ci /* 54762306a36Sopenharmony_ci * We have found the final value. Don't probe further. 54862306a36Sopenharmony_ci */ 54962306a36Sopenharmony_ci return 1; 55062306a36Sopenharmony_ci } 55162306a36Sopenharmony_ci /* 55262306a36Sopenharmony_ci * Find all the device tree nodes of memory type and make sure 55362306a36Sopenharmony_ci * the area can be mapped using the memory block size value 55462306a36Sopenharmony_ci * we end up using. We start with 1G value and keep reducing 55562306a36Sopenharmony_ci * it such that we can map the entire area using memory_block_size. 55662306a36Sopenharmony_ci * This will be used on powernv and older pseries that don't 55762306a36Sopenharmony_ci * have ibm,lmb-size node. 55862306a36Sopenharmony_ci * For ex: with P5 we can end up with 55962306a36Sopenharmony_ci * memory@0 -> 128MB 56062306a36Sopenharmony_ci * memory@128M -> 64M 56162306a36Sopenharmony_ci * This will end up using 64MB memory block size value. 56262306a36Sopenharmony_ci */ 56362306a36Sopenharmony_ci type = of_get_flat_dt_prop(node, "device_type", NULL); 56462306a36Sopenharmony_ci if (type == NULL || strcmp(type, "memory") != 0) 56562306a36Sopenharmony_ci return 0; 56662306a36Sopenharmony_ci 56762306a36Sopenharmony_ci reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l); 56862306a36Sopenharmony_ci if (!reg) 56962306a36Sopenharmony_ci reg = of_get_flat_dt_prop(node, "reg", &l); 57062306a36Sopenharmony_ci if (!reg) 57162306a36Sopenharmony_ci return 0; 57262306a36Sopenharmony_ci 57362306a36Sopenharmony_ci endp = reg + (l / sizeof(__be32)); 57462306a36Sopenharmony_ci while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) { 57562306a36Sopenharmony_ci const char *compatible; 57662306a36Sopenharmony_ci u64 size; 57762306a36Sopenharmony_ci 57862306a36Sopenharmony_ci dt_mem_next_cell(dt_root_addr_cells, ®); 57962306a36Sopenharmony_ci size = dt_mem_next_cell(dt_root_size_cells, ®); 58062306a36Sopenharmony_ci 58162306a36Sopenharmony_ci if (size) { 58262306a36Sopenharmony_ci update_memory_block_size(block_size, size); 58362306a36Sopenharmony_ci continue; 58462306a36Sopenharmony_ci } 58562306a36Sopenharmony_ci /* 58662306a36Sopenharmony_ci * ibm,coherent-device-memory with linux,usable-memory = 0 58762306a36Sopenharmony_ci * Force 256MiB block size. Work around for GPUs on P9 PowerNV 58862306a36Sopenharmony_ci * linux,usable-memory == 0 implies driver managed memory and 58962306a36Sopenharmony_ci * we can't use large memory block size due to hotplug/unplug 59062306a36Sopenharmony_ci * limitations. 59162306a36Sopenharmony_ci */ 59262306a36Sopenharmony_ci compatible = of_get_flat_dt_prop(node, "compatible", NULL); 59362306a36Sopenharmony_ci if (compatible && !strcmp(compatible, "ibm,coherent-device-memory")) { 59462306a36Sopenharmony_ci if (*block_size > SZ_256M) 59562306a36Sopenharmony_ci *block_size = SZ_256M; 59662306a36Sopenharmony_ci /* 59762306a36Sopenharmony_ci * We keep 256M as the upper limit with GPU present. 59862306a36Sopenharmony_ci */ 59962306a36Sopenharmony_ci return 0; 60062306a36Sopenharmony_ci } 60162306a36Sopenharmony_ci } 60262306a36Sopenharmony_ci /* continue looking for other memory device types */ 60362306a36Sopenharmony_ci return 0; 60462306a36Sopenharmony_ci} 60562306a36Sopenharmony_ci 60662306a36Sopenharmony_ci/* 60762306a36Sopenharmony_ci * start with 1G memory block size. Early init will 60862306a36Sopenharmony_ci * fix this with correct value. 60962306a36Sopenharmony_ci */ 61062306a36Sopenharmony_ciunsigned long memory_block_size __ro_after_init = 1UL << 30; 61162306a36Sopenharmony_cistatic void __init early_init_memory_block_size(void) 61262306a36Sopenharmony_ci{ 61362306a36Sopenharmony_ci /* 61462306a36Sopenharmony_ci * We need to do memory_block_size probe early so that 61562306a36Sopenharmony_ci * radix__early_init_mmu() can use this as limit for 61662306a36Sopenharmony_ci * mapping page size. 61762306a36Sopenharmony_ci */ 61862306a36Sopenharmony_ci of_scan_flat_dt(probe_memory_block_size, &memory_block_size); 61962306a36Sopenharmony_ci} 62062306a36Sopenharmony_ci 62162306a36Sopenharmony_civoid __init mmu_early_init_devtree(void) 62262306a36Sopenharmony_ci{ 62362306a36Sopenharmony_ci bool hvmode = !!(mfmsr() & MSR_HV); 62462306a36Sopenharmony_ci 62562306a36Sopenharmony_ci /* Disable radix mode based on kernel command line. */ 62662306a36Sopenharmony_ci if (disable_radix) { 62762306a36Sopenharmony_ci if (IS_ENABLED(CONFIG_PPC_64S_HASH_MMU)) 62862306a36Sopenharmony_ci cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; 62962306a36Sopenharmony_ci else 63062306a36Sopenharmony_ci pr_warn("WARNING: Ignoring cmdline option disable_radix\n"); 63162306a36Sopenharmony_ci } 63262306a36Sopenharmony_ci 63362306a36Sopenharmony_ci of_scan_flat_dt(dt_scan_mmu_pid_width, NULL); 63462306a36Sopenharmony_ci if (hvmode && !mmu_lpid_bits) { 63562306a36Sopenharmony_ci if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) 63662306a36Sopenharmony_ci mmu_lpid_bits = 12; /* POWER8-10 */ 63762306a36Sopenharmony_ci else 63862306a36Sopenharmony_ci mmu_lpid_bits = 10; /* POWER7 */ 63962306a36Sopenharmony_ci } 64062306a36Sopenharmony_ci if (!mmu_pid_bits) { 64162306a36Sopenharmony_ci if (early_cpu_has_feature(CPU_FTR_ARCH_300)) 64262306a36Sopenharmony_ci mmu_pid_bits = 20; /* POWER9-10 */ 64362306a36Sopenharmony_ci } 64462306a36Sopenharmony_ci 64562306a36Sopenharmony_ci /* 64662306a36Sopenharmony_ci * Check /chosen/ibm,architecture-vec-5 if running as a guest. 64762306a36Sopenharmony_ci * When running bare-metal, we can use radix if we like 64862306a36Sopenharmony_ci * even though the ibm,architecture-vec-5 property created by 64962306a36Sopenharmony_ci * skiboot doesn't have the necessary bits set. 65062306a36Sopenharmony_ci */ 65162306a36Sopenharmony_ci if (!hvmode) 65262306a36Sopenharmony_ci early_check_vec5(); 65362306a36Sopenharmony_ci 65462306a36Sopenharmony_ci early_init_memory_block_size(); 65562306a36Sopenharmony_ci 65662306a36Sopenharmony_ci if (early_radix_enabled()) { 65762306a36Sopenharmony_ci radix__early_init_devtree(); 65862306a36Sopenharmony_ci 65962306a36Sopenharmony_ci /* 66062306a36Sopenharmony_ci * We have finalized the translation we are going to use by now. 66162306a36Sopenharmony_ci * Radix mode is not limited by RMA / VRMA addressing. 66262306a36Sopenharmony_ci * Hence don't limit memblock allocations. 66362306a36Sopenharmony_ci */ 66462306a36Sopenharmony_ci ppc64_rma_size = ULONG_MAX; 66562306a36Sopenharmony_ci memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); 66662306a36Sopenharmony_ci } else 66762306a36Sopenharmony_ci hash__early_init_devtree(); 66862306a36Sopenharmony_ci 66962306a36Sopenharmony_ci if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)) 67062306a36Sopenharmony_ci hugetlbpage_init_defaultsize(); 67162306a36Sopenharmony_ci 67262306a36Sopenharmony_ci if (!(cur_cpu_spec->mmu_features & MMU_FTR_HPTE_TABLE) && 67362306a36Sopenharmony_ci !(cur_cpu_spec->mmu_features & MMU_FTR_TYPE_RADIX)) 67462306a36Sopenharmony_ci panic("kernel does not support any MMU type offered by platform"); 67562306a36Sopenharmony_ci} 67662306a36Sopenharmony_ci#endif /* CONFIG_PPC_BOOK3S_64 */ 677