xref: /kernel/linux/linux-5.10/arch/x86/mm/init_64.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 *  linux/arch/x86_64/mm/init.c
4 *
5 *  Copyright (C) 1995  Linus Torvalds
6 *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
7 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
8 */
9
10#include <linux/signal.h>
11#include <linux/sched.h>
12#include <linux/kernel.h>
13#include <linux/errno.h>
14#include <linux/string.h>
15#include <linux/types.h>
16#include <linux/ptrace.h>
17#include <linux/mman.h>
18#include <linux/mm.h>
19#include <linux/swap.h>
20#include <linux/smp.h>
21#include <linux/init.h>
22#include <linux/initrd.h>
23#include <linux/pagemap.h>
24#include <linux/memblock.h>
25#include <linux/proc_fs.h>
26#include <linux/pci.h>
27#include <linux/pfn.h>
28#include <linux/poison.h>
29#include <linux/dma-mapping.h>
30#include <linux/memory.h>
31#include <linux/memory_hotplug.h>
32#include <linux/memremap.h>
33#include <linux/nmi.h>
34#include <linux/gfp.h>
35#include <linux/kcore.h>
36
37#include <asm/processor.h>
38#include <asm/bios_ebda.h>
39#include <linux/uaccess.h>
40#include <asm/pgalloc.h>
41#include <asm/dma.h>
42#include <asm/fixmap.h>
43#include <asm/e820/api.h>
44#include <asm/apic.h>
45#include <asm/tlb.h>
46#include <asm/mmu_context.h>
47#include <asm/proto.h>
48#include <asm/smp.h>
49#include <asm/sections.h>
50#include <asm/kdebug.h>
51#include <asm/numa.h>
52#include <asm/set_memory.h>
53#include <asm/init.h>
54#include <asm/uv/uv.h>
55#include <asm/setup.h>
56#include <asm/ftrace.h>
57
58#include "mm_internal.h"
59
60#include "ident_map.c"
61
62#define DEFINE_POPULATE(fname, type1, type2, init)		\
63static inline void fname##_init(struct mm_struct *mm,		\
64		type1##_t *arg1, type2##_t *arg2, bool init)	\
65{								\
66	if (init)						\
67		fname##_safe(mm, arg1, arg2);			\
68	else							\
69		fname(mm, arg1, arg2);				\
70}
71
72DEFINE_POPULATE(p4d_populate, p4d, pud, init)
73DEFINE_POPULATE(pgd_populate, pgd, p4d, init)
74DEFINE_POPULATE(pud_populate, pud, pmd, init)
75DEFINE_POPULATE(pmd_populate_kernel, pmd, pte, init)
76
77#define DEFINE_ENTRY(type1, type2, init)			\
78static inline void set_##type1##_init(type1##_t *arg1,		\
79			type2##_t arg2, bool init)		\
80{								\
81	if (init)						\
82		set_##type1##_safe(arg1, arg2);			\
83	else							\
84		set_##type1(arg1, arg2);			\
85}
86
87DEFINE_ENTRY(p4d, p4d, init)
88DEFINE_ENTRY(pud, pud, init)
89DEFINE_ENTRY(pmd, pmd, init)
90DEFINE_ENTRY(pte, pte, init)
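/*
 * For illustration, DEFINE_ENTRY(pte, pte, init) above expands to roughly:
 *
 *	static inline void set_pte_init(pte_t *arg1, pte_t arg2, bool init)
 *	{
 *		if (init)
 *			set_pte_safe(arg1, arg2);
 *		else
 *			set_pte(arg1, arg2);
 *	}
 *
 * DEFINE_POPULATE() generates analogous *_init() wrappers for the
 * *_populate() helpers. The 'init' flag selects the *_safe() variants,
 * which warn if a present entry would be overwritten, while the plain
 * setters are used when an existing mapping is intentionally changed.
 */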
91
92
93/*
94 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
95 * in physical space, so we can cache the location of the first one and
96 * move around without checking the pgd every time.
97 */
98
99/* Bits supported by the hardware: */
100pteval_t __supported_pte_mask __read_mostly = ~0;
101/* Bits allowed in normal kernel mappings: */
102pteval_t __default_kernel_pte_mask __read_mostly = ~0;
103EXPORT_SYMBOL_GPL(__supported_pte_mask);
104/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */
105EXPORT_SYMBOL(__default_kernel_pte_mask);
106
107int force_personality32;
108
109/*
110 * noexec32=on|off
111 * Control the non-executable heap for 32-bit processes.
112 * To control the stack as well, use noexec=off.
113 *
114 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
115 * off	PROT_READ implies PROT_EXEC
116 */
117static int __init nonx32_setup(char *str)
118{
119	if (!strcmp(str, "on"))
120		force_personality32 &= ~READ_IMPLIES_EXEC;
121	else if (!strcmp(str, "off"))
122		force_personality32 |= READ_IMPLIES_EXEC;
123	return 1;
124}
125__setup("noexec32=", nonx32_setup);
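/*
 * Usage sketch (kernel command line), based on the parser above:
 *
 *	noexec32=off	-> sets READ_IMPLIES_EXEC in force_personality32,
 *			   so PROT_READ implies PROT_EXEC for 32-bit tasks
 *	noexec32=on	-> clears it (matches the default of 0)
 */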
126
127static void sync_global_pgds_l5(unsigned long start, unsigned long end)
128{
129	unsigned long addr;
130
131	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
132		const pgd_t *pgd_ref = pgd_offset_k(addr);
133		struct page *page;
134
135		/* Check for overflow */
136		if (addr < start)
137			break;
138
139		if (pgd_none(*pgd_ref))
140			continue;
141
142		spin_lock(&pgd_lock);
143		list_for_each_entry(page, &pgd_list, lru) {
144			pgd_t *pgd;
145			spinlock_t *pgt_lock;
146
147			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
148			/* the pgt_lock only for Xen */
149			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
150			spin_lock(pgt_lock);
151
152			if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
153				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
154
155			if (pgd_none(*pgd))
156				set_pgd(pgd, *pgd_ref);
157
158			spin_unlock(pgt_lock);
159		}
160		spin_unlock(&pgd_lock);
161	}
162}
163
164static void sync_global_pgds_l4(unsigned long start, unsigned long end)
165{
166	unsigned long addr;
167
168	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
169		pgd_t *pgd_ref = pgd_offset_k(addr);
170		const p4d_t *p4d_ref;
171		struct page *page;
172
173		/*
174		 * With a folded p4d, pgd_none() is always false, so we need to
175		 * handle synchronization at the p4d level.
176		 */
177		MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
178		p4d_ref = p4d_offset(pgd_ref, addr);
179
180		if (p4d_none(*p4d_ref))
181			continue;
182
183		spin_lock(&pgd_lock);
184		list_for_each_entry(page, &pgd_list, lru) {
185			pgd_t *pgd;
186			p4d_t *p4d;
187			spinlock_t *pgt_lock;
188
189			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
190			p4d = p4d_offset(pgd, addr);
191			/* the pgt_lock only for Xen */
192			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
193			spin_lock(pgt_lock);
194
195			if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
196				BUG_ON(p4d_pgtable(*p4d)
197				       != p4d_pgtable(*p4d_ref));
198
199			if (p4d_none(*p4d))
200				set_p4d(p4d, *p4d_ref);
201
202			spin_unlock(pgt_lock);
203		}
204		spin_unlock(&pgd_lock);
205	}
206}
207
208/*
209 * When memory is added, make sure all processes' MMs have
210 * suitable PGD entries in the local PGD-level page.
211 */
212static void sync_global_pgds(unsigned long start, unsigned long end)
213{
214	if (pgtable_l5_enabled())
215		sync_global_pgds_l5(start, end);
216	else
217		sync_global_pgds_l4(start, end);
218}
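/*
 * Within this file, sync_global_pgds() is called from
 * __kernel_physical_mapping_init() once new top-level entries have been
 * installed (pgd_changed) and from vmemmap_populate() after a successful
 * populate, so that every pgd on pgd_list picks up the new entries.
 */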
219
220/*
221 * NOTE: This function is marked __ref because it calls an __init function
222 * (memblock_alloc()). It's safe to do so ONLY when after_bootmem == 0.
223 */
224static __ref void *spp_getpage(void)
225{
226	void *ptr;
227
228	if (after_bootmem)
229		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
230	else
231		ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
232
233	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
234		panic("set_pte_phys: cannot allocate page data %s\n",
235			after_bootmem ? "after bootmem" : "");
236	}
237
238	pr_debug("spp_getpage %p\n", ptr);
239
240	return ptr;
241}
242
243static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
244{
245	if (pgd_none(*pgd)) {
246		p4d_t *p4d = (p4d_t *)spp_getpage();
247		pgd_populate(&init_mm, pgd, p4d);
248		if (p4d != p4d_offset(pgd, 0))
249			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
250			       p4d, p4d_offset(pgd, 0));
251	}
252	return p4d_offset(pgd, vaddr);
253}
254
255static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
256{
257	if (p4d_none(*p4d)) {
258		pud_t *pud = (pud_t *)spp_getpage();
259		p4d_populate(&init_mm, p4d, pud);
260		if (pud != pud_offset(p4d, 0))
261			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
262			       pud, pud_offset(p4d, 0));
263	}
264	return pud_offset(p4d, vaddr);
265}
266
267static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
268{
269	if (pud_none(*pud)) {
270		pmd_t *pmd = (pmd_t *) spp_getpage();
271		pud_populate(&init_mm, pud, pmd);
272		if (pmd != pmd_offset(pud, 0))
273			printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
274			       pmd, pmd_offset(pud, 0));
275	}
276	return pmd_offset(pud, vaddr);
277}
278
279static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
280{
281	if (pmd_none(*pmd)) {
282		pte_t *pte = (pte_t *) spp_getpage();
283		pmd_populate_kernel(&init_mm, pmd, pte);
284		if (pte != pte_offset_kernel(pmd, 0))
285			printk(KERN_ERR "PAGETABLE BUG #03!\n");
286	}
287	return pte_offset_kernel(pmd, vaddr);
288}
289
290static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
291{
292	pmd_t *pmd = fill_pmd(pud, vaddr);
293	pte_t *pte = fill_pte(pmd, vaddr);
294
295	set_pte(pte, new_pte);
296
297	/*
298	 * It's enough to flush this one mapping.
299	 * (PGE mappings get flushed as well)
300	 */
301	flush_tlb_one_kernel(vaddr);
302}
303
304void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
305{
306	p4d_t *p4d = p4d_page + p4d_index(vaddr);
307	pud_t *pud = fill_pud(p4d, vaddr);
308
309	__set_pte_vaddr(pud, vaddr, new_pte);
310}
311
312void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
313{
314	pud_t *pud = pud_page + pud_index(vaddr);
315
316	__set_pte_vaddr(pud, vaddr, new_pte);
317}
318
319void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
320{
321	pgd_t *pgd;
322	p4d_t *p4d_page;
323
324	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
325
326	pgd = pgd_offset_k(vaddr);
327	if (pgd_none(*pgd)) {
328		printk(KERN_ERR
329			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
330		return;
331	}
332
333	p4d_page = p4d_offset(pgd, 0);
334	set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
335}
336
337pmd_t * __init populate_extra_pmd(unsigned long vaddr)
338{
339	pgd_t *pgd;
340	p4d_t *p4d;
341	pud_t *pud;
342
343	pgd = pgd_offset_k(vaddr);
344	p4d = fill_p4d(pgd, vaddr);
345	pud = fill_pud(p4d, vaddr);
346	return fill_pmd(pud, vaddr);
347}
348
349pte_t * __init populate_extra_pte(unsigned long vaddr)
350{
351	pmd_t *pmd;
352
353	pmd = populate_extra_pmd(vaddr);
354	return fill_pte(pmd, vaddr);
355}
356
357/*
358 * Create large page table mappings for a range of physical addresses.
359 */
360static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
361					enum page_cache_mode cache)
362{
363	pgd_t *pgd;
364	p4d_t *p4d;
365	pud_t *pud;
366	pmd_t *pmd;
367	pgprot_t prot;
368
369	pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
370		protval_4k_2_large(cachemode2protval(cache));
371	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
372	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
373		pgd = pgd_offset_k((unsigned long)__va(phys));
374		if (pgd_none(*pgd)) {
375			p4d = (p4d_t *) spp_getpage();
376			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
377						_PAGE_USER));
378		}
379		p4d = p4d_offset(pgd, (unsigned long)__va(phys));
380		if (p4d_none(*p4d)) {
381			pud = (pud_t *) spp_getpage();
382			set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
383						_PAGE_USER));
384		}
385		pud = pud_offset(p4d, (unsigned long)__va(phys));
386		if (pud_none(*pud)) {
387			pmd = (pmd_t *) spp_getpage();
388			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
389						_PAGE_USER));
390		}
391		pmd = pmd_offset(pud, phys);
392		BUG_ON(!pmd_none(*pmd));
393		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
394	}
395}
396
397void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
398{
399	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB);
400}
401
402void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
403{
404	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC);
405}
406
407/*
408 * The head.S code sets up the kernel high mapping:
409 *
410 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
411 *
412 * phys_base holds the negative offset to the kernel, which is added
413 * to the compile time generated pmds. This results in invalid pmds up
414 * to the point where we hit the physaddr 0 mapping.
415 *
416 * We limit the mappings to the region from _text to _brk_end.  _brk_end
417 * is rounded up to the 2MB boundary. This catches the invalid pmds as
418 * well, as they are located before _text:
419 */
420void __init cleanup_highmap(void)
421{
422	unsigned long vaddr = __START_KERNEL_map;
423	unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
424	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
425	pmd_t *pmd = level2_kernel_pgt;
426
427	/*
428	 * Native path, max_pfn_mapped is not set yet.
429	 * Xen has valid max_pfn_mapped set in
430	 *	arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
431	 */
432	if (max_pfn_mapped)
433		vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
434
435	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
436		if (pmd_none(*pmd))
437			continue;
438		if (vaddr < (unsigned long) _text || vaddr > end)
439			set_pmd(pmd, __pmd(0));
440	}
441}
442
443/*
444 * Create PTE level page table mapping for physical addresses.
445 * It returns the last physical address mapped.
446 */
447static unsigned long __meminit
448phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
449	      pgprot_t prot, bool init)
450{
451	unsigned long pages = 0, paddr_next;
452	unsigned long paddr_last = paddr_end;
453	pte_t *pte;
454	int i;
455
456	pte = pte_page + pte_index(paddr);
457	i = pte_index(paddr);
458
459	for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) {
460		paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE;
461		if (paddr >= paddr_end) {
462			if (!after_bootmem &&
463			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
464					     E820_TYPE_RAM) &&
465			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
466					     E820_TYPE_RESERVED_KERN))
467				set_pte_init(pte, __pte(0), init);
468			continue;
469		}
470
471		/*
472		 * We will re-use the existing mapping.
473		 * Xen for example has some special requirements, like mapping
474		 * pagetable pages as RO. So assume that whoever set up
475		 * these mappings knew what they were doing.
476		 */
477		if (!pte_none(*pte)) {
478			if (!after_bootmem)
479				pages++;
480			continue;
481		}
482
483		if (0)
484			pr_info("   pte=%p addr=%lx pte=%016lx\n", pte, paddr,
485				pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
486		pages++;
487		set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init);
488		paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
489	}
490
491	update_page_count(PG_LEVEL_4K, pages);
492
493	return paddr_last;
494}
495
496/*
497 * Create PMD level page table mapping for physical addresses. The virtual
498 * and physical addresses have to be aligned at this level.
499 * It returns the last physical address mapped.
500 */
501static unsigned long __meminit
502phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
503	      unsigned long page_size_mask, pgprot_t prot, bool init)
504{
505	unsigned long pages = 0, paddr_next;
506	unsigned long paddr_last = paddr_end;
507
508	int i = pmd_index(paddr);
509
510	for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) {
511		pmd_t *pmd = pmd_page + pmd_index(paddr);
512		pte_t *pte;
513		pgprot_t new_prot = prot;
514
515		paddr_next = (paddr & PMD_MASK) + PMD_SIZE;
516		if (paddr >= paddr_end) {
517			if (!after_bootmem &&
518			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
519					     E820_TYPE_RAM) &&
520			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
521					     E820_TYPE_RESERVED_KERN))
522				set_pmd_init(pmd, __pmd(0), init);
523			continue;
524		}
525
526		if (!pmd_none(*pmd)) {
527			if (!pmd_large(*pmd)) {
528				spin_lock(&init_mm.page_table_lock);
529				pte = (pte_t *)pmd_page_vaddr(*pmd);
530				paddr_last = phys_pte_init(pte, paddr,
531							   paddr_end, prot,
532							   init);
533				spin_unlock(&init_mm.page_table_lock);
534				continue;
535			}
536			/*
537			 * If we are ok with PG_LEVEL_2M mapping, then we will
538			 * use the existing mapping.
539			 *
540			 * Otherwise, we will split the large page mapping but
541			 * keep the existing protection bits, except for the
542			 * large-page bit, so that we don't violate Intel's TLB
543			 * Application note (317080) which says, while changing
544			 * the page sizes, new and old translations should
545			 * not differ with respect to page frame and
546			 * attributes.
547			 */
548			if (page_size_mask & (1 << PG_LEVEL_2M)) {
549				if (!after_bootmem)
550					pages++;
551				paddr_last = paddr_next;
552				continue;
553			}
554			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
555		}
556
557		if (page_size_mask & (1<<PG_LEVEL_2M)) {
558			pages++;
559			spin_lock(&init_mm.page_table_lock);
560			set_pte_init((pte_t *)pmd,
561				     pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
562					     __pgprot(pgprot_val(prot) | _PAGE_PSE)),
563				     init);
564			spin_unlock(&init_mm.page_table_lock);
565			paddr_last = paddr_next;
566			continue;
567		}
568
569		pte = alloc_low_page();
570		paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot, init);
571
572		spin_lock(&init_mm.page_table_lock);
573		pmd_populate_kernel_init(&init_mm, pmd, pte, init);
574		spin_unlock(&init_mm.page_table_lock);
575	}
576	update_page_count(PG_LEVEL_2M, pages);
577	return paddr_last;
578}
579
580/*
581 * Create PUD level page table mapping for physical addresses. The virtual
582 * and physical addresses do not have to be aligned at this level. KASLR can
583 * randomize virtual addresses up to this level.
584 * It returns the last physical address mapped.
585 */
586static unsigned long __meminit
587phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
588	      unsigned long page_size_mask, pgprot_t _prot, bool init)
589{
590	unsigned long pages = 0, paddr_next;
591	unsigned long paddr_last = paddr_end;
592	unsigned long vaddr = (unsigned long)__va(paddr);
593	int i = pud_index(vaddr);
594
595	for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) {
596		pud_t *pud;
597		pmd_t *pmd;
598		pgprot_t prot = _prot;
599
600		vaddr = (unsigned long)__va(paddr);
601		pud = pud_page + pud_index(vaddr);
602		paddr_next = (paddr & PUD_MASK) + PUD_SIZE;
603
604		if (paddr >= paddr_end) {
605			if (!after_bootmem &&
606			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
607					     E820_TYPE_RAM) &&
608			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
609					     E820_TYPE_RESERVED_KERN))
610				set_pud_init(pud, __pud(0), init);
611			continue;
612		}
613
614		if (!pud_none(*pud)) {
615			if (!pud_large(*pud)) {
616				pmd = pmd_offset(pud, 0);
617				paddr_last = phys_pmd_init(pmd, paddr,
618							   paddr_end,
619							   page_size_mask,
620							   prot, init);
621				continue;
622			}
623			/*
624			 * If we are ok with PG_LEVEL_1G mapping, then we will
625			 * use the existing mapping.
626			 *
627			 * Otherwise, we will split the gbpage mapping but keep
628			 * the existing protection bits, except for the large-page
629			 * bit, so that we don't violate Intel's TLB
630			 * Application note (317080) which says, while changing
631			 * the page sizes, new and old translations should
632			 * not differ with respect to page frame and
633			 * attributes.
634			 */
635			if (page_size_mask & (1 << PG_LEVEL_1G)) {
636				if (!after_bootmem)
637					pages++;
638				paddr_last = paddr_next;
639				continue;
640			}
641			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
642		}
643
644		if (page_size_mask & (1<<PG_LEVEL_1G)) {
645			pages++;
646			spin_lock(&init_mm.page_table_lock);
647
648			prot = __pgprot(pgprot_val(prot) | _PAGE_PSE);
649
650			set_pte_init((pte_t *)pud,
651				     pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
652					     prot),
653				     init);
654			spin_unlock(&init_mm.page_table_lock);
655			paddr_last = paddr_next;
656			continue;
657		}
658
659		pmd = alloc_low_page();
660		paddr_last = phys_pmd_init(pmd, paddr, paddr_end,
661					   page_size_mask, prot, init);
662
663		spin_lock(&init_mm.page_table_lock);
664		pud_populate_init(&init_mm, pud, pmd, init);
665		spin_unlock(&init_mm.page_table_lock);
666	}
667
668	update_page_count(PG_LEVEL_1G, pages);
669
670	return paddr_last;
671}
672
673static unsigned long __meminit
674phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
675	      unsigned long page_size_mask, pgprot_t prot, bool init)
676{
677	unsigned long vaddr, vaddr_end, vaddr_next, paddr_next, paddr_last;
678
679	paddr_last = paddr_end;
680	vaddr = (unsigned long)__va(paddr);
681	vaddr_end = (unsigned long)__va(paddr_end);
682
683	if (!pgtable_l5_enabled())
684		return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end,
685				     page_size_mask, prot, init);
686
687	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
688		p4d_t *p4d = p4d_page + p4d_index(vaddr);
689		pud_t *pud;
690
691		vaddr_next = (vaddr & P4D_MASK) + P4D_SIZE;
692		paddr = __pa(vaddr);
693
694		if (paddr >= paddr_end) {
695			paddr_next = __pa(vaddr_next);
696			if (!after_bootmem &&
697			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
698					     E820_TYPE_RAM) &&
699			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
700					     E820_TYPE_RESERVED_KERN))
701				set_p4d_init(p4d, __p4d(0), init);
702			continue;
703		}
704
705		if (!p4d_none(*p4d)) {
706			pud = pud_offset(p4d, 0);
707			paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
708					page_size_mask, prot, init);
709			continue;
710		}
711
712		pud = alloc_low_page();
713		paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
714					   page_size_mask, prot, init);
715
716		spin_lock(&init_mm.page_table_lock);
717		p4d_populate_init(&init_mm, p4d, pud, init);
718		spin_unlock(&init_mm.page_table_lock);
719	}
720
721	return paddr_last;
722}
723
724static unsigned long __meminit
725__kernel_physical_mapping_init(unsigned long paddr_start,
726			       unsigned long paddr_end,
727			       unsigned long page_size_mask,
728			       pgprot_t prot, bool init)
729{
730	bool pgd_changed = false;
731	unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
732
733	paddr_last = paddr_end;
734	vaddr = (unsigned long)__va(paddr_start);
735	vaddr_end = (unsigned long)__va(paddr_end);
736	vaddr_start = vaddr;
737
738	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
739		pgd_t *pgd = pgd_offset_k(vaddr);
740		p4d_t *p4d;
741
742		vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
743
744		if (pgd_val(*pgd)) {
745			p4d = (p4d_t *)pgd_page_vaddr(*pgd);
746			paddr_last = phys_p4d_init(p4d, __pa(vaddr),
747						   __pa(vaddr_end),
748						   page_size_mask,
749						   prot, init);
750			continue;
751		}
752
753		p4d = alloc_low_page();
754		paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
755					   page_size_mask, prot, init);
756
757		spin_lock(&init_mm.page_table_lock);
758		if (pgtable_l5_enabled())
759			pgd_populate_init(&init_mm, pgd, p4d, init);
760		else
761			p4d_populate_init(&init_mm, p4d_offset(pgd, vaddr),
762					  (pud_t *) p4d, init);
763
764		spin_unlock(&init_mm.page_table_lock);
765		pgd_changed = true;
766	}
767
768	if (pgd_changed)
769		sync_global_pgds(vaddr_start, vaddr_end - 1);
770
771	return paddr_last;
772}
773
774
775/*
776 * Create page table mappings for the physical memory at specific physical
777 * addresses. Note that it can only be used to populate non-present entries.
778 * The virtual and physical addresses have to be aligned down to the
779 * PMD level. It returns the last physical address mapped.
780 */
781unsigned long __meminit
782kernel_physical_mapping_init(unsigned long paddr_start,
783			     unsigned long paddr_end,
784			     unsigned long page_size_mask, pgprot_t prot)
785{
786	return __kernel_physical_mapping_init(paddr_start, paddr_end,
787					      page_size_mask, prot, true);
788}
789
790/*
791 * This function is similar to kernel_physical_mapping_init() above with the
792 * exception that it uses set_{pud,pmd}() instead of set_{pud,pmd}_safe()
793 * when updating the mapping. The caller is responsible for flushing the TLBs after
794 * the function returns.
795 */
796unsigned long __meminit
797kernel_physical_mapping_change(unsigned long paddr_start,
798			       unsigned long paddr_end,
799			       unsigned long page_size_mask)
800{
801	return __kernel_physical_mapping_init(paddr_start, paddr_end,
802					      page_size_mask, PAGE_KERNEL,
803					      false);
804}
805
806#ifndef CONFIG_NUMA
807void __init initmem_init(void)
808{
809	memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
810}
811#endif
812
813void __init paging_init(void)
814{
815	sparse_init();
816
817	/*
818	 * Clear the default setting for node 0.
819	 * Note: don't use nodes_clear() here; that really clears the state
820	 *	 when NUMA support is not compiled in, and a later
821	 *	 node_set_state() will not set it back.
822	 */
823	node_clear_state(0, N_MEMORY);
824	node_clear_state(0, N_NORMAL_MEMORY);
825
826	zone_sizes_init();
827}
828
829/*
830 * Memory hotplug specific functions
831 */
832#ifdef CONFIG_MEMORY_HOTPLUG
833/*
834 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
835 * updating.
836 */
837static void update_end_of_memory_vars(u64 start, u64 size)
838{
839	unsigned long end_pfn = PFN_UP(start + size);
840
841	if (end_pfn > max_pfn) {
842		max_pfn = end_pfn;
843		max_low_pfn = end_pfn;
844		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
845	}
846}
847
848int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
849	      struct mhp_params *params)
850{
851	int ret;
852
853	ret = __add_pages(nid, start_pfn, nr_pages, params);
854	WARN_ON_ONCE(ret);
855
856	/* update max_pfn, max_low_pfn and high_memory */
857	update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
858				  nr_pages << PAGE_SHIFT);
859
860	return ret;
861}
862
863int arch_add_memory(int nid, u64 start, u64 size,
864		    struct mhp_params *params)
865{
866	unsigned long start_pfn = start >> PAGE_SHIFT;
867	unsigned long nr_pages = size >> PAGE_SHIFT;
868
869	init_memory_mapping(start, start + size, params->pgprot);
870
871	return add_pages(nid, start_pfn, nr_pages, params);
872}
873
874#define PAGE_INUSE 0xFD
875
876static void __meminit free_pagetable(struct page *page, int order)
877{
878	unsigned long magic;
879	unsigned int nr_pages = 1 << order;
880
881	/* bootmem page has reserved flag */
882	if (PageReserved(page)) {
883		__ClearPageReserved(page);
884
885		magic = (unsigned long)page->freelist;
886		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
887			while (nr_pages--)
888				put_page_bootmem(page++);
889		} else
890			while (nr_pages--)
891				free_reserved_page(page++);
892	} else
893		free_pages((unsigned long)page_address(page), order);
894}
895
896static void __meminit free_hugepage_table(struct page *page,
897		struct vmem_altmap *altmap)
898{
899	if (altmap)
900		vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE);
901	else
902		free_pagetable(page, get_order(PMD_SIZE));
903}
904
905static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
906{
907	pte_t *pte;
908	int i;
909
910	for (i = 0; i < PTRS_PER_PTE; i++) {
911		pte = pte_start + i;
912		if (!pte_none(*pte))
913			return;
914	}
915
916	/* free a pte table */
917	free_pagetable(pmd_page(*pmd), 0);
918	spin_lock(&init_mm.page_table_lock);
919	pmd_clear(pmd);
920	spin_unlock(&init_mm.page_table_lock);
921}
922
923static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
924{
925	pmd_t *pmd;
926	int i;
927
928	for (i = 0; i < PTRS_PER_PMD; i++) {
929		pmd = pmd_start + i;
930		if (!pmd_none(*pmd))
931			return;
932	}
933
934	/* free a pmd table */
935	free_pagetable(pud_page(*pud), 0);
936	spin_lock(&init_mm.page_table_lock);
937	pud_clear(pud);
938	spin_unlock(&init_mm.page_table_lock);
939}
940
941static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
942{
943	pud_t *pud;
944	int i;
945
946	for (i = 0; i < PTRS_PER_PUD; i++) {
947		pud = pud_start + i;
948		if (!pud_none(*pud))
949			return;
950	}
951
952	/* free a pud table */
953	free_pagetable(p4d_page(*p4d), 0);
954	spin_lock(&init_mm.page_table_lock);
955	p4d_clear(p4d);
956	spin_unlock(&init_mm.page_table_lock);
957}
958
959static void __meminit
960remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
961		 bool direct)
962{
963	unsigned long next, pages = 0;
964	pte_t *pte;
965	void *page_addr;
966	phys_addr_t phys_addr;
967
968	pte = pte_start + pte_index(addr);
969	for (; addr < end; addr = next, pte++) {
970		next = (addr + PAGE_SIZE) & PAGE_MASK;
971		if (next > end)
972			next = end;
973
974		if (!pte_present(*pte))
975			continue;
976
977		/*
978		 * We mapped [0,1G) memory as identity mapping when
979		 * initializing, in arch/x86/kernel/head_64.S. These
980		 * pagetables cannot be removed.
981		 */
982		phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
983		if (phys_addr < (phys_addr_t)0x40000000)
984			return;
985
986		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
987			/*
988			 * Do not free direct mapping pages since they were
989			 * freed when offlining, or simply not in use.
990			 */
991			if (!direct)
992				free_pagetable(pte_page(*pte), 0);
993
994			spin_lock(&init_mm.page_table_lock);
995			pte_clear(&init_mm, addr, pte);
996			spin_unlock(&init_mm.page_table_lock);
997
998			/* For non-direct mappings, the 'pages' count means nothing. */
999			pages++;
1000		} else {
1001			/*
1002			 * If we are here, we are freeing vmemmap pages since
1003			 * direct mapped memory ranges to be freed are aligned.
1004			 *
1005			 * If we are not removing the whole page, it means
1006			 * other page structs in this page are being used and
1007			 * we cannot remove them. So fill the unused page structs
1008			 * with 0xFD, and remove the page when it is wholly
1009			 * filled with 0xFD.
1010			 */
1011			memset((void *)addr, PAGE_INUSE, next - addr);
1012
1013			page_addr = page_address(pte_page(*pte));
1014			if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
1015				free_pagetable(pte_page(*pte), 0);
1016
1017				spin_lock(&init_mm.page_table_lock);
1018				pte_clear(&init_mm, addr, pte);
1019				spin_unlock(&init_mm.page_table_lock);
1020			}
1021		}
1022	}
1023
1024	/* Call free_pte_table() in remove_pmd_table(). */
1025	flush_tlb_all();
1026	if (direct)
1027		update_page_count(PG_LEVEL_4K, -pages);
1028}
1029
1030static void __meminit
1031remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
1032		 bool direct, struct vmem_altmap *altmap)
1033{
1034	unsigned long next, pages = 0;
1035	pte_t *pte_base;
1036	pmd_t *pmd;
1037	void *page_addr;
1038
1039	pmd = pmd_start + pmd_index(addr);
1040	for (; addr < end; addr = next, pmd++) {
1041		next = pmd_addr_end(addr, end);
1042
1043		if (!pmd_present(*pmd))
1044			continue;
1045
1046		if (pmd_large(*pmd)) {
1047			if (IS_ALIGNED(addr, PMD_SIZE) &&
1048			    IS_ALIGNED(next, PMD_SIZE)) {
1049				if (!direct)
1050					free_hugepage_table(pmd_page(*pmd),
1051							    altmap);
1052
1053				spin_lock(&init_mm.page_table_lock);
1054				pmd_clear(pmd);
1055				spin_unlock(&init_mm.page_table_lock);
1056				pages++;
1057			} else {
1058				/* If here, we are freeing vmemmap pages. */
1059				memset((void *)addr, PAGE_INUSE, next - addr);
1060
1061				page_addr = page_address(pmd_page(*pmd));
1062				if (!memchr_inv(page_addr, PAGE_INUSE,
1063						PMD_SIZE)) {
1064					free_hugepage_table(pmd_page(*pmd),
1065							    altmap);
1066
1067					spin_lock(&init_mm.page_table_lock);
1068					pmd_clear(pmd);
1069					spin_unlock(&init_mm.page_table_lock);
1070				}
1071			}
1072
1073			continue;
1074		}
1075
1076		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
1077		remove_pte_table(pte_base, addr, next, direct);
1078		free_pte_table(pte_base, pmd);
1079	}
1080
1081	/* Call free_pmd_table() in remove_pud_table(). */
1082	if (direct)
1083		update_page_count(PG_LEVEL_2M, -pages);
1084}
1085
1086static void __meminit
1087remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
1088		 struct vmem_altmap *altmap, bool direct)
1089{
1090	unsigned long next, pages = 0;
1091	pmd_t *pmd_base;
1092	pud_t *pud;
1093	void *page_addr;
1094
1095	pud = pud_start + pud_index(addr);
1096	for (; addr < end; addr = next, pud++) {
1097		next = pud_addr_end(addr, end);
1098
1099		if (!pud_present(*pud))
1100			continue;
1101
1102		if (pud_large(*pud)) {
1103			if (IS_ALIGNED(addr, PUD_SIZE) &&
1104			    IS_ALIGNED(next, PUD_SIZE)) {
1105				if (!direct)
1106					free_pagetable(pud_page(*pud),
1107						       get_order(PUD_SIZE));
1108
1109				spin_lock(&init_mm.page_table_lock);
1110				pud_clear(pud);
1111				spin_unlock(&init_mm.page_table_lock);
1112				pages++;
1113			} else {
1114				/* If here, we are freeing vmemmap pages. */
1115				memset((void *)addr, PAGE_INUSE, next - addr);
1116
1117				page_addr = page_address(pud_page(*pud));
1118				if (!memchr_inv(page_addr, PAGE_INUSE,
1119						PUD_SIZE)) {
1120					free_pagetable(pud_page(*pud),
1121						       get_order(PUD_SIZE));
1122
1123					spin_lock(&init_mm.page_table_lock);
1124					pud_clear(pud);
1125					spin_unlock(&init_mm.page_table_lock);
1126				}
1127			}
1128
1129			continue;
1130		}
1131
1132		pmd_base = pmd_offset(pud, 0);
1133		remove_pmd_table(pmd_base, addr, next, direct, altmap);
1134		free_pmd_table(pmd_base, pud);
1135	}
1136
1137	if (direct)
1138		update_page_count(PG_LEVEL_1G, -pages);
1139}
1140
1141static void __meminit
1142remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
1143		 struct vmem_altmap *altmap, bool direct)
1144{
1145	unsigned long next, pages = 0;
1146	pud_t *pud_base;
1147	p4d_t *p4d;
1148
1149	p4d = p4d_start + p4d_index(addr);
1150	for (; addr < end; addr = next, p4d++) {
1151		next = p4d_addr_end(addr, end);
1152
1153		if (!p4d_present(*p4d))
1154			continue;
1155
1156		BUILD_BUG_ON(p4d_large(*p4d));
1157
1158		pud_base = pud_offset(p4d, 0);
1159		remove_pud_table(pud_base, addr, next, altmap, direct);
1160		/*
1161		 * For 4-level page tables we do not want to free PUDs, but in the
1162		 * 5-level case we should free them. This code will have to change
1163		 * to adapt for boot-time switching between 4 and 5 level page tables.
1164		 */
1165		if (pgtable_l5_enabled())
1166			free_pud_table(pud_base, p4d);
1167	}
1168
1169	if (direct)
1170		update_page_count(PG_LEVEL_512G, -pages);
1171}
1172
1173/* start and end are both virtual addresses. */
1174static void __meminit
1175remove_pagetable(unsigned long start, unsigned long end, bool direct,
1176		struct vmem_altmap *altmap)
1177{
1178	unsigned long next;
1179	unsigned long addr;
1180	pgd_t *pgd;
1181	p4d_t *p4d;
1182
1183	for (addr = start; addr < end; addr = next) {
1184		next = pgd_addr_end(addr, end);
1185
1186		pgd = pgd_offset_k(addr);
1187		if (!pgd_present(*pgd))
1188			continue;
1189
1190		p4d = p4d_offset(pgd, 0);
1191		remove_p4d_table(p4d, addr, next, altmap, direct);
1192	}
1193
1194	flush_tlb_all();
1195}
1196
1197void __ref vmemmap_free(unsigned long start, unsigned long end,
1198		struct vmem_altmap *altmap)
1199{
1200	remove_pagetable(start, end, false, altmap);
1201}
1202
1203static void __meminit
1204kernel_physical_mapping_remove(unsigned long start, unsigned long end)
1205{
1206	start = (unsigned long)__va(start);
1207	end = (unsigned long)__va(end);
1208
1209	remove_pagetable(start, end, true, NULL);
1210}
1211
1212void __ref arch_remove_memory(int nid, u64 start, u64 size,
1213			      struct vmem_altmap *altmap)
1214{
1215	unsigned long start_pfn = start >> PAGE_SHIFT;
1216	unsigned long nr_pages = size >> PAGE_SHIFT;
1217
1218	__remove_pages(start_pfn, nr_pages, altmap);
1219	kernel_physical_mapping_remove(start, start + size);
1220}
1221#endif /* CONFIG_MEMORY_HOTPLUG */
1222
1223static struct kcore_list kcore_vsyscall;
1224
1225static void __init register_page_bootmem_info(void)
1226{
1227#ifdef CONFIG_NUMA
1228	int i;
1229
1230	for_each_online_node(i)
1231		register_page_bootmem_info_node(NODE_DATA(i));
1232#endif
1233}
1234
1235/*
1236 * Pre-allocates page-table pages for the vmalloc area in the kernel page-table.
1237 * Only the level which needs to be synchronized between all page-tables is
1238 * allocated because the synchronization can be expensive.
1239 */
1240static void __init preallocate_vmalloc_pages(void)
1241{
1242	unsigned long addr;
1243	const char *lvl;
1244
1245	for (addr = VMALLOC_START; addr <= VMALLOC_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
1246		pgd_t *pgd = pgd_offset_k(addr);
1247		p4d_t *p4d;
1248		pud_t *pud;
1249
1250		lvl = "p4d";
1251		p4d = p4d_alloc(&init_mm, pgd, addr);
1252		if (!p4d)
1253			goto failed;
1254
1255		if (pgtable_l5_enabled())
1256			continue;
1257
1258		/*
1259		 * The goal here is to allocate all possibly required
1260		 * hardware page tables pointed to by the top hardware
1261		 * level.
1262		 *
1263		 * On 4-level systems, the P4D layer is folded away and
1264		 * the above code does no preallocation.  Below, go down
1265		 * to the pud _software_ level to ensure the second
1266		 * hardware level is allocated on 4-level systems too.
1267		 */
1268		lvl = "pud";
1269		pud = pud_alloc(&init_mm, p4d, addr);
1270		if (!pud)
1271			goto failed;
1272	}
1273
1274	return;
1275
1276failed:
1277
1278	/*
1279	 * The pages have to be there now or they will be missing in
1280	 * process page-tables later.
1281	 */
1282	panic("Failed to pre-allocate %s pages for vmalloc area\n", lvl);
1283}
1284
1285void __init mem_init(void)
1286{
1287	pci_iommu_alloc();
1288
1289	/* clear_bss() already cleared the empty_zero_page */
1290
1291	/* this will put all memory onto the freelists */
1292	memblock_free_all();
1293	after_bootmem = 1;
1294	x86_init.hyper.init_after_bootmem();
1295
1296	/*
1297	 * Must be done after boot memory is put on freelist, because here we
1298	 * might set fields in deferred struct pages that have not yet been
1299	 * initialized, and memblock_free_all() initializes all the reserved
1300	 * deferred pages for us.
1301	 */
1302	register_page_bootmem_info();
1303
1304	/* Register memory areas for /proc/kcore */
1305	if (get_gate_vma(&init_mm))
1306		kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
1307
1308	preallocate_vmalloc_pages();
1309
1310	mem_init_print_info(NULL);
1311}
1312
1313#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1314int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
1315{
1316	/*
1317	 * More CPUs always led to greater speedups on tested systems, up to
1318	 * all the nodes' CPUs.  Use all since the system is otherwise idle
1319	 * now.
1320	 */
1321	return max_t(int, cpumask_weight(node_cpumask), 1);
1322}
1323#endif
1324
1325int kernel_set_to_readonly;
1326
1327void mark_rodata_ro(void)
1328{
1329	unsigned long start = PFN_ALIGN(_text);
1330	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
1331	unsigned long end = (unsigned long)__end_rodata_hpage_align;
1332	unsigned long text_end = PFN_ALIGN(_etext);
1333	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
1334	unsigned long all_end;
1335
1336	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
1337	       (end - start) >> 10);
1338	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
1339
1340	kernel_set_to_readonly = 1;
1341
1342	/*
1343	 * The rodata/data/bss/brk section (but not the kernel text!)
1344	 * should also be non-executable.
1345	 *
1346	 * We align all_end to PMD_SIZE because the existing mapping
1347	 * is a full PMD. If we aligned _brk_end to PAGE_SIZE, we would
1348	 * split the PMD and the remainder between _brk_end and the end
1349	 * of the PMD would remain mapped executable.
1350	 *
1351	 * Any PMD which was setup after the one which covers _brk_end
1352	 * has already been zapped via cleanup_highmap().
1353	 */
1354	all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
1355	set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
1356
1357	set_ftrace_ops_ro();
1358
1359#ifdef CONFIG_CPA_DEBUG
1360	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
1361	set_memory_rw(start, (end-start) >> PAGE_SHIFT);
1362
1363	printk(KERN_INFO "Testing CPA: again\n");
1364	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
1365#endif
1366
1367	free_kernel_image_pages("unused kernel image (text/rodata gap)",
1368				(void *)text_end, (void *)rodata_start);
1369	free_kernel_image_pages("unused kernel image (rodata/data gap)",
1370				(void *)rodata_end, (void *)_sdata);
1371
1372	debug_checkwx();
1373}
1374
1375int kern_addr_valid(unsigned long addr)
1376{
1377	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
1378	pgd_t *pgd;
1379	p4d_t *p4d;
1380	pud_t *pud;
1381	pmd_t *pmd;
1382	pte_t *pte;
1383
1384	if (above != 0 && above != -1UL)
1385		return 0;
1386
1387	pgd = pgd_offset_k(addr);
1388	if (pgd_none(*pgd))
1389		return 0;
1390
1391	p4d = p4d_offset(pgd, addr);
1392	if (!p4d_present(*p4d))
1393		return 0;
1394
1395	pud = pud_offset(p4d, addr);
1396	if (!pud_present(*pud))
1397		return 0;
1398
1399	if (pud_large(*pud))
1400		return pfn_valid(pud_pfn(*pud));
1401
1402	pmd = pmd_offset(pud, addr);
1403	if (!pmd_present(*pmd))
1404		return 0;
1405
1406	if (pmd_large(*pmd))
1407		return pfn_valid(pmd_pfn(*pmd));
1408
1409	pte = pte_offset_kernel(pmd, addr);
1410	if (pte_none(*pte))
1411		return 0;
1412
1413	return pfn_valid(pte_pfn(*pte));
1414}
1415
1416/*
1417 * Block size is the minimum amount of memory which can be hotplugged or
1418 * hot-removed. It must be a power of two and must be equal to or larger
1419 * than MIN_MEMORY_BLOCK_SIZE.
1420 */
1421#define MAX_BLOCK_SIZE (2UL << 30)
1422
1423/* Amount of RAM needed to start using large blocks */
1424#define MEM_SIZE_FOR_LARGE_BLOCK (64UL << 30)
1425
1426/* Adjustable memory block size */
1427static unsigned long set_memory_block_size;
1428int __init set_memory_block_size_order(unsigned int order)
1429{
1430	unsigned long size = 1UL << order;
1431
1432	if (size > MEM_SIZE_FOR_LARGE_BLOCK || size < MIN_MEMORY_BLOCK_SIZE)
1433		return -EINVAL;
1434
1435	set_memory_block_size = size;
1436	return 0;
1437}
1438
1439static unsigned long probe_memory_block_size(void)
1440{
1441	unsigned long boot_mem_end = max_pfn << PAGE_SHIFT;
1442	unsigned long bz;
1443
1444	/* If memory block size has been set, then use it */
1445	bz = set_memory_block_size;
1446	if (bz)
1447		goto done;
1448
1449	/* Use regular block if RAM is smaller than MEM_SIZE_FOR_LARGE_BLOCK */
1450	if (boot_mem_end < MEM_SIZE_FOR_LARGE_BLOCK) {
1451		bz = MIN_MEMORY_BLOCK_SIZE;
1452		goto done;
1453	}
1454
1455	/*
1456	 * Use max block size to minimize overhead on bare metal, where
1457	 * alignment for memory hotplug isn't a concern.
1458	 */
1459	if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
1460		bz = MAX_BLOCK_SIZE;
1461		goto done;
1462	}
1463
1464	/* Find the largest allowed block size that aligns to memory end */
1465	for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
1466		if (IS_ALIGNED(boot_mem_end, bz))
1467			break;
1468	}
1469done:
1470	pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20);
1471
1472	return bz;
1473}
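/*
 * Illustrative outcomes of the probe above: on bare metal with memory ending
 * at or above MEM_SIZE_FOR_LARGE_BLOCK (64 GiB), the block size is simply
 * MAX_BLOCK_SIZE (2 GiB). A guest whose memory ends at, say, 65 GiB is not
 * 2 GiB aligned, so the loop halves the candidate size and settles on 1 GiB
 * blocks. Anything below 64 GiB falls back to MIN_MEMORY_BLOCK_SIZE (the
 * sparsemem section size, 128 MiB on x86-64).
 */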
1474
1475static unsigned long memory_block_size_probed;
1476unsigned long memory_block_size_bytes(void)
1477{
1478	if (!memory_block_size_probed)
1479		memory_block_size_probed = probe_memory_block_size();
1480
1481	return memory_block_size_probed;
1482}
1483
1484#ifdef CONFIG_SPARSEMEM_VMEMMAP
1485/*
1486 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
1487 */
1488static long __meminitdata addr_start, addr_end;
1489static void __meminitdata *p_start, *p_end;
1490static int __meminitdata node_start;
1491
1492static int __meminit vmemmap_populate_hugepages(unsigned long start,
1493		unsigned long end, int node, struct vmem_altmap *altmap)
1494{
1495	unsigned long addr;
1496	unsigned long next;
1497	pgd_t *pgd;
1498	p4d_t *p4d;
1499	pud_t *pud;
1500	pmd_t *pmd;
1501
1502	for (addr = start; addr < end; addr = next) {
1503		next = pmd_addr_end(addr, end);
1504
1505		pgd = vmemmap_pgd_populate(addr, node);
1506		if (!pgd)
1507			return -ENOMEM;
1508
1509		p4d = vmemmap_p4d_populate(pgd, addr, node);
1510		if (!p4d)
1511			return -ENOMEM;
1512
1513		pud = vmemmap_pud_populate(p4d, addr, node);
1514		if (!pud)
1515			return -ENOMEM;
1516
1517		pmd = pmd_offset(pud, addr);
1518		if (pmd_none(*pmd)) {
1519			void *p;
1520
1521			p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
1522			if (p) {
1523				pte_t entry;
1524
1525				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
1526						PAGE_KERNEL_LARGE);
1527				set_pmd(pmd, __pmd(pte_val(entry)));
1528
1529				/* check to see if we have contiguous blocks */
1530				if (p_end != p || node_start != node) {
1531					if (p_start)
1532						pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1533						       addr_start, addr_end-1, p_start, p_end-1, node_start);
1534					addr_start = addr;
1535					node_start = node;
1536					p_start = p;
1537				}
1538
1539				addr_end = addr + PMD_SIZE;
1540				p_end = p + PMD_SIZE;
1541				continue;
1542			} else if (altmap)
1543				return -ENOMEM; /* no fallback */
1544		} else if (pmd_large(*pmd)) {
1545			vmemmap_verify((pte_t *)pmd, node, addr, next);
1546			continue;
1547		}
1548		if (vmemmap_populate_basepages(addr, next, node, NULL))
1549			return -ENOMEM;
1550	}
1551	return 0;
1552}
1553
1554int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
1555		struct vmem_altmap *altmap)
1556{
1557	int err;
1558
1559	if (end - start < PAGES_PER_SECTION * sizeof(struct page))
1560		err = vmemmap_populate_basepages(start, end, node, NULL);
1561	else if (boot_cpu_has(X86_FEATURE_PSE))
1562		err = vmemmap_populate_hugepages(start, end, node, altmap);
1563	else if (altmap) {
1564		pr_err_once("%s: no cpu support for altmap allocations\n",
1565				__func__);
1566		err = -ENOMEM;
1567	} else
1568		err = vmemmap_populate_basepages(start, end, node, NULL);
1569	if (!err)
1570		sync_global_pgds(start, end - 1);
1571	return err;
1572}
1573
1574#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
1575void register_page_bootmem_memmap(unsigned long section_nr,
1576				  struct page *start_page, unsigned long nr_pages)
1577{
1578	unsigned long addr = (unsigned long)start_page;
1579	unsigned long end = (unsigned long)(start_page + nr_pages);
1580	unsigned long next;
1581	pgd_t *pgd;
1582	p4d_t *p4d;
1583	pud_t *pud;
1584	pmd_t *pmd;
1585	unsigned int nr_pmd_pages;
1586	struct page *page;
1587
1588	for (; addr < end; addr = next) {
1589		pte_t *pte = NULL;
1590
1591		pgd = pgd_offset_k(addr);
1592		if (pgd_none(*pgd)) {
1593			next = (addr + PAGE_SIZE) & PAGE_MASK;
1594			continue;
1595		}
1596		get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
1597
1598		p4d = p4d_offset(pgd, addr);
1599		if (p4d_none(*p4d)) {
1600			next = (addr + PAGE_SIZE) & PAGE_MASK;
1601			continue;
1602		}
1603		get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO);
1604
1605		pud = pud_offset(p4d, addr);
1606		if (pud_none(*pud)) {
1607			next = (addr + PAGE_SIZE) & PAGE_MASK;
1608			continue;
1609		}
1610		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
1611
1612		if (!boot_cpu_has(X86_FEATURE_PSE)) {
1613			next = (addr + PAGE_SIZE) & PAGE_MASK;
1614			pmd = pmd_offset(pud, addr);
1615			if (pmd_none(*pmd))
1616				continue;
1617			get_page_bootmem(section_nr, pmd_page(*pmd),
1618					 MIX_SECTION_INFO);
1619
1620			pte = pte_offset_kernel(pmd, addr);
1621			if (pte_none(*pte))
1622				continue;
1623			get_page_bootmem(section_nr, pte_page(*pte),
1624					 SECTION_INFO);
1625		} else {
1626			next = pmd_addr_end(addr, end);
1627
1628			pmd = pmd_offset(pud, addr);
1629			if (pmd_none(*pmd))
1630				continue;
1631
1632			nr_pmd_pages = 1 << get_order(PMD_SIZE);
1633			page = pmd_page(*pmd);
1634			while (nr_pmd_pages--)
1635				get_page_bootmem(section_nr, page++,
1636						 SECTION_INFO);
1637		}
1638	}
1639}
1640#endif
1641
1642void __meminit vmemmap_populate_print_last(void)
1643{
1644	if (p_start) {
1645		pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1646			addr_start, addr_end-1, p_start, p_end-1, node_start);
1647		p_start = NULL;
1648		p_end = NULL;
1649		node_start = 0;
1650	}
1651}
1652#endif
1653