xref: /kernel/linux/linux-6.6/mm/page_ext.c (revision 62306a36)
162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci#include <linux/mm.h>
362306a36Sopenharmony_ci#include <linux/mmzone.h>
462306a36Sopenharmony_ci#include <linux/memblock.h>
562306a36Sopenharmony_ci#include <linux/page_ext.h>
662306a36Sopenharmony_ci#include <linux/memory.h>
762306a36Sopenharmony_ci#include <linux/vmalloc.h>
862306a36Sopenharmony_ci#include <linux/kmemleak.h>
962306a36Sopenharmony_ci#include <linux/page_owner.h>
1062306a36Sopenharmony_ci#include <linux/page_idle.h>
1162306a36Sopenharmony_ci#include <linux/page_table_check.h>
1262306a36Sopenharmony_ci#include <linux/rcupdate.h>
1362306a36Sopenharmony_ci
1462306a36Sopenharmony_ci/*
1562306a36Sopenharmony_ci * struct page extension
1662306a36Sopenharmony_ci *
1762306a36Sopenharmony_ci * This is the feature to manage memory for extended data per page.
1862306a36Sopenharmony_ci *
 * Until now, we must modify struct page itself to store extra data per page.
 * This requires rebuilding the kernel, which is really time consuming.
 * And, sometimes, a rebuild is impossible due to third party module dependency.
 * Lastly, enlarging struct page could cause unwanted system behaviour change.
2362306a36Sopenharmony_ci *
2462306a36Sopenharmony_ci * This feature is intended to overcome above mentioned problems. This feature
2562306a36Sopenharmony_ci * allocates memory for extended data per page in certain place rather than
2662306a36Sopenharmony_ci * the struct page itself. This memory can be accessed by the accessor
2762306a36Sopenharmony_ci * functions provided by this code. During the boot process, it checks whether
2862306a36Sopenharmony_ci * allocation of huge chunk of memory is needed or not. If not, it avoids
2962306a36Sopenharmony_ci * allocating memory at all. With this advantage, we can include this feature
3062306a36Sopenharmony_ci * into the kernel in default and can avoid rebuild and solve related problems.
3162306a36Sopenharmony_ci *
3262306a36Sopenharmony_ci * To help these things to work well, there are two callbacks for clients. One
3362306a36Sopenharmony_ci * is the need callback which is mandatory if user wants to avoid useless
3462306a36Sopenharmony_ci * memory allocation at boot-time. The other is optional, init callback, which
3562306a36Sopenharmony_ci * is used to do proper initialization after memory is allocated.
3662306a36Sopenharmony_ci *
3762306a36Sopenharmony_ci * The need callback is used to decide whether extended memory allocation is
3862306a36Sopenharmony_ci * needed or not. Sometimes users want to deactivate some features in this
3962306a36Sopenharmony_ci * boot and extra memory would be unnecessary. In this case, to avoid
 * allocating a huge chunk of memory, each client represents its need of
 * extra memory through the need callback. If one of the need callbacks
 * returns true, it means that someone needs extra memory so that
 * page extension core should allocate memory for page extension. If
 * none of the need callbacks return true, memory isn't needed at all in this
 * boot and page extension core can skip allocating memory. As a result,
 * no memory is wasted.
4762306a36Sopenharmony_ci *
4862306a36Sopenharmony_ci * When need callback returns true, page_ext checks if there is a request for
4962306a36Sopenharmony_ci * extra memory through size in struct page_ext_operations. If it is non-zero,
5062306a36Sopenharmony_ci * extra space is allocated for each page_ext entry and offset is returned to
5162306a36Sopenharmony_ci * user through offset in struct page_ext_operations.
5262306a36Sopenharmony_ci *
5362306a36Sopenharmony_ci * The init callback is used to do proper initialization after page extension
 * is completely initialized. In a sparse memory system, extra memory is
 * allocated some time later than the memmap is allocated. In other words, the
 * lifetime of memory for page extension isn't the same as that of the memmap
 * for struct page. Therefore, clients can't store extra data until page
 * extension is initialized, even if pages are allocated and used freely. This
 * could leave the per-page extra data in an inadequate state, so, to prevent
 * it, clients can utilize this callback to initialize that state correctly.
6162306a36Sopenharmony_ci */
6262306a36Sopenharmony_ci
#if defined(CONFIG_SPARSEMEM)
/*
 * Value added to a section's stored page_ext pointer to mark it invalid;
 * page_ext_invalid() tests for it and __invalidate_page_ext() applies it.
 */
#define PAGE_EXT_INVALID	(0x1)
#endif
6662306a36Sopenharmony_ci
6762306a36Sopenharmony_ci#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
/* Need callback of the page idle client: page_ext is always requested. */
static bool need_page_idle(void)
{
	return true;
}
7262306a36Sopenharmony_cistatic struct page_ext_operations page_idle_ops __initdata = {
7362306a36Sopenharmony_ci	.need = need_page_idle,
7462306a36Sopenharmony_ci	.need_shared_flags = true,
7562306a36Sopenharmony_ci};
7662306a36Sopenharmony_ci#endif
7762306a36Sopenharmony_ci
7862306a36Sopenharmony_cistatic struct page_ext_operations *page_ext_ops[] __initdata = {
7962306a36Sopenharmony_ci#ifdef CONFIG_PAGE_OWNER
8062306a36Sopenharmony_ci	&page_owner_ops,
8162306a36Sopenharmony_ci#endif
8262306a36Sopenharmony_ci#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
8362306a36Sopenharmony_ci	&page_idle_ops,
8462306a36Sopenharmony_ci#endif
8562306a36Sopenharmony_ci#ifdef CONFIG_PAGE_TABLE_CHECK
8662306a36Sopenharmony_ci	&page_table_check_ops,
8762306a36Sopenharmony_ci#endif
8862306a36Sopenharmony_ci};
8962306a36Sopenharmony_ci
/* Size in bytes of one page_ext entry; computed by invoke_need_callbacks(). */
unsigned long page_ext_size;

/* Running total, in bytes, of the page_ext tables allocated so far. */
static unsigned long total_usage;
9362306a36Sopenharmony_ci
9462306a36Sopenharmony_cibool early_page_ext __meminitdata;
9562306a36Sopenharmony_cistatic int __init setup_early_page_ext(char *str)
9662306a36Sopenharmony_ci{
9762306a36Sopenharmony_ci	early_page_ext = true;
9862306a36Sopenharmony_ci	return 0;
9962306a36Sopenharmony_ci}
10062306a36Sopenharmony_ciearly_param("early_page_ext", setup_early_page_ext);
10162306a36Sopenharmony_ci
10262306a36Sopenharmony_cistatic bool __init invoke_need_callbacks(void)
10362306a36Sopenharmony_ci{
10462306a36Sopenharmony_ci	int i;
10562306a36Sopenharmony_ci	int entries = ARRAY_SIZE(page_ext_ops);
10662306a36Sopenharmony_ci	bool need = false;
10762306a36Sopenharmony_ci
10862306a36Sopenharmony_ci	for (i = 0; i < entries; i++) {
10962306a36Sopenharmony_ci		if (page_ext_ops[i]->need()) {
11062306a36Sopenharmony_ci			if (page_ext_ops[i]->need_shared_flags) {
11162306a36Sopenharmony_ci				page_ext_size = sizeof(struct page_ext);
11262306a36Sopenharmony_ci				break;
11362306a36Sopenharmony_ci			}
11462306a36Sopenharmony_ci		}
11562306a36Sopenharmony_ci	}
11662306a36Sopenharmony_ci
11762306a36Sopenharmony_ci	for (i = 0; i < entries; i++) {
11862306a36Sopenharmony_ci		if (page_ext_ops[i]->need()) {
11962306a36Sopenharmony_ci			page_ext_ops[i]->offset = page_ext_size;
12062306a36Sopenharmony_ci			page_ext_size += page_ext_ops[i]->size;
12162306a36Sopenharmony_ci			need = true;
12262306a36Sopenharmony_ci		}
12362306a36Sopenharmony_ci	}
12462306a36Sopenharmony_ci
12562306a36Sopenharmony_ci	return need;
12662306a36Sopenharmony_ci}
12762306a36Sopenharmony_ci
12862306a36Sopenharmony_cistatic void __init invoke_init_callbacks(void)
12962306a36Sopenharmony_ci{
13062306a36Sopenharmony_ci	int i;
13162306a36Sopenharmony_ci	int entries = ARRAY_SIZE(page_ext_ops);
13262306a36Sopenharmony_ci
13362306a36Sopenharmony_ci	for (i = 0; i < entries; i++) {
13462306a36Sopenharmony_ci		if (page_ext_ops[i]->init)
13562306a36Sopenharmony_ci			page_ext_ops[i]->init();
13662306a36Sopenharmony_ci	}
13762306a36Sopenharmony_ci}
13862306a36Sopenharmony_ci
13962306a36Sopenharmony_cistatic inline struct page_ext *get_entry(void *base, unsigned long index)
14062306a36Sopenharmony_ci{
14162306a36Sopenharmony_ci	return base + page_ext_size * index;
14262306a36Sopenharmony_ci}
14362306a36Sopenharmony_ci
14462306a36Sopenharmony_ci#ifndef CONFIG_SPARSEMEM
14562306a36Sopenharmony_civoid __init page_ext_init_flatmem_late(void)
14662306a36Sopenharmony_ci{
14762306a36Sopenharmony_ci	invoke_init_callbacks();
14862306a36Sopenharmony_ci}
14962306a36Sopenharmony_ci
15062306a36Sopenharmony_civoid __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
15162306a36Sopenharmony_ci{
15262306a36Sopenharmony_ci	pgdat->node_page_ext = NULL;
15362306a36Sopenharmony_ci}
15462306a36Sopenharmony_ci
15562306a36Sopenharmony_cistatic struct page_ext *lookup_page_ext(const struct page *page)
15662306a36Sopenharmony_ci{
15762306a36Sopenharmony_ci	unsigned long pfn = page_to_pfn(page);
15862306a36Sopenharmony_ci	unsigned long index;
15962306a36Sopenharmony_ci	struct page_ext *base;
16062306a36Sopenharmony_ci
16162306a36Sopenharmony_ci	WARN_ON_ONCE(!rcu_read_lock_held());
16262306a36Sopenharmony_ci	base = NODE_DATA(page_to_nid(page))->node_page_ext;
16362306a36Sopenharmony_ci	/*
16462306a36Sopenharmony_ci	 * The sanity checks the page allocator does upon freeing a
16562306a36Sopenharmony_ci	 * page can reach here before the page_ext arrays are
16662306a36Sopenharmony_ci	 * allocated when feeding a range of pages to the allocator
16762306a36Sopenharmony_ci	 * for the first time during bootup or memory hotplug.
16862306a36Sopenharmony_ci	 */
16962306a36Sopenharmony_ci	if (unlikely(!base))
17062306a36Sopenharmony_ci		return NULL;
17162306a36Sopenharmony_ci	index = pfn - round_down(node_start_pfn(page_to_nid(page)),
17262306a36Sopenharmony_ci					MAX_ORDER_NR_PAGES);
17362306a36Sopenharmony_ci	return get_entry(base, index);
17462306a36Sopenharmony_ci}
17562306a36Sopenharmony_ci
17662306a36Sopenharmony_cistatic int __init alloc_node_page_ext(int nid)
17762306a36Sopenharmony_ci{
17862306a36Sopenharmony_ci	struct page_ext *base;
17962306a36Sopenharmony_ci	unsigned long table_size;
18062306a36Sopenharmony_ci	unsigned long nr_pages;
18162306a36Sopenharmony_ci
18262306a36Sopenharmony_ci	nr_pages = NODE_DATA(nid)->node_spanned_pages;
18362306a36Sopenharmony_ci	if (!nr_pages)
18462306a36Sopenharmony_ci		return 0;
18562306a36Sopenharmony_ci
18662306a36Sopenharmony_ci	/*
18762306a36Sopenharmony_ci	 * Need extra space if node range is not aligned with
18862306a36Sopenharmony_ci	 * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm
18962306a36Sopenharmony_ci	 * checks buddy's status, range could be out of exact node range.
19062306a36Sopenharmony_ci	 */
19162306a36Sopenharmony_ci	if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
19262306a36Sopenharmony_ci		!IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
19362306a36Sopenharmony_ci		nr_pages += MAX_ORDER_NR_PAGES;
19462306a36Sopenharmony_ci
19562306a36Sopenharmony_ci	table_size = page_ext_size * nr_pages;
19662306a36Sopenharmony_ci
19762306a36Sopenharmony_ci	base = memblock_alloc_try_nid(
19862306a36Sopenharmony_ci			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
19962306a36Sopenharmony_ci			MEMBLOCK_ALLOC_ACCESSIBLE, nid);
20062306a36Sopenharmony_ci	if (!base)
20162306a36Sopenharmony_ci		return -ENOMEM;
20262306a36Sopenharmony_ci	NODE_DATA(nid)->node_page_ext = base;
20362306a36Sopenharmony_ci	total_usage += table_size;
20462306a36Sopenharmony_ci	return 0;
20562306a36Sopenharmony_ci}
20662306a36Sopenharmony_ci
20762306a36Sopenharmony_civoid __init page_ext_init_flatmem(void)
20862306a36Sopenharmony_ci{
20962306a36Sopenharmony_ci
21062306a36Sopenharmony_ci	int nid, fail;
21162306a36Sopenharmony_ci
21262306a36Sopenharmony_ci	if (!invoke_need_callbacks())
21362306a36Sopenharmony_ci		return;
21462306a36Sopenharmony_ci
21562306a36Sopenharmony_ci	for_each_online_node(nid)  {
21662306a36Sopenharmony_ci		fail = alloc_node_page_ext(nid);
21762306a36Sopenharmony_ci		if (fail)
21862306a36Sopenharmony_ci			goto fail;
21962306a36Sopenharmony_ci	}
22062306a36Sopenharmony_ci	pr_info("allocated %ld bytes of page_ext\n", total_usage);
22162306a36Sopenharmony_ci	return;
22262306a36Sopenharmony_ci
22362306a36Sopenharmony_cifail:
22462306a36Sopenharmony_ci	pr_crit("allocation of page_ext failed.\n");
22562306a36Sopenharmony_ci	panic("Out of memory");
22662306a36Sopenharmony_ci}
22762306a36Sopenharmony_ci
22862306a36Sopenharmony_ci#else /* CONFIG_SPARSEMEM */
22962306a36Sopenharmony_cistatic bool page_ext_invalid(struct page_ext *page_ext)
23062306a36Sopenharmony_ci{
23162306a36Sopenharmony_ci	return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
23262306a36Sopenharmony_ci}
23362306a36Sopenharmony_ci
23462306a36Sopenharmony_cistatic struct page_ext *lookup_page_ext(const struct page *page)
23562306a36Sopenharmony_ci{
23662306a36Sopenharmony_ci	unsigned long pfn = page_to_pfn(page);
23762306a36Sopenharmony_ci	struct mem_section *section = __pfn_to_section(pfn);
23862306a36Sopenharmony_ci	struct page_ext *page_ext = READ_ONCE(section->page_ext);
23962306a36Sopenharmony_ci
24062306a36Sopenharmony_ci	WARN_ON_ONCE(!rcu_read_lock_held());
24162306a36Sopenharmony_ci	/*
24262306a36Sopenharmony_ci	 * The sanity checks the page allocator does upon freeing a
24362306a36Sopenharmony_ci	 * page can reach here before the page_ext arrays are
24462306a36Sopenharmony_ci	 * allocated when feeding a range of pages to the allocator
24562306a36Sopenharmony_ci	 * for the first time during bootup or memory hotplug.
24662306a36Sopenharmony_ci	 */
24762306a36Sopenharmony_ci	if (page_ext_invalid(page_ext))
24862306a36Sopenharmony_ci		return NULL;
24962306a36Sopenharmony_ci	return get_entry(page_ext, pfn);
25062306a36Sopenharmony_ci}
25162306a36Sopenharmony_ci
25262306a36Sopenharmony_cistatic void *__meminit alloc_page_ext(size_t size, int nid)
25362306a36Sopenharmony_ci{
25462306a36Sopenharmony_ci	gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
25562306a36Sopenharmony_ci	void *addr = NULL;
25662306a36Sopenharmony_ci
25762306a36Sopenharmony_ci	addr = alloc_pages_exact_nid(nid, size, flags);
25862306a36Sopenharmony_ci	if (addr) {
25962306a36Sopenharmony_ci		kmemleak_alloc(addr, size, 1, flags);
26062306a36Sopenharmony_ci		return addr;
26162306a36Sopenharmony_ci	}
26262306a36Sopenharmony_ci
26362306a36Sopenharmony_ci	addr = vzalloc_node(size, nid);
26462306a36Sopenharmony_ci
26562306a36Sopenharmony_ci	return addr;
26662306a36Sopenharmony_ci}
26762306a36Sopenharmony_ci
26862306a36Sopenharmony_cistatic int __meminit init_section_page_ext(unsigned long pfn, int nid)
26962306a36Sopenharmony_ci{
27062306a36Sopenharmony_ci	struct mem_section *section;
27162306a36Sopenharmony_ci	struct page_ext *base;
27262306a36Sopenharmony_ci	unsigned long table_size;
27362306a36Sopenharmony_ci
27462306a36Sopenharmony_ci	section = __pfn_to_section(pfn);
27562306a36Sopenharmony_ci
27662306a36Sopenharmony_ci	if (section->page_ext)
27762306a36Sopenharmony_ci		return 0;
27862306a36Sopenharmony_ci
27962306a36Sopenharmony_ci	table_size = page_ext_size * PAGES_PER_SECTION;
28062306a36Sopenharmony_ci	base = alloc_page_ext(table_size, nid);
28162306a36Sopenharmony_ci
28262306a36Sopenharmony_ci	/*
28362306a36Sopenharmony_ci	 * The value stored in section->page_ext is (base - pfn)
28462306a36Sopenharmony_ci	 * and it does not point to the memory block allocated above,
28562306a36Sopenharmony_ci	 * causing kmemleak false positives.
28662306a36Sopenharmony_ci	 */
28762306a36Sopenharmony_ci	kmemleak_not_leak(base);
28862306a36Sopenharmony_ci
28962306a36Sopenharmony_ci	if (!base) {
29062306a36Sopenharmony_ci		pr_err("page ext allocation failure\n");
29162306a36Sopenharmony_ci		return -ENOMEM;
29262306a36Sopenharmony_ci	}
29362306a36Sopenharmony_ci
29462306a36Sopenharmony_ci	/*
29562306a36Sopenharmony_ci	 * The passed "pfn" may not be aligned to SECTION.  For the calculation
29662306a36Sopenharmony_ci	 * we need to apply a mask.
29762306a36Sopenharmony_ci	 */
29862306a36Sopenharmony_ci	pfn &= PAGE_SECTION_MASK;
29962306a36Sopenharmony_ci	section->page_ext = (void *)base - page_ext_size * pfn;
30062306a36Sopenharmony_ci	total_usage += table_size;
30162306a36Sopenharmony_ci	return 0;
30262306a36Sopenharmony_ci}
30362306a36Sopenharmony_ci
30462306a36Sopenharmony_cistatic void free_page_ext(void *addr)
30562306a36Sopenharmony_ci{
30662306a36Sopenharmony_ci	if (is_vmalloc_addr(addr)) {
30762306a36Sopenharmony_ci		vfree(addr);
30862306a36Sopenharmony_ci	} else {
30962306a36Sopenharmony_ci		struct page *page = virt_to_page(addr);
31062306a36Sopenharmony_ci		size_t table_size;
31162306a36Sopenharmony_ci
31262306a36Sopenharmony_ci		table_size = page_ext_size * PAGES_PER_SECTION;
31362306a36Sopenharmony_ci
31462306a36Sopenharmony_ci		BUG_ON(PageReserved(page));
31562306a36Sopenharmony_ci		kmemleak_free(addr);
31662306a36Sopenharmony_ci		free_pages_exact(addr, table_size);
31762306a36Sopenharmony_ci	}
31862306a36Sopenharmony_ci}
31962306a36Sopenharmony_ci
32062306a36Sopenharmony_cistatic void __free_page_ext(unsigned long pfn)
32162306a36Sopenharmony_ci{
32262306a36Sopenharmony_ci	struct mem_section *ms;
32362306a36Sopenharmony_ci	struct page_ext *base;
32462306a36Sopenharmony_ci
32562306a36Sopenharmony_ci	ms = __pfn_to_section(pfn);
32662306a36Sopenharmony_ci	if (!ms || !ms->page_ext)
32762306a36Sopenharmony_ci		return;
32862306a36Sopenharmony_ci
32962306a36Sopenharmony_ci	base = READ_ONCE(ms->page_ext);
33062306a36Sopenharmony_ci	/*
33162306a36Sopenharmony_ci	 * page_ext here can be valid while doing the roll back
33262306a36Sopenharmony_ci	 * operation in online_page_ext().
33362306a36Sopenharmony_ci	 */
33462306a36Sopenharmony_ci	if (page_ext_invalid(base))
33562306a36Sopenharmony_ci		base = (void *)base - PAGE_EXT_INVALID;
33662306a36Sopenharmony_ci	WRITE_ONCE(ms->page_ext, NULL);
33762306a36Sopenharmony_ci
33862306a36Sopenharmony_ci	base = get_entry(base, pfn);
33962306a36Sopenharmony_ci	free_page_ext(base);
34062306a36Sopenharmony_ci}
34162306a36Sopenharmony_ci
34262306a36Sopenharmony_cistatic void __invalidate_page_ext(unsigned long pfn)
34362306a36Sopenharmony_ci{
34462306a36Sopenharmony_ci	struct mem_section *ms;
34562306a36Sopenharmony_ci	void *val;
34662306a36Sopenharmony_ci
34762306a36Sopenharmony_ci	ms = __pfn_to_section(pfn);
34862306a36Sopenharmony_ci	if (!ms || !ms->page_ext)
34962306a36Sopenharmony_ci		return;
35062306a36Sopenharmony_ci	val = (void *)ms->page_ext + PAGE_EXT_INVALID;
35162306a36Sopenharmony_ci	WRITE_ONCE(ms->page_ext, val);
35262306a36Sopenharmony_ci}
35362306a36Sopenharmony_ci
35462306a36Sopenharmony_cistatic int __meminit online_page_ext(unsigned long start_pfn,
35562306a36Sopenharmony_ci				unsigned long nr_pages,
35662306a36Sopenharmony_ci				int nid)
35762306a36Sopenharmony_ci{
35862306a36Sopenharmony_ci	unsigned long start, end, pfn;
35962306a36Sopenharmony_ci	int fail = 0;
36062306a36Sopenharmony_ci
36162306a36Sopenharmony_ci	start = SECTION_ALIGN_DOWN(start_pfn);
36262306a36Sopenharmony_ci	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
36362306a36Sopenharmony_ci
36462306a36Sopenharmony_ci	if (nid == NUMA_NO_NODE) {
36562306a36Sopenharmony_ci		/*
36662306a36Sopenharmony_ci		 * In this case, "nid" already exists and contains valid memory.
36762306a36Sopenharmony_ci		 * "start_pfn" passed to us is a pfn which is an arg for
36862306a36Sopenharmony_ci		 * online__pages(), and start_pfn should exist.
36962306a36Sopenharmony_ci		 */
37062306a36Sopenharmony_ci		nid = pfn_to_nid(start_pfn);
37162306a36Sopenharmony_ci		VM_BUG_ON(!node_online(nid));
37262306a36Sopenharmony_ci	}
37362306a36Sopenharmony_ci
37462306a36Sopenharmony_ci	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
37562306a36Sopenharmony_ci		fail = init_section_page_ext(pfn, nid);
37662306a36Sopenharmony_ci	if (!fail)
37762306a36Sopenharmony_ci		return 0;
37862306a36Sopenharmony_ci
37962306a36Sopenharmony_ci	/* rollback */
38062306a36Sopenharmony_ci	end = pfn - PAGES_PER_SECTION;
38162306a36Sopenharmony_ci	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
38262306a36Sopenharmony_ci		__free_page_ext(pfn);
38362306a36Sopenharmony_ci
38462306a36Sopenharmony_ci	return -ENOMEM;
38562306a36Sopenharmony_ci}
38662306a36Sopenharmony_ci
38762306a36Sopenharmony_cistatic void __meminit offline_page_ext(unsigned long start_pfn,
38862306a36Sopenharmony_ci				unsigned long nr_pages)
38962306a36Sopenharmony_ci{
39062306a36Sopenharmony_ci	unsigned long start, end, pfn;
39162306a36Sopenharmony_ci
39262306a36Sopenharmony_ci	start = SECTION_ALIGN_DOWN(start_pfn);
39362306a36Sopenharmony_ci	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
39462306a36Sopenharmony_ci
39562306a36Sopenharmony_ci	/*
39662306a36Sopenharmony_ci	 * Freeing of page_ext is done in 3 steps to avoid
39762306a36Sopenharmony_ci	 * use-after-free of it:
39862306a36Sopenharmony_ci	 * 1) Traverse all the sections and mark their page_ext
39962306a36Sopenharmony_ci	 *    as invalid.
40062306a36Sopenharmony_ci	 * 2) Wait for all the existing users of page_ext who
40162306a36Sopenharmony_ci	 *    started before invalidation to finish.
40262306a36Sopenharmony_ci	 * 3) Free the page_ext.
40362306a36Sopenharmony_ci	 */
40462306a36Sopenharmony_ci	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
40562306a36Sopenharmony_ci		__invalidate_page_ext(pfn);
40662306a36Sopenharmony_ci
40762306a36Sopenharmony_ci	synchronize_rcu();
40862306a36Sopenharmony_ci
40962306a36Sopenharmony_ci	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
41062306a36Sopenharmony_ci		__free_page_ext(pfn);
41162306a36Sopenharmony_ci}
41262306a36Sopenharmony_ci
41362306a36Sopenharmony_cistatic int __meminit page_ext_callback(struct notifier_block *self,
41462306a36Sopenharmony_ci			       unsigned long action, void *arg)
41562306a36Sopenharmony_ci{
41662306a36Sopenharmony_ci	struct memory_notify *mn = arg;
41762306a36Sopenharmony_ci	int ret = 0;
41862306a36Sopenharmony_ci
41962306a36Sopenharmony_ci	switch (action) {
42062306a36Sopenharmony_ci	case MEM_GOING_ONLINE:
42162306a36Sopenharmony_ci		ret = online_page_ext(mn->start_pfn,
42262306a36Sopenharmony_ci				   mn->nr_pages, mn->status_change_nid);
42362306a36Sopenharmony_ci		break;
42462306a36Sopenharmony_ci	case MEM_OFFLINE:
42562306a36Sopenharmony_ci		offline_page_ext(mn->start_pfn,
42662306a36Sopenharmony_ci				mn->nr_pages);
42762306a36Sopenharmony_ci		break;
42862306a36Sopenharmony_ci	case MEM_CANCEL_ONLINE:
42962306a36Sopenharmony_ci		offline_page_ext(mn->start_pfn,
43062306a36Sopenharmony_ci				mn->nr_pages);
43162306a36Sopenharmony_ci		break;
43262306a36Sopenharmony_ci	case MEM_GOING_OFFLINE:
43362306a36Sopenharmony_ci		break;
43462306a36Sopenharmony_ci	case MEM_ONLINE:
43562306a36Sopenharmony_ci	case MEM_CANCEL_OFFLINE:
43662306a36Sopenharmony_ci		break;
43762306a36Sopenharmony_ci	}
43862306a36Sopenharmony_ci
43962306a36Sopenharmony_ci	return notifier_from_errno(ret);
44062306a36Sopenharmony_ci}
44162306a36Sopenharmony_ci
44262306a36Sopenharmony_civoid __init page_ext_init(void)
44362306a36Sopenharmony_ci{
44462306a36Sopenharmony_ci	unsigned long pfn;
44562306a36Sopenharmony_ci	int nid;
44662306a36Sopenharmony_ci
44762306a36Sopenharmony_ci	if (!invoke_need_callbacks())
44862306a36Sopenharmony_ci		return;
44962306a36Sopenharmony_ci
45062306a36Sopenharmony_ci	for_each_node_state(nid, N_MEMORY) {
45162306a36Sopenharmony_ci		unsigned long start_pfn, end_pfn;
45262306a36Sopenharmony_ci
45362306a36Sopenharmony_ci		start_pfn = node_start_pfn(nid);
45462306a36Sopenharmony_ci		end_pfn = node_end_pfn(nid);
45562306a36Sopenharmony_ci		/*
45662306a36Sopenharmony_ci		 * start_pfn and end_pfn may not be aligned to SECTION and the
45762306a36Sopenharmony_ci		 * page->flags of out of node pages are not initialized.  So we
45862306a36Sopenharmony_ci		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
45962306a36Sopenharmony_ci		 */
46062306a36Sopenharmony_ci		for (pfn = start_pfn; pfn < end_pfn;
46162306a36Sopenharmony_ci			pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
46262306a36Sopenharmony_ci
46362306a36Sopenharmony_ci			if (!pfn_valid(pfn))
46462306a36Sopenharmony_ci				continue;
46562306a36Sopenharmony_ci			/*
46662306a36Sopenharmony_ci			 * Nodes's pfns can be overlapping.
46762306a36Sopenharmony_ci			 * We know some arch can have a nodes layout such as
46862306a36Sopenharmony_ci			 * -------------pfn-------------->
46962306a36Sopenharmony_ci			 * N0 | N1 | N2 | N0 | N1 | N2|....
47062306a36Sopenharmony_ci			 */
47162306a36Sopenharmony_ci			if (pfn_to_nid(pfn) != nid)
47262306a36Sopenharmony_ci				continue;
47362306a36Sopenharmony_ci			if (init_section_page_ext(pfn, nid))
47462306a36Sopenharmony_ci				goto oom;
47562306a36Sopenharmony_ci			cond_resched();
47662306a36Sopenharmony_ci		}
47762306a36Sopenharmony_ci	}
47862306a36Sopenharmony_ci	hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI);
47962306a36Sopenharmony_ci	pr_info("allocated %ld bytes of page_ext\n", total_usage);
48062306a36Sopenharmony_ci	invoke_init_callbacks();
48162306a36Sopenharmony_ci	return;
48262306a36Sopenharmony_ci
48362306a36Sopenharmony_cioom:
48462306a36Sopenharmony_ci	panic("Out of memory");
48562306a36Sopenharmony_ci}
48662306a36Sopenharmony_ci
48762306a36Sopenharmony_civoid __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
48862306a36Sopenharmony_ci{
48962306a36Sopenharmony_ci}
49062306a36Sopenharmony_ci
49162306a36Sopenharmony_ci#endif
49262306a36Sopenharmony_ci
49362306a36Sopenharmony_ci/**
49462306a36Sopenharmony_ci * page_ext_get() - Get the extended information for a page.
49562306a36Sopenharmony_ci * @page: The page we're interested in.
49662306a36Sopenharmony_ci *
49762306a36Sopenharmony_ci * Ensures that the page_ext will remain valid until page_ext_put()
49862306a36Sopenharmony_ci * is called.
49962306a36Sopenharmony_ci *
50062306a36Sopenharmony_ci * Return: NULL if no page_ext exists for this page.
50162306a36Sopenharmony_ci * Context: Any context.  Caller may not sleep until they have called
50262306a36Sopenharmony_ci * page_ext_put().
50362306a36Sopenharmony_ci */
50462306a36Sopenharmony_cistruct page_ext *page_ext_get(struct page *page)
50562306a36Sopenharmony_ci{
50662306a36Sopenharmony_ci	struct page_ext *page_ext;
50762306a36Sopenharmony_ci
50862306a36Sopenharmony_ci	rcu_read_lock();
50962306a36Sopenharmony_ci	page_ext = lookup_page_ext(page);
51062306a36Sopenharmony_ci	if (!page_ext) {
51162306a36Sopenharmony_ci		rcu_read_unlock();
51262306a36Sopenharmony_ci		return NULL;
51362306a36Sopenharmony_ci	}
51462306a36Sopenharmony_ci
51562306a36Sopenharmony_ci	return page_ext;
51662306a36Sopenharmony_ci}
51762306a36Sopenharmony_ci
/**
 * page_ext_put() - Working with page extended information is done.
 * @page_ext: Page extended information received from page_ext_get().
 *
 * Drops the RCU read lock taken by page_ext_get().  A NULL @page_ext is
 * accepted and ignored, since page_ext_get() holds no lock in that case.
 * The page extended information of the page may not be valid after this
 * function is called.
 *
 * Return: None.
 * Context: Any context with corresponding page_ext_get() is called.
 */
void page_ext_put(struct page_ext *page_ext)
{
	if (likely(page_ext))
		rcu_read_unlock();
}
535