162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * mm/percpu.c - percpu memory allocator
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright (C) 2009		SUSE Linux Products GmbH
662306a36Sopenharmony_ci * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
762306a36Sopenharmony_ci *
862306a36Sopenharmony_ci * Copyright (C) 2017		Facebook Inc.
962306a36Sopenharmony_ci * Copyright (C) 2017		Dennis Zhou <dennis@kernel.org>
1062306a36Sopenharmony_ci *
1162306a36Sopenharmony_ci * The percpu allocator handles both static and dynamic areas.  Percpu
1262306a36Sopenharmony_ci * areas are allocated in chunks which are divided into units.  There is
1362306a36Sopenharmony_ci * a 1-to-1 mapping for units to possible cpus.  These units are grouped
1462306a36Sopenharmony_ci * based on NUMA properties of the machine.
1562306a36Sopenharmony_ci *
1662306a36Sopenharmony_ci *  c0                           c1                         c2
1762306a36Sopenharmony_ci *  -------------------          -------------------        ------------
1862306a36Sopenharmony_ci * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
1962306a36Sopenharmony_ci *  -------------------  ......  -------------------  ....  ------------
2062306a36Sopenharmony_ci *
2162306a36Sopenharmony_ci * Allocation is done by offsets into a unit's address space.  Ie., an
2262306a36Sopenharmony_ci * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
2362306a36Sopenharmony_ci * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
2462306a36Sopenharmony_ci * and even sparse.  Access is handled by configuring percpu base
2562306a36Sopenharmony_ci * registers according to the cpu to unit mappings and offsetting the
2662306a36Sopenharmony_ci * base address using pcpu_unit_size.
2762306a36Sopenharmony_ci *
2862306a36Sopenharmony_ci * There is special consideration for the first chunk which must handle
2962306a36Sopenharmony_ci * the static percpu variables in the kernel image as allocation services
3062306a36Sopenharmony_ci * are not online yet.  In short, the first chunk is structured like so:
3162306a36Sopenharmony_ci *
3262306a36Sopenharmony_ci *                  <Static | [Reserved] | Dynamic>
3362306a36Sopenharmony_ci *
3462306a36Sopenharmony_ci * The static data is copied from the original section managed by the
3562306a36Sopenharmony_ci * linker.  The reserved section, if non-zero, primarily manages static
3662306a36Sopenharmony_ci * percpu variables from kernel modules.  Finally, the dynamic section
3762306a36Sopenharmony_ci * takes care of normal allocations.
3862306a36Sopenharmony_ci *
3962306a36Sopenharmony_ci * The allocator organizes chunks into lists according to free size and
4062306a36Sopenharmony_ci * memcg-awareness.  To make a percpu allocation memcg-aware the __GFP_ACCOUNT
4162306a36Sopenharmony_ci * flag should be passed.  All memcg-aware allocations are sharing one set
4262306a36Sopenharmony_ci * of chunks and all unaccounted allocations and allocations performed
4362306a36Sopenharmony_ci * by processes belonging to the root memory cgroup are using the second set.
4462306a36Sopenharmony_ci *
4562306a36Sopenharmony_ci * The allocator tries to allocate from the fullest chunk first. Each chunk
4662306a36Sopenharmony_ci * is managed by a bitmap with metadata blocks.  The allocation map is updated
4762306a36Sopenharmony_ci * on every allocation and free to reflect the current state while the boundary
4862306a36Sopenharmony_ci * map is only updated on allocation.  Each metadata block contains
4962306a36Sopenharmony_ci * information to help mitigate the need to iterate over large portions
5062306a36Sopenharmony_ci * of the bitmap.  The reverse mapping from page to chunk is stored in
5162306a36Sopenharmony_ci * the page's index.  Lastly, units are lazily backed and grow in unison.
5262306a36Sopenharmony_ci *
5362306a36Sopenharmony_ci * There is a unique conversion that goes on here between bytes and bits.
5462306a36Sopenharmony_ci * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
5562306a36Sopenharmony_ci * tracks the number of pages it is responsible for in nr_pages.  Helper
 * functions are used to convert between bytes, bits, and blocks.
5762306a36Sopenharmony_ci * All hints are managed in bits unless explicitly stated.
5862306a36Sopenharmony_ci *
5962306a36Sopenharmony_ci * To use this allocator, arch code should do the following:
6062306a36Sopenharmony_ci *
6162306a36Sopenharmony_ci * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
6262306a36Sopenharmony_ci *   regular address to percpu pointer and back if they need to be
6362306a36Sopenharmony_ci *   different from the default
6462306a36Sopenharmony_ci *
6562306a36Sopenharmony_ci * - use pcpu_setup_first_chunk() during percpu area initialization to
6662306a36Sopenharmony_ci *   setup the first chunk containing the kernel static percpu area
6762306a36Sopenharmony_ci */
6862306a36Sopenharmony_ci
6962306a36Sopenharmony_ci#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7062306a36Sopenharmony_ci
7162306a36Sopenharmony_ci#include <linux/bitmap.h>
7262306a36Sopenharmony_ci#include <linux/cpumask.h>
7362306a36Sopenharmony_ci#include <linux/memblock.h>
7462306a36Sopenharmony_ci#include <linux/err.h>
7562306a36Sopenharmony_ci#include <linux/list.h>
7662306a36Sopenharmony_ci#include <linux/log2.h>
7762306a36Sopenharmony_ci#include <linux/mm.h>
7862306a36Sopenharmony_ci#include <linux/module.h>
7962306a36Sopenharmony_ci#include <linux/mutex.h>
8062306a36Sopenharmony_ci#include <linux/percpu.h>
8162306a36Sopenharmony_ci#include <linux/pfn.h>
8262306a36Sopenharmony_ci#include <linux/slab.h>
8362306a36Sopenharmony_ci#include <linux/spinlock.h>
8462306a36Sopenharmony_ci#include <linux/vmalloc.h>
8562306a36Sopenharmony_ci#include <linux/workqueue.h>
8662306a36Sopenharmony_ci#include <linux/kmemleak.h>
8762306a36Sopenharmony_ci#include <linux/sched.h>
8862306a36Sopenharmony_ci#include <linux/sched/mm.h>
8962306a36Sopenharmony_ci#include <linux/memcontrol.h>
9062306a36Sopenharmony_ci
9162306a36Sopenharmony_ci#include <asm/cacheflush.h>
9262306a36Sopenharmony_ci#include <asm/sections.h>
9362306a36Sopenharmony_ci#include <asm/tlbflush.h>
9462306a36Sopenharmony_ci#include <asm/io.h>
9562306a36Sopenharmony_ci
9662306a36Sopenharmony_ci#define CREATE_TRACE_POINTS
9762306a36Sopenharmony_ci#include <trace/events/percpu.h>
9862306a36Sopenharmony_ci
9962306a36Sopenharmony_ci#include "percpu-internal.h"
10062306a36Sopenharmony_ci
10162306a36Sopenharmony_ci/*
10262306a36Sopenharmony_ci * The slots are sorted by the size of the biggest continuous free area.
 * 1-15 bytes share the same slot.
10462306a36Sopenharmony_ci */
#define PCPU_SLOT_BASE_SHIFT		5

/*
 * Chunks sitting in a slot below this threshold may be sidelined when an
 * allocation from them fails.
 */
#define PCPU_SLOT_FAIL_THRESHOLD	3

/* low and high watermarks for the number of empty populated pages */
#define PCPU_EMPTY_POP_PAGES_LOW	2
#define PCPU_EMPTY_POP_PAGES_HIGH	4
11162306a36Sopenharmony_ci
#ifndef CONFIG_SMP
/* UP: percpu pointers and regular addresses are identity mapped */
#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
#else	/* CONFIG_SMP */
/*
 * SMP: default addr <-> pcpu_ptr translation, which rebases between
 * pcpu_base_addr and __per_cpu_start.  Either macro can be overridden
 * in asm/percpu.h if necessary.
 */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	(void __percpu *)((unsigned long)(addr) -			\
			  (unsigned long)pcpu_base_addr	+		\
			  (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	(void __force *)((unsigned long)(ptr) +				\
			 (unsigned long)pcpu_base_addr -		\
			 (unsigned long)__per_cpu_start)
#endif
#endif	/* !CONFIG_SMP */
13162306a36Sopenharmony_ci
13262306a36Sopenharmony_cistatic int pcpu_unit_pages __ro_after_init;
13362306a36Sopenharmony_cistatic int pcpu_unit_size __ro_after_init;
13462306a36Sopenharmony_cistatic int pcpu_nr_units __ro_after_init;
13562306a36Sopenharmony_cistatic int pcpu_atom_size __ro_after_init;
13662306a36Sopenharmony_ciint pcpu_nr_slots __ro_after_init;
13762306a36Sopenharmony_cistatic int pcpu_free_slot __ro_after_init;
13862306a36Sopenharmony_ciint pcpu_sidelined_slot __ro_after_init;
13962306a36Sopenharmony_ciint pcpu_to_depopulate_slot __ro_after_init;
14062306a36Sopenharmony_cistatic size_t pcpu_chunk_struct_size __ro_after_init;
14162306a36Sopenharmony_ci
14262306a36Sopenharmony_ci/* cpus with the lowest and highest unit addresses */
14362306a36Sopenharmony_cistatic unsigned int pcpu_low_unit_cpu __ro_after_init;
14462306a36Sopenharmony_cistatic unsigned int pcpu_high_unit_cpu __ro_after_init;
14562306a36Sopenharmony_ci
14662306a36Sopenharmony_ci/* the address of the first chunk which starts with the kernel static area */
14762306a36Sopenharmony_civoid *pcpu_base_addr __ro_after_init;
14862306a36Sopenharmony_ci
14962306a36Sopenharmony_cistatic const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
15062306a36Sopenharmony_ciconst unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */
15162306a36Sopenharmony_ci
15262306a36Sopenharmony_ci/* group information, used for vm allocation */
15362306a36Sopenharmony_cistatic int pcpu_nr_groups __ro_after_init;
15462306a36Sopenharmony_cistatic const unsigned long *pcpu_group_offsets __ro_after_init;
15562306a36Sopenharmony_cistatic const size_t *pcpu_group_sizes __ro_after_init;
15662306a36Sopenharmony_ci
15762306a36Sopenharmony_ci/*
15862306a36Sopenharmony_ci * The first chunk which always exists.  Note that unlike other
15962306a36Sopenharmony_ci * chunks, this one can be allocated and mapped in several different
16062306a36Sopenharmony_ci * ways and thus often doesn't live in the vmalloc area.
16162306a36Sopenharmony_ci */
16262306a36Sopenharmony_cistruct pcpu_chunk *pcpu_first_chunk __ro_after_init;
16362306a36Sopenharmony_ci
16462306a36Sopenharmony_ci/*
16562306a36Sopenharmony_ci * Optional reserved chunk.  This chunk reserves part of the first
16662306a36Sopenharmony_ci * chunk and serves it for reserved allocations.  When the reserved
16762306a36Sopenharmony_ci * region doesn't exist, the following variable is NULL.
16862306a36Sopenharmony_ci */
16962306a36Sopenharmony_cistruct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
17062306a36Sopenharmony_ci
17162306a36Sopenharmony_ciDEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
17262306a36Sopenharmony_cistatic DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */
17362306a36Sopenharmony_ci
17462306a36Sopenharmony_cistruct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
17562306a36Sopenharmony_ci
17662306a36Sopenharmony_ci/*
17762306a36Sopenharmony_ci * The number of empty populated pages, protected by pcpu_lock.
17862306a36Sopenharmony_ci * The reserved chunk doesn't contribute to the count.
17962306a36Sopenharmony_ci */
18062306a36Sopenharmony_ciint pcpu_nr_empty_pop_pages;
18162306a36Sopenharmony_ci
18262306a36Sopenharmony_ci/*
18362306a36Sopenharmony_ci * The number of populated pages in use by the allocator, protected by
18462306a36Sopenharmony_ci * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
18562306a36Sopenharmony_ci * allocated/deallocated, it is allocated/deallocated in all units of a chunk
18662306a36Sopenharmony_ci * and increments/decrements this count by 1).
18762306a36Sopenharmony_ci */
18862306a36Sopenharmony_cistatic unsigned long pcpu_nr_populated;
18962306a36Sopenharmony_ci
19062306a36Sopenharmony_ci/*
19162306a36Sopenharmony_ci * Balance work is used to populate or destroy chunks asynchronously.  We
19262306a36Sopenharmony_ci * try to keep the number of populated free pages between
19362306a36Sopenharmony_ci * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
19462306a36Sopenharmony_ci * empty chunk.
19562306a36Sopenharmony_ci */
19662306a36Sopenharmony_cistatic void pcpu_balance_workfn(struct work_struct *work);
19762306a36Sopenharmony_cistatic DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
19862306a36Sopenharmony_cistatic bool pcpu_async_enabled __read_mostly;
19962306a36Sopenharmony_cistatic bool pcpu_atomic_alloc_failed;
20062306a36Sopenharmony_ci
20162306a36Sopenharmony_cistatic void pcpu_schedule_balance_work(void)
20262306a36Sopenharmony_ci{
20362306a36Sopenharmony_ci	if (pcpu_async_enabled)
20462306a36Sopenharmony_ci		schedule_work(&pcpu_balance_work);
20562306a36Sopenharmony_ci}
20662306a36Sopenharmony_ci
20762306a36Sopenharmony_ci/**
20862306a36Sopenharmony_ci * pcpu_addr_in_chunk - check if the address is served from this chunk
20962306a36Sopenharmony_ci * @chunk: chunk of interest
21062306a36Sopenharmony_ci * @addr: percpu address
21162306a36Sopenharmony_ci *
21262306a36Sopenharmony_ci * RETURNS:
21362306a36Sopenharmony_ci * True if the address is served from this chunk.
21462306a36Sopenharmony_ci */
21562306a36Sopenharmony_cistatic bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
21662306a36Sopenharmony_ci{
21762306a36Sopenharmony_ci	void *start_addr, *end_addr;
21862306a36Sopenharmony_ci
21962306a36Sopenharmony_ci	if (!chunk)
22062306a36Sopenharmony_ci		return false;
22162306a36Sopenharmony_ci
22262306a36Sopenharmony_ci	start_addr = chunk->base_addr + chunk->start_offset;
22362306a36Sopenharmony_ci	end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
22462306a36Sopenharmony_ci		   chunk->end_offset;
22562306a36Sopenharmony_ci
22662306a36Sopenharmony_ci	return addr >= start_addr && addr < end_addr;
22762306a36Sopenharmony_ci}
22862306a36Sopenharmony_ci
22962306a36Sopenharmony_cistatic int __pcpu_size_to_slot(int size)
23062306a36Sopenharmony_ci{
23162306a36Sopenharmony_ci	int highbit = fls(size);	/* size is in bytes */
23262306a36Sopenharmony_ci	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
23362306a36Sopenharmony_ci}
23462306a36Sopenharmony_ci
23562306a36Sopenharmony_cistatic int pcpu_size_to_slot(int size)
23662306a36Sopenharmony_ci{
23762306a36Sopenharmony_ci	if (size == pcpu_unit_size)
23862306a36Sopenharmony_ci		return pcpu_free_slot;
23962306a36Sopenharmony_ci	return __pcpu_size_to_slot(size);
24062306a36Sopenharmony_ci}
24162306a36Sopenharmony_ci
24262306a36Sopenharmony_cistatic int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
24362306a36Sopenharmony_ci{
24462306a36Sopenharmony_ci	const struct pcpu_block_md *chunk_md = &chunk->chunk_md;
24562306a36Sopenharmony_ci
24662306a36Sopenharmony_ci	if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE ||
24762306a36Sopenharmony_ci	    chunk_md->contig_hint == 0)
24862306a36Sopenharmony_ci		return 0;
24962306a36Sopenharmony_ci
25062306a36Sopenharmony_ci	return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
25162306a36Sopenharmony_ci}
25262306a36Sopenharmony_ci
25362306a36Sopenharmony_ci/* set the pointer to a chunk in a page struct */
25462306a36Sopenharmony_cistatic void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
25562306a36Sopenharmony_ci{
25662306a36Sopenharmony_ci	page->index = (unsigned long)pcpu;
25762306a36Sopenharmony_ci}
25862306a36Sopenharmony_ci
25962306a36Sopenharmony_ci/* obtain pointer to a chunk from a page struct */
26062306a36Sopenharmony_cistatic struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
26162306a36Sopenharmony_ci{
26262306a36Sopenharmony_ci	return (struct pcpu_chunk *)page->index;
26362306a36Sopenharmony_ci}
26462306a36Sopenharmony_ci
26562306a36Sopenharmony_cistatic int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
26662306a36Sopenharmony_ci{
26762306a36Sopenharmony_ci	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
26862306a36Sopenharmony_ci}
26962306a36Sopenharmony_ci
27062306a36Sopenharmony_cistatic unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
27162306a36Sopenharmony_ci{
27262306a36Sopenharmony_ci	return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
27362306a36Sopenharmony_ci}
27462306a36Sopenharmony_ci
27562306a36Sopenharmony_cistatic unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
27662306a36Sopenharmony_ci				     unsigned int cpu, int page_idx)
27762306a36Sopenharmony_ci{
27862306a36Sopenharmony_ci	return (unsigned long)chunk->base_addr +
27962306a36Sopenharmony_ci	       pcpu_unit_page_offset(cpu, page_idx);
28062306a36Sopenharmony_ci}
28162306a36Sopenharmony_ci
28262306a36Sopenharmony_ci/*
28362306a36Sopenharmony_ci * The following are helper functions to help access bitmaps and convert
28462306a36Sopenharmony_ci * between bitmap offsets to address offsets.
28562306a36Sopenharmony_ci */
28662306a36Sopenharmony_cistatic unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
28762306a36Sopenharmony_ci{
28862306a36Sopenharmony_ci	return chunk->alloc_map +
28962306a36Sopenharmony_ci	       (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
29062306a36Sopenharmony_ci}
29162306a36Sopenharmony_ci
29262306a36Sopenharmony_cistatic unsigned long pcpu_off_to_block_index(int off)
29362306a36Sopenharmony_ci{
29462306a36Sopenharmony_ci	return off / PCPU_BITMAP_BLOCK_BITS;
29562306a36Sopenharmony_ci}
29662306a36Sopenharmony_ci
29762306a36Sopenharmony_cistatic unsigned long pcpu_off_to_block_off(int off)
29862306a36Sopenharmony_ci{
29962306a36Sopenharmony_ci	return off & (PCPU_BITMAP_BLOCK_BITS - 1);
30062306a36Sopenharmony_ci}
30162306a36Sopenharmony_ci
30262306a36Sopenharmony_cistatic unsigned long pcpu_block_off_to_off(int index, int off)
30362306a36Sopenharmony_ci{
30462306a36Sopenharmony_ci	return index * PCPU_BITMAP_BLOCK_BITS + off;
30562306a36Sopenharmony_ci}
30662306a36Sopenharmony_ci
30762306a36Sopenharmony_ci/**
30862306a36Sopenharmony_ci * pcpu_check_block_hint - check against the contig hint
30962306a36Sopenharmony_ci * @block: block of interest
31062306a36Sopenharmony_ci * @bits: size of allocation
31162306a36Sopenharmony_ci * @align: alignment of area (max PAGE_SIZE)
31262306a36Sopenharmony_ci *
31362306a36Sopenharmony_ci * Check to see if the allocation can fit in the block's contig hint.
31462306a36Sopenharmony_ci * Note, a chunk uses the same hints as a block so this can also check against
31562306a36Sopenharmony_ci * the chunk's contig hint.
31662306a36Sopenharmony_ci */
31762306a36Sopenharmony_cistatic bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
31862306a36Sopenharmony_ci				  size_t align)
31962306a36Sopenharmony_ci{
32062306a36Sopenharmony_ci	int bit_off = ALIGN(block->contig_hint_start, align) -
32162306a36Sopenharmony_ci		block->contig_hint_start;
32262306a36Sopenharmony_ci
32362306a36Sopenharmony_ci	return bit_off + bits <= block->contig_hint;
32462306a36Sopenharmony_ci}
32562306a36Sopenharmony_ci
32662306a36Sopenharmony_ci/*
32762306a36Sopenharmony_ci * pcpu_next_hint - determine which hint to use
32862306a36Sopenharmony_ci * @block: block of interest
32962306a36Sopenharmony_ci * @alloc_bits: size of allocation
33062306a36Sopenharmony_ci *
33162306a36Sopenharmony_ci * This determines if we should scan based on the scan_hint or first_free.
33262306a36Sopenharmony_ci * In general, we want to scan from first_free to fulfill allocations by
33362306a36Sopenharmony_ci * first fit.  However, if we know a scan_hint at position scan_hint_start
33462306a36Sopenharmony_ci * cannot fulfill an allocation, we can begin scanning from there knowing
33562306a36Sopenharmony_ci * the contig_hint will be our fallback.
33662306a36Sopenharmony_ci */
33762306a36Sopenharmony_cistatic int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
33862306a36Sopenharmony_ci{
33962306a36Sopenharmony_ci	/*
34062306a36Sopenharmony_ci	 * The three conditions below determine if we can skip past the
34162306a36Sopenharmony_ci	 * scan_hint.  First, does the scan hint exist.  Second, is the
34262306a36Sopenharmony_ci	 * contig_hint after the scan_hint (possibly not true iff
34362306a36Sopenharmony_ci	 * contig_hint == scan_hint).  Third, is the allocation request
34462306a36Sopenharmony_ci	 * larger than the scan_hint.
34562306a36Sopenharmony_ci	 */
34662306a36Sopenharmony_ci	if (block->scan_hint &&
34762306a36Sopenharmony_ci	    block->contig_hint_start > block->scan_hint_start &&
34862306a36Sopenharmony_ci	    alloc_bits > block->scan_hint)
34962306a36Sopenharmony_ci		return block->scan_hint_start + block->scan_hint;
35062306a36Sopenharmony_ci
35162306a36Sopenharmony_ci	return block->first_free;
35262306a36Sopenharmony_ci}
35362306a36Sopenharmony_ci
35462306a36Sopenharmony_ci/**
35562306a36Sopenharmony_ci * pcpu_next_md_free_region - finds the next hint free area
35662306a36Sopenharmony_ci * @chunk: chunk of interest
35762306a36Sopenharmony_ci * @bit_off: chunk offset
35862306a36Sopenharmony_ci * @bits: size of free area
35962306a36Sopenharmony_ci *
36062306a36Sopenharmony_ci * Helper function for pcpu_for_each_md_free_region.  It checks
36162306a36Sopenharmony_ci * block->contig_hint and performs aggregation across blocks to find the
36262306a36Sopenharmony_ci * next hint.  It modifies bit_off and bits in-place to be consumed in the
36362306a36Sopenharmony_ci * loop.
36462306a36Sopenharmony_ci */
36562306a36Sopenharmony_cistatic void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
36662306a36Sopenharmony_ci				     int *bits)
36762306a36Sopenharmony_ci{
36862306a36Sopenharmony_ci	int i = pcpu_off_to_block_index(*bit_off);
36962306a36Sopenharmony_ci	int block_off = pcpu_off_to_block_off(*bit_off);
37062306a36Sopenharmony_ci	struct pcpu_block_md *block;
37162306a36Sopenharmony_ci
37262306a36Sopenharmony_ci	*bits = 0;
37362306a36Sopenharmony_ci	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
37462306a36Sopenharmony_ci	     block++, i++) {
37562306a36Sopenharmony_ci		/* handles contig area across blocks */
37662306a36Sopenharmony_ci		if (*bits) {
37762306a36Sopenharmony_ci			*bits += block->left_free;
37862306a36Sopenharmony_ci			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
37962306a36Sopenharmony_ci				continue;
38062306a36Sopenharmony_ci			return;
38162306a36Sopenharmony_ci		}
38262306a36Sopenharmony_ci
38362306a36Sopenharmony_ci		/*
38462306a36Sopenharmony_ci		 * This checks three things.  First is there a contig_hint to
38562306a36Sopenharmony_ci		 * check.  Second, have we checked this hint before by
38662306a36Sopenharmony_ci		 * comparing the block_off.  Third, is this the same as the
38762306a36Sopenharmony_ci		 * right contig hint.  In the last case, it spills over into
38862306a36Sopenharmony_ci		 * the next block and should be handled by the contig area
38962306a36Sopenharmony_ci		 * across blocks code.
39062306a36Sopenharmony_ci		 */
39162306a36Sopenharmony_ci		*bits = block->contig_hint;
39262306a36Sopenharmony_ci		if (*bits && block->contig_hint_start >= block_off &&
39362306a36Sopenharmony_ci		    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
39462306a36Sopenharmony_ci			*bit_off = pcpu_block_off_to_off(i,
39562306a36Sopenharmony_ci					block->contig_hint_start);
39662306a36Sopenharmony_ci			return;
39762306a36Sopenharmony_ci		}
39862306a36Sopenharmony_ci		/* reset to satisfy the second predicate above */
39962306a36Sopenharmony_ci		block_off = 0;
40062306a36Sopenharmony_ci
40162306a36Sopenharmony_ci		*bits = block->right_free;
40262306a36Sopenharmony_ci		*bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
40362306a36Sopenharmony_ci	}
40462306a36Sopenharmony_ci}
40562306a36Sopenharmony_ci
40662306a36Sopenharmony_ci/**
40762306a36Sopenharmony_ci * pcpu_next_fit_region - finds fit areas for a given allocation request
40862306a36Sopenharmony_ci * @chunk: chunk of interest
40962306a36Sopenharmony_ci * @alloc_bits: size of allocation
41062306a36Sopenharmony_ci * @align: alignment of area (max PAGE_SIZE)
41162306a36Sopenharmony_ci * @bit_off: chunk offset
41262306a36Sopenharmony_ci * @bits: size of free area
41362306a36Sopenharmony_ci *
41462306a36Sopenharmony_ci * Finds the next free region that is viable for use with a given size and
41562306a36Sopenharmony_ci * alignment.  This only returns if there is a valid area to be used for this
41662306a36Sopenharmony_ci * allocation.  block->first_free is returned if the allocation request fits
41762306a36Sopenharmony_ci * within the block to see if the request can be fulfilled prior to the contig
41862306a36Sopenharmony_ci * hint.
41962306a36Sopenharmony_ci */
42062306a36Sopenharmony_cistatic void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
42162306a36Sopenharmony_ci				 int align, int *bit_off, int *bits)
42262306a36Sopenharmony_ci{
42362306a36Sopenharmony_ci	int i = pcpu_off_to_block_index(*bit_off);
42462306a36Sopenharmony_ci	int block_off = pcpu_off_to_block_off(*bit_off);
42562306a36Sopenharmony_ci	struct pcpu_block_md *block;
42662306a36Sopenharmony_ci
42762306a36Sopenharmony_ci	*bits = 0;
42862306a36Sopenharmony_ci	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
42962306a36Sopenharmony_ci	     block++, i++) {
43062306a36Sopenharmony_ci		/* handles contig area across blocks */
43162306a36Sopenharmony_ci		if (*bits) {
43262306a36Sopenharmony_ci			*bits += block->left_free;
43362306a36Sopenharmony_ci			if (*bits >= alloc_bits)
43462306a36Sopenharmony_ci				return;
43562306a36Sopenharmony_ci			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
43662306a36Sopenharmony_ci				continue;
43762306a36Sopenharmony_ci		}
43862306a36Sopenharmony_ci
43962306a36Sopenharmony_ci		/* check block->contig_hint */
44062306a36Sopenharmony_ci		*bits = ALIGN(block->contig_hint_start, align) -
44162306a36Sopenharmony_ci			block->contig_hint_start;
44262306a36Sopenharmony_ci		/*
44362306a36Sopenharmony_ci		 * This uses the block offset to determine if this has been
44462306a36Sopenharmony_ci		 * checked in the prior iteration.
44562306a36Sopenharmony_ci		 */
44662306a36Sopenharmony_ci		if (block->contig_hint &&
44762306a36Sopenharmony_ci		    block->contig_hint_start >= block_off &&
44862306a36Sopenharmony_ci		    block->contig_hint >= *bits + alloc_bits) {
44962306a36Sopenharmony_ci			int start = pcpu_next_hint(block, alloc_bits);
45062306a36Sopenharmony_ci
45162306a36Sopenharmony_ci			*bits += alloc_bits + block->contig_hint_start -
45262306a36Sopenharmony_ci				 start;
45362306a36Sopenharmony_ci			*bit_off = pcpu_block_off_to_off(i, start);
45462306a36Sopenharmony_ci			return;
45562306a36Sopenharmony_ci		}
45662306a36Sopenharmony_ci		/* reset to satisfy the second predicate above */
45762306a36Sopenharmony_ci		block_off = 0;
45862306a36Sopenharmony_ci
45962306a36Sopenharmony_ci		*bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
46062306a36Sopenharmony_ci				 align);
46162306a36Sopenharmony_ci		*bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
46262306a36Sopenharmony_ci		*bit_off = pcpu_block_off_to_off(i, *bit_off);
46362306a36Sopenharmony_ci		if (*bits >= alloc_bits)
46462306a36Sopenharmony_ci			return;
46562306a36Sopenharmony_ci	}
46662306a36Sopenharmony_ci
46762306a36Sopenharmony_ci	/* no valid offsets were found - fail condition */
46862306a36Sopenharmony_ci	*bit_off = pcpu_chunk_map_bits(chunk);
46962306a36Sopenharmony_ci}
47062306a36Sopenharmony_ci
/*
 * Iterators over free areas, driven by the metadata blocks.  On each pass
 * @bit_off holds the offset of the free area and @bits its size in bits.
 * Both loops end once @bit_off reaches pcpu_chunk_map_bits().
 * pcpu_for_each_fit_region only yields areas that fit the allocation
 * request.
 */
#define pcpu_for_each_md_free_region(chunk, bit_off, bits)		\
	for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));	\
	     (bit_off) < pcpu_chunk_map_bits((chunk));			\
	     (bit_off) += (bits) + 1,					\
	     pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))

#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
	for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
				  &(bits));				      \
	     (bit_off) < pcpu_chunk_map_bits((chunk));			      \
	     (bit_off) += (bits),					      \
	     pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
				  &(bits)))
49062306a36Sopenharmony_ci
49162306a36Sopenharmony_ci/**
49262306a36Sopenharmony_ci * pcpu_mem_zalloc - allocate memory
49362306a36Sopenharmony_ci * @size: bytes to allocate
49462306a36Sopenharmony_ci * @gfp: allocation flags
49562306a36Sopenharmony_ci *
49662306a36Sopenharmony_ci * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
49762306a36Sopenharmony_ci * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
49862306a36Sopenharmony_ci * This is to facilitate passing through whitelisted flags.  The
49962306a36Sopenharmony_ci * returned memory is always zeroed.
50062306a36Sopenharmony_ci *
50162306a36Sopenharmony_ci * RETURNS:
50262306a36Sopenharmony_ci * Pointer to the allocated area on success, NULL on failure.
50362306a36Sopenharmony_ci */
50462306a36Sopenharmony_cistatic void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
50562306a36Sopenharmony_ci{
50662306a36Sopenharmony_ci	if (WARN_ON_ONCE(!slab_is_available()))
50762306a36Sopenharmony_ci		return NULL;
50862306a36Sopenharmony_ci
50962306a36Sopenharmony_ci	if (size <= PAGE_SIZE)
51062306a36Sopenharmony_ci		return kzalloc(size, gfp);
51162306a36Sopenharmony_ci	else
51262306a36Sopenharmony_ci		return __vmalloc(size, gfp | __GFP_ZERO);
51362306a36Sopenharmony_ci}
51462306a36Sopenharmony_ci
/**
 * pcpu_mem_free - release memory obtained from pcpu_mem_zalloc()
 * @ptr: memory to free
 *
 * @ptr should have been allocated using pcpu_mem_zalloc().  It is released
 * with kvfree().
 */
static void pcpu_mem_free(void *ptr)
{
	kvfree(ptr);
}
52562306a36Sopenharmony_ci
52662306a36Sopenharmony_cistatic void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
52762306a36Sopenharmony_ci			      bool move_front)
52862306a36Sopenharmony_ci{
52962306a36Sopenharmony_ci	if (chunk != pcpu_reserved_chunk) {
53062306a36Sopenharmony_ci		if (move_front)
53162306a36Sopenharmony_ci			list_move(&chunk->list, &pcpu_chunk_lists[slot]);
53262306a36Sopenharmony_ci		else
53362306a36Sopenharmony_ci			list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]);
53462306a36Sopenharmony_ci	}
53562306a36Sopenharmony_ci}
53662306a36Sopenharmony_ci
53762306a36Sopenharmony_cistatic void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
53862306a36Sopenharmony_ci{
53962306a36Sopenharmony_ci	__pcpu_chunk_move(chunk, slot, true);
54062306a36Sopenharmony_ci}
54162306a36Sopenharmony_ci
54262306a36Sopenharmony_ci/**
54362306a36Sopenharmony_ci * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
54462306a36Sopenharmony_ci * @chunk: chunk of interest
54562306a36Sopenharmony_ci * @oslot: the previous slot it was on
54662306a36Sopenharmony_ci *
54762306a36Sopenharmony_ci * This function is called after an allocation or free changed @chunk.
54862306a36Sopenharmony_ci * New slot according to the changed state is determined and @chunk is
54962306a36Sopenharmony_ci * moved to the slot.  Note that the reserved chunk is never put on
55062306a36Sopenharmony_ci * chunk slots.
55162306a36Sopenharmony_ci *
55262306a36Sopenharmony_ci * CONTEXT:
55362306a36Sopenharmony_ci * pcpu_lock.
55462306a36Sopenharmony_ci */
55562306a36Sopenharmony_cistatic void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
55662306a36Sopenharmony_ci{
55762306a36Sopenharmony_ci	int nslot = pcpu_chunk_slot(chunk);
55862306a36Sopenharmony_ci
55962306a36Sopenharmony_ci	/* leave isolated chunks in-place */
56062306a36Sopenharmony_ci	if (chunk->isolated)
56162306a36Sopenharmony_ci		return;
56262306a36Sopenharmony_ci
56362306a36Sopenharmony_ci	if (oslot != nslot)
56462306a36Sopenharmony_ci		__pcpu_chunk_move(chunk, nslot, oslot < nslot);
56562306a36Sopenharmony_ci}
56662306a36Sopenharmony_ci
56762306a36Sopenharmony_cistatic void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
56862306a36Sopenharmony_ci{
56962306a36Sopenharmony_ci	lockdep_assert_held(&pcpu_lock);
57062306a36Sopenharmony_ci
57162306a36Sopenharmony_ci	if (!chunk->isolated) {
57262306a36Sopenharmony_ci		chunk->isolated = true;
57362306a36Sopenharmony_ci		pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
57462306a36Sopenharmony_ci	}
57562306a36Sopenharmony_ci	list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]);
57662306a36Sopenharmony_ci}
57762306a36Sopenharmony_ci
57862306a36Sopenharmony_cistatic void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
57962306a36Sopenharmony_ci{
58062306a36Sopenharmony_ci	lockdep_assert_held(&pcpu_lock);
58162306a36Sopenharmony_ci
58262306a36Sopenharmony_ci	if (chunk->isolated) {
58362306a36Sopenharmony_ci		chunk->isolated = false;
58462306a36Sopenharmony_ci		pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;
58562306a36Sopenharmony_ci		pcpu_chunk_relocate(chunk, -1);
58662306a36Sopenharmony_ci	}
58762306a36Sopenharmony_ci}
58862306a36Sopenharmony_ci
58962306a36Sopenharmony_ci/*
59062306a36Sopenharmony_ci * pcpu_update_empty_pages - update empty page counters
59162306a36Sopenharmony_ci * @chunk: chunk of interest
59262306a36Sopenharmony_ci * @nr: nr of empty pages
59362306a36Sopenharmony_ci *
59462306a36Sopenharmony_ci * This is used to keep track of the empty pages now based on the premise
59562306a36Sopenharmony_ci * a md_block covers a page.  The hint update functions recognize if a block
59662306a36Sopenharmony_ci * is made full or broken to calculate deltas for keeping track of free pages.
59762306a36Sopenharmony_ci */
59862306a36Sopenharmony_cistatic inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
59962306a36Sopenharmony_ci{
60062306a36Sopenharmony_ci	chunk->nr_empty_pop_pages += nr;
60162306a36Sopenharmony_ci	if (chunk != pcpu_reserved_chunk && !chunk->isolated)
60262306a36Sopenharmony_ci		pcpu_nr_empty_pop_pages += nr;
60362306a36Sopenharmony_ci}
60462306a36Sopenharmony_ci
/*
 * pcpu_region_overlap - determines if two regions overlap
 * @a: start of first region, inclusive
 * @b: end of first region, exclusive
 * @x: start of second region, inclusive
 * @y: end of second region, exclusive
 *
 * Used to check whether the hint region [a, b) overlaps the allocated
 * region [x, y).  The regions are disjoint exactly when one of them starts
 * at or after the end of the other.
 */
static inline bool pcpu_region_overlap(int a, int b, int x, int y)
{
	return !(a >= y || x >= b);
}
61962306a36Sopenharmony_ci
62062306a36Sopenharmony_ci/**
62162306a36Sopenharmony_ci * pcpu_block_update - updates a block given a free area
62262306a36Sopenharmony_ci * @block: block of interest
62362306a36Sopenharmony_ci * @start: start offset in block
62462306a36Sopenharmony_ci * @end: end offset in block
62562306a36Sopenharmony_ci *
62662306a36Sopenharmony_ci * Updates a block given a known free area.  The region [start, end) is
62762306a36Sopenharmony_ci * expected to be the entirety of the free area within a block.  Chooses
62862306a36Sopenharmony_ci * the best starting offset if the contig hints are equal.
62962306a36Sopenharmony_ci */
63062306a36Sopenharmony_cistatic void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
63162306a36Sopenharmony_ci{
63262306a36Sopenharmony_ci	int contig = end - start;
63362306a36Sopenharmony_ci
63462306a36Sopenharmony_ci	block->first_free = min(block->first_free, start);
63562306a36Sopenharmony_ci	if (start == 0)
63662306a36Sopenharmony_ci		block->left_free = contig;
63762306a36Sopenharmony_ci
63862306a36Sopenharmony_ci	if (end == block->nr_bits)
63962306a36Sopenharmony_ci		block->right_free = contig;
64062306a36Sopenharmony_ci
64162306a36Sopenharmony_ci	if (contig > block->contig_hint) {
64262306a36Sopenharmony_ci		/* promote the old contig_hint to be the new scan_hint */
64362306a36Sopenharmony_ci		if (start > block->contig_hint_start) {
64462306a36Sopenharmony_ci			if (block->contig_hint > block->scan_hint) {
64562306a36Sopenharmony_ci				block->scan_hint_start =
64662306a36Sopenharmony_ci					block->contig_hint_start;
64762306a36Sopenharmony_ci				block->scan_hint = block->contig_hint;
64862306a36Sopenharmony_ci			} else if (start < block->scan_hint_start) {
64962306a36Sopenharmony_ci				/*
65062306a36Sopenharmony_ci				 * The old contig_hint == scan_hint.  But, the
65162306a36Sopenharmony_ci				 * new contig is larger so hold the invariant
65262306a36Sopenharmony_ci				 * scan_hint_start < contig_hint_start.
65362306a36Sopenharmony_ci				 */
65462306a36Sopenharmony_ci				block->scan_hint = 0;
65562306a36Sopenharmony_ci			}
65662306a36Sopenharmony_ci		} else {
65762306a36Sopenharmony_ci			block->scan_hint = 0;
65862306a36Sopenharmony_ci		}
65962306a36Sopenharmony_ci		block->contig_hint_start = start;
66062306a36Sopenharmony_ci		block->contig_hint = contig;
66162306a36Sopenharmony_ci	} else if (contig == block->contig_hint) {
66262306a36Sopenharmony_ci		if (block->contig_hint_start &&
66362306a36Sopenharmony_ci		    (!start ||
66462306a36Sopenharmony_ci		     __ffs(start) > __ffs(block->contig_hint_start))) {
66562306a36Sopenharmony_ci			/* start has a better alignment so use it */
66662306a36Sopenharmony_ci			block->contig_hint_start = start;
66762306a36Sopenharmony_ci			if (start < block->scan_hint_start &&
66862306a36Sopenharmony_ci			    block->contig_hint > block->scan_hint)
66962306a36Sopenharmony_ci				block->scan_hint = 0;
67062306a36Sopenharmony_ci		} else if (start > block->scan_hint_start ||
67162306a36Sopenharmony_ci			   block->contig_hint > block->scan_hint) {
67262306a36Sopenharmony_ci			/*
67362306a36Sopenharmony_ci			 * Knowing contig == contig_hint, update the scan_hint
67462306a36Sopenharmony_ci			 * if it is farther than or larger than the current
67562306a36Sopenharmony_ci			 * scan_hint.
67662306a36Sopenharmony_ci			 */
67762306a36Sopenharmony_ci			block->scan_hint_start = start;
67862306a36Sopenharmony_ci			block->scan_hint = contig;
67962306a36Sopenharmony_ci		}
68062306a36Sopenharmony_ci	} else {
68162306a36Sopenharmony_ci		/*
68262306a36Sopenharmony_ci		 * The region is smaller than the contig_hint.  So only update
68362306a36Sopenharmony_ci		 * the scan_hint if it is larger than or equal and farther than
68462306a36Sopenharmony_ci		 * the current scan_hint.
68562306a36Sopenharmony_ci		 */
68662306a36Sopenharmony_ci		if ((start < block->contig_hint_start &&
68762306a36Sopenharmony_ci		     (contig > block->scan_hint ||
68862306a36Sopenharmony_ci		      (contig == block->scan_hint &&
68962306a36Sopenharmony_ci		       start > block->scan_hint_start)))) {
69062306a36Sopenharmony_ci			block->scan_hint_start = start;
69162306a36Sopenharmony_ci			block->scan_hint = contig;
69262306a36Sopenharmony_ci		}
69362306a36Sopenharmony_ci	}
69462306a36Sopenharmony_ci}
69562306a36Sopenharmony_ci
69662306a36Sopenharmony_ci/*
69762306a36Sopenharmony_ci * pcpu_block_update_scan - update a block given a free area from a scan
69862306a36Sopenharmony_ci * @chunk: chunk of interest
69962306a36Sopenharmony_ci * @bit_off: chunk offset
70062306a36Sopenharmony_ci * @bits: size of free area
70162306a36Sopenharmony_ci *
70262306a36Sopenharmony_ci * Finding the final allocation spot first goes through pcpu_find_block_fit()
70362306a36Sopenharmony_ci * to find a block that can hold the allocation and then pcpu_alloc_area()
70462306a36Sopenharmony_ci * where a scan is used.  When allocations require specific alignments,
70562306a36Sopenharmony_ci * we can inadvertently create holes which will not be seen in the alloc
70662306a36Sopenharmony_ci * or free paths.
70762306a36Sopenharmony_ci *
70862306a36Sopenharmony_ci * This takes a given free area hole and updates a block as it may change the
70962306a36Sopenharmony_ci * scan_hint.  We need to scan backwards to ensure we don't miss free bits
71062306a36Sopenharmony_ci * from alignment.
71162306a36Sopenharmony_ci */
71262306a36Sopenharmony_cistatic void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
71362306a36Sopenharmony_ci				   int bits)
71462306a36Sopenharmony_ci{
71562306a36Sopenharmony_ci	int s_off = pcpu_off_to_block_off(bit_off);
71662306a36Sopenharmony_ci	int e_off = s_off + bits;
71762306a36Sopenharmony_ci	int s_index, l_bit;
71862306a36Sopenharmony_ci	struct pcpu_block_md *block;
71962306a36Sopenharmony_ci
72062306a36Sopenharmony_ci	if (e_off > PCPU_BITMAP_BLOCK_BITS)
72162306a36Sopenharmony_ci		return;
72262306a36Sopenharmony_ci
72362306a36Sopenharmony_ci	s_index = pcpu_off_to_block_index(bit_off);
72462306a36Sopenharmony_ci	block = chunk->md_blocks + s_index;
72562306a36Sopenharmony_ci
72662306a36Sopenharmony_ci	/* scan backwards in case of alignment skipping free bits */
72762306a36Sopenharmony_ci	l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
72862306a36Sopenharmony_ci	s_off = (s_off == l_bit) ? 0 : l_bit + 1;
72962306a36Sopenharmony_ci
73062306a36Sopenharmony_ci	pcpu_block_update(block, s_off, e_off);
73162306a36Sopenharmony_ci}
73262306a36Sopenharmony_ci
73362306a36Sopenharmony_ci/**
73462306a36Sopenharmony_ci * pcpu_chunk_refresh_hint - updates metadata about a chunk
73562306a36Sopenharmony_ci * @chunk: chunk of interest
73662306a36Sopenharmony_ci * @full_scan: if we should scan from the beginning
73762306a36Sopenharmony_ci *
73862306a36Sopenharmony_ci * Iterates over the metadata blocks to find the largest contig area.
73962306a36Sopenharmony_ci * A full scan can be avoided on the allocation path as this is triggered
74062306a36Sopenharmony_ci * if we broke the contig_hint.  In doing so, the scan_hint will be before
74162306a36Sopenharmony_ci * the contig_hint or after if the scan_hint == contig_hint.  This cannot
74262306a36Sopenharmony_ci * be prevented on freeing as we want to find the largest area possibly
74362306a36Sopenharmony_ci * spanning blocks.
74462306a36Sopenharmony_ci */
74562306a36Sopenharmony_cistatic void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
74662306a36Sopenharmony_ci{
74762306a36Sopenharmony_ci	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
74862306a36Sopenharmony_ci	int bit_off, bits;
74962306a36Sopenharmony_ci
75062306a36Sopenharmony_ci	/* promote scan_hint to contig_hint */
75162306a36Sopenharmony_ci	if (!full_scan && chunk_md->scan_hint) {
75262306a36Sopenharmony_ci		bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
75362306a36Sopenharmony_ci		chunk_md->contig_hint_start = chunk_md->scan_hint_start;
75462306a36Sopenharmony_ci		chunk_md->contig_hint = chunk_md->scan_hint;
75562306a36Sopenharmony_ci		chunk_md->scan_hint = 0;
75662306a36Sopenharmony_ci	} else {
75762306a36Sopenharmony_ci		bit_off = chunk_md->first_free;
75862306a36Sopenharmony_ci		chunk_md->contig_hint = 0;
75962306a36Sopenharmony_ci	}
76062306a36Sopenharmony_ci
76162306a36Sopenharmony_ci	bits = 0;
76262306a36Sopenharmony_ci	pcpu_for_each_md_free_region(chunk, bit_off, bits)
76362306a36Sopenharmony_ci		pcpu_block_update(chunk_md, bit_off, bit_off + bits);
76462306a36Sopenharmony_ci}
76562306a36Sopenharmony_ci
76662306a36Sopenharmony_ci/**
76762306a36Sopenharmony_ci * pcpu_block_refresh_hint
76862306a36Sopenharmony_ci * @chunk: chunk of interest
76962306a36Sopenharmony_ci * @index: index of the metadata block
77062306a36Sopenharmony_ci *
77162306a36Sopenharmony_ci * Scans over the block beginning at first_free and updates the block
77262306a36Sopenharmony_ci * metadata accordingly.
77362306a36Sopenharmony_ci */
77462306a36Sopenharmony_cistatic void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
77562306a36Sopenharmony_ci{
77662306a36Sopenharmony_ci	struct pcpu_block_md *block = chunk->md_blocks + index;
77762306a36Sopenharmony_ci	unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
77862306a36Sopenharmony_ci	unsigned int start, end;	/* region start, region end */
77962306a36Sopenharmony_ci
78062306a36Sopenharmony_ci	/* promote scan_hint to contig_hint */
78162306a36Sopenharmony_ci	if (block->scan_hint) {
78262306a36Sopenharmony_ci		start = block->scan_hint_start + block->scan_hint;
78362306a36Sopenharmony_ci		block->contig_hint_start = block->scan_hint_start;
78462306a36Sopenharmony_ci		block->contig_hint = block->scan_hint;
78562306a36Sopenharmony_ci		block->scan_hint = 0;
78662306a36Sopenharmony_ci	} else {
78762306a36Sopenharmony_ci		start = block->first_free;
78862306a36Sopenharmony_ci		block->contig_hint = 0;
78962306a36Sopenharmony_ci	}
79062306a36Sopenharmony_ci
79162306a36Sopenharmony_ci	block->right_free = 0;
79262306a36Sopenharmony_ci
79362306a36Sopenharmony_ci	/* iterate over free areas and update the contig hints */
79462306a36Sopenharmony_ci	for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS)
79562306a36Sopenharmony_ci		pcpu_block_update(block, start, end);
79662306a36Sopenharmony_ci}
79762306a36Sopenharmony_ci
79862306a36Sopenharmony_ci/**
79962306a36Sopenharmony_ci * pcpu_block_update_hint_alloc - update hint on allocation path
80062306a36Sopenharmony_ci * @chunk: chunk of interest
80162306a36Sopenharmony_ci * @bit_off: chunk offset
80262306a36Sopenharmony_ci * @bits: size of request
80362306a36Sopenharmony_ci *
80462306a36Sopenharmony_ci * Updates metadata for the allocation path.  The metadata only has to be
80562306a36Sopenharmony_ci * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
80662306a36Sopenharmony_ci * scans are required if the block's contig hint is broken.
80762306a36Sopenharmony_ci */
80862306a36Sopenharmony_cistatic void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
80962306a36Sopenharmony_ci					 int bits)
81062306a36Sopenharmony_ci{
81162306a36Sopenharmony_ci	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
81262306a36Sopenharmony_ci	int nr_empty_pages = 0;
81362306a36Sopenharmony_ci	struct pcpu_block_md *s_block, *e_block, *block;
81462306a36Sopenharmony_ci	int s_index, e_index;	/* block indexes of the freed allocation */
81562306a36Sopenharmony_ci	int s_off, e_off;	/* block offsets of the freed allocation */
81662306a36Sopenharmony_ci
81762306a36Sopenharmony_ci	/*
81862306a36Sopenharmony_ci	 * Calculate per block offsets.
81962306a36Sopenharmony_ci	 * The calculation uses an inclusive range, but the resulting offsets
82062306a36Sopenharmony_ci	 * are [start, end).  e_index always points to the last block in the
82162306a36Sopenharmony_ci	 * range.
82262306a36Sopenharmony_ci	 */
82362306a36Sopenharmony_ci	s_index = pcpu_off_to_block_index(bit_off);
82462306a36Sopenharmony_ci	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
82562306a36Sopenharmony_ci	s_off = pcpu_off_to_block_off(bit_off);
82662306a36Sopenharmony_ci	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
82762306a36Sopenharmony_ci
82862306a36Sopenharmony_ci	s_block = chunk->md_blocks + s_index;
82962306a36Sopenharmony_ci	e_block = chunk->md_blocks + e_index;
83062306a36Sopenharmony_ci
83162306a36Sopenharmony_ci	/*
83262306a36Sopenharmony_ci	 * Update s_block.
83362306a36Sopenharmony_ci	 */
83462306a36Sopenharmony_ci	if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
83562306a36Sopenharmony_ci		nr_empty_pages++;
83662306a36Sopenharmony_ci
83762306a36Sopenharmony_ci	/*
83862306a36Sopenharmony_ci	 * block->first_free must be updated if the allocation takes its place.
83962306a36Sopenharmony_ci	 * If the allocation breaks the contig_hint, a scan is required to
84062306a36Sopenharmony_ci	 * restore this hint.
84162306a36Sopenharmony_ci	 */
84262306a36Sopenharmony_ci	if (s_off == s_block->first_free)
84362306a36Sopenharmony_ci		s_block->first_free = find_next_zero_bit(
84462306a36Sopenharmony_ci					pcpu_index_alloc_map(chunk, s_index),
84562306a36Sopenharmony_ci					PCPU_BITMAP_BLOCK_BITS,
84662306a36Sopenharmony_ci					s_off + bits);
84762306a36Sopenharmony_ci
84862306a36Sopenharmony_ci	if (pcpu_region_overlap(s_block->scan_hint_start,
84962306a36Sopenharmony_ci				s_block->scan_hint_start + s_block->scan_hint,
85062306a36Sopenharmony_ci				s_off,
85162306a36Sopenharmony_ci				s_off + bits))
85262306a36Sopenharmony_ci		s_block->scan_hint = 0;
85362306a36Sopenharmony_ci
85462306a36Sopenharmony_ci	if (pcpu_region_overlap(s_block->contig_hint_start,
85562306a36Sopenharmony_ci				s_block->contig_hint_start +
85662306a36Sopenharmony_ci				s_block->contig_hint,
85762306a36Sopenharmony_ci				s_off,
85862306a36Sopenharmony_ci				s_off + bits)) {
85962306a36Sopenharmony_ci		/* block contig hint is broken - scan to fix it */
86062306a36Sopenharmony_ci		if (!s_off)
86162306a36Sopenharmony_ci			s_block->left_free = 0;
86262306a36Sopenharmony_ci		pcpu_block_refresh_hint(chunk, s_index);
86362306a36Sopenharmony_ci	} else {
86462306a36Sopenharmony_ci		/* update left and right contig manually */
86562306a36Sopenharmony_ci		s_block->left_free = min(s_block->left_free, s_off);
86662306a36Sopenharmony_ci		if (s_index == e_index)
86762306a36Sopenharmony_ci			s_block->right_free = min_t(int, s_block->right_free,
86862306a36Sopenharmony_ci					PCPU_BITMAP_BLOCK_BITS - e_off);
86962306a36Sopenharmony_ci		else
87062306a36Sopenharmony_ci			s_block->right_free = 0;
87162306a36Sopenharmony_ci	}
87262306a36Sopenharmony_ci
87362306a36Sopenharmony_ci	/*
87462306a36Sopenharmony_ci	 * Update e_block.
87562306a36Sopenharmony_ci	 */
87662306a36Sopenharmony_ci	if (s_index != e_index) {
87762306a36Sopenharmony_ci		if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
87862306a36Sopenharmony_ci			nr_empty_pages++;
87962306a36Sopenharmony_ci
88062306a36Sopenharmony_ci		/*
88162306a36Sopenharmony_ci		 * When the allocation is across blocks, the end is along
88262306a36Sopenharmony_ci		 * the left part of the e_block.
88362306a36Sopenharmony_ci		 */
88462306a36Sopenharmony_ci		e_block->first_free = find_next_zero_bit(
88562306a36Sopenharmony_ci				pcpu_index_alloc_map(chunk, e_index),
88662306a36Sopenharmony_ci				PCPU_BITMAP_BLOCK_BITS, e_off);
88762306a36Sopenharmony_ci
88862306a36Sopenharmony_ci		if (e_off == PCPU_BITMAP_BLOCK_BITS) {
88962306a36Sopenharmony_ci			/* reset the block */
89062306a36Sopenharmony_ci			e_block++;
89162306a36Sopenharmony_ci		} else {
89262306a36Sopenharmony_ci			if (e_off > e_block->scan_hint_start)
89362306a36Sopenharmony_ci				e_block->scan_hint = 0;
89462306a36Sopenharmony_ci
89562306a36Sopenharmony_ci			e_block->left_free = 0;
89662306a36Sopenharmony_ci			if (e_off > e_block->contig_hint_start) {
89762306a36Sopenharmony_ci				/* contig hint is broken - scan to fix it */
89862306a36Sopenharmony_ci				pcpu_block_refresh_hint(chunk, e_index);
89962306a36Sopenharmony_ci			} else {
90062306a36Sopenharmony_ci				e_block->right_free =
90162306a36Sopenharmony_ci					min_t(int, e_block->right_free,
90262306a36Sopenharmony_ci					      PCPU_BITMAP_BLOCK_BITS - e_off);
90362306a36Sopenharmony_ci			}
90462306a36Sopenharmony_ci		}
90562306a36Sopenharmony_ci
90662306a36Sopenharmony_ci		/* update in-between md_blocks */
90762306a36Sopenharmony_ci		nr_empty_pages += (e_index - s_index - 1);
90862306a36Sopenharmony_ci		for (block = s_block + 1; block < e_block; block++) {
90962306a36Sopenharmony_ci			block->scan_hint = 0;
91062306a36Sopenharmony_ci			block->contig_hint = 0;
91162306a36Sopenharmony_ci			block->left_free = 0;
91262306a36Sopenharmony_ci			block->right_free = 0;
91362306a36Sopenharmony_ci		}
91462306a36Sopenharmony_ci	}
91562306a36Sopenharmony_ci
91662306a36Sopenharmony_ci	/*
91762306a36Sopenharmony_ci	 * If the allocation is not atomic, some blocks may not be
91862306a36Sopenharmony_ci	 * populated with pages, while we account it here.  The number
91962306a36Sopenharmony_ci	 * of pages will be added back with pcpu_chunk_populated()
92062306a36Sopenharmony_ci	 * when populating pages.
92162306a36Sopenharmony_ci	 */
92262306a36Sopenharmony_ci	if (nr_empty_pages)
92362306a36Sopenharmony_ci		pcpu_update_empty_pages(chunk, -nr_empty_pages);
92462306a36Sopenharmony_ci
92562306a36Sopenharmony_ci	if (pcpu_region_overlap(chunk_md->scan_hint_start,
92662306a36Sopenharmony_ci				chunk_md->scan_hint_start +
92762306a36Sopenharmony_ci				chunk_md->scan_hint,
92862306a36Sopenharmony_ci				bit_off,
92962306a36Sopenharmony_ci				bit_off + bits))
93062306a36Sopenharmony_ci		chunk_md->scan_hint = 0;
93162306a36Sopenharmony_ci
93262306a36Sopenharmony_ci	/*
93362306a36Sopenharmony_ci	 * The only time a full chunk scan is required is if the chunk
93462306a36Sopenharmony_ci	 * contig hint is broken.  Otherwise, it means a smaller space
93562306a36Sopenharmony_ci	 * was used and therefore the chunk contig hint is still correct.
93662306a36Sopenharmony_ci	 */
93762306a36Sopenharmony_ci	if (pcpu_region_overlap(chunk_md->contig_hint_start,
93862306a36Sopenharmony_ci				chunk_md->contig_hint_start +
93962306a36Sopenharmony_ci				chunk_md->contig_hint,
94062306a36Sopenharmony_ci				bit_off,
94162306a36Sopenharmony_ci				bit_off + bits))
94262306a36Sopenharmony_ci		pcpu_chunk_refresh_hint(chunk, false);
94362306a36Sopenharmony_ci}
94462306a36Sopenharmony_ci
94562306a36Sopenharmony_ci/**
94662306a36Sopenharmony_ci * pcpu_block_update_hint_free - updates the block hints on the free path
94762306a36Sopenharmony_ci * @chunk: chunk of interest
94862306a36Sopenharmony_ci * @bit_off: chunk offset
94962306a36Sopenharmony_ci * @bits: size of request
95062306a36Sopenharmony_ci *
95162306a36Sopenharmony_ci * Updates metadata for the allocation path.  This avoids a blind block
95262306a36Sopenharmony_ci * refresh by making use of the block contig hints.  If this fails, it scans
95362306a36Sopenharmony_ci * forward and backward to determine the extent of the free area.  This is
95462306a36Sopenharmony_ci * capped at the boundary of blocks.
95562306a36Sopenharmony_ci *
95662306a36Sopenharmony_ci * A chunk update is triggered if a page becomes free, a block becomes free,
95762306a36Sopenharmony_ci * or the free spans across blocks.  This tradeoff is to minimize iterating
95862306a36Sopenharmony_ci * over the block metadata to update chunk_md->contig_hint.
95962306a36Sopenharmony_ci * chunk_md->contig_hint may be off by up to a page, but it will never be more
96062306a36Sopenharmony_ci * than the available space.  If the contig hint is contained in one block, it
96162306a36Sopenharmony_ci * will be accurate.
96262306a36Sopenharmony_ci */
96362306a36Sopenharmony_cistatic void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
96462306a36Sopenharmony_ci					int bits)
96562306a36Sopenharmony_ci{
96662306a36Sopenharmony_ci	int nr_empty_pages = 0;
96762306a36Sopenharmony_ci	struct pcpu_block_md *s_block, *e_block, *block;
96862306a36Sopenharmony_ci	int s_index, e_index;	/* block indexes of the freed allocation */
96962306a36Sopenharmony_ci	int s_off, e_off;	/* block offsets of the freed allocation */
97062306a36Sopenharmony_ci	int start, end;		/* start and end of the whole free area */
97162306a36Sopenharmony_ci
97262306a36Sopenharmony_ci	/*
97362306a36Sopenharmony_ci	 * Calculate per block offsets.
97462306a36Sopenharmony_ci	 * The calculation uses an inclusive range, but the resulting offsets
97562306a36Sopenharmony_ci	 * are [start, end).  e_index always points to the last block in the
97662306a36Sopenharmony_ci	 * range.
97762306a36Sopenharmony_ci	 */
97862306a36Sopenharmony_ci	s_index = pcpu_off_to_block_index(bit_off);
97962306a36Sopenharmony_ci	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
98062306a36Sopenharmony_ci	s_off = pcpu_off_to_block_off(bit_off);
98162306a36Sopenharmony_ci	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
98262306a36Sopenharmony_ci
98362306a36Sopenharmony_ci	s_block = chunk->md_blocks + s_index;
98462306a36Sopenharmony_ci	e_block = chunk->md_blocks + e_index;
98562306a36Sopenharmony_ci
98662306a36Sopenharmony_ci	/*
98762306a36Sopenharmony_ci	 * Check if the freed area aligns with the block->contig_hint.
98862306a36Sopenharmony_ci	 * If it does, then the scan to find the beginning/end of the
98962306a36Sopenharmony_ci	 * larger free area can be avoided.
99062306a36Sopenharmony_ci	 *
99162306a36Sopenharmony_ci	 * start and end refer to beginning and end of the free area
99262306a36Sopenharmony_ci	 * within each their respective blocks.  This is not necessarily
99362306a36Sopenharmony_ci	 * the entire free area as it may span blocks past the beginning
99462306a36Sopenharmony_ci	 * or end of the block.
99562306a36Sopenharmony_ci	 */
99662306a36Sopenharmony_ci	start = s_off;
99762306a36Sopenharmony_ci	if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
99862306a36Sopenharmony_ci		start = s_block->contig_hint_start;
99962306a36Sopenharmony_ci	} else {
100062306a36Sopenharmony_ci		/*
100162306a36Sopenharmony_ci		 * Scan backwards to find the extent of the free area.
100262306a36Sopenharmony_ci		 * find_last_bit returns the starting bit, so if the start bit
100362306a36Sopenharmony_ci		 * is returned, that means there was no last bit and the
100462306a36Sopenharmony_ci		 * remainder of the chunk is free.
100562306a36Sopenharmony_ci		 */
100662306a36Sopenharmony_ci		int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
100762306a36Sopenharmony_ci					  start);
100862306a36Sopenharmony_ci		start = (start == l_bit) ? 0 : l_bit + 1;
100962306a36Sopenharmony_ci	}
101062306a36Sopenharmony_ci
101162306a36Sopenharmony_ci	end = e_off;
101262306a36Sopenharmony_ci	if (e_off == e_block->contig_hint_start)
101362306a36Sopenharmony_ci		end = e_block->contig_hint_start + e_block->contig_hint;
101462306a36Sopenharmony_ci	else
101562306a36Sopenharmony_ci		end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
101662306a36Sopenharmony_ci				    PCPU_BITMAP_BLOCK_BITS, end);
101762306a36Sopenharmony_ci
101862306a36Sopenharmony_ci	/* update s_block */
101962306a36Sopenharmony_ci	e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
102062306a36Sopenharmony_ci	if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
102162306a36Sopenharmony_ci		nr_empty_pages++;
102262306a36Sopenharmony_ci	pcpu_block_update(s_block, start, e_off);
102362306a36Sopenharmony_ci
102462306a36Sopenharmony_ci	/* freeing in the same block */
102562306a36Sopenharmony_ci	if (s_index != e_index) {
102662306a36Sopenharmony_ci		/* update e_block */
102762306a36Sopenharmony_ci		if (end == PCPU_BITMAP_BLOCK_BITS)
102862306a36Sopenharmony_ci			nr_empty_pages++;
102962306a36Sopenharmony_ci		pcpu_block_update(e_block, 0, end);
103062306a36Sopenharmony_ci
103162306a36Sopenharmony_ci		/* reset md_blocks in the middle */
103262306a36Sopenharmony_ci		nr_empty_pages += (e_index - s_index - 1);
103362306a36Sopenharmony_ci		for (block = s_block + 1; block < e_block; block++) {
103462306a36Sopenharmony_ci			block->first_free = 0;
103562306a36Sopenharmony_ci			block->scan_hint = 0;
103662306a36Sopenharmony_ci			block->contig_hint_start = 0;
103762306a36Sopenharmony_ci			block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
103862306a36Sopenharmony_ci			block->left_free = PCPU_BITMAP_BLOCK_BITS;
103962306a36Sopenharmony_ci			block->right_free = PCPU_BITMAP_BLOCK_BITS;
104062306a36Sopenharmony_ci		}
104162306a36Sopenharmony_ci	}
104262306a36Sopenharmony_ci
104362306a36Sopenharmony_ci	if (nr_empty_pages)
104462306a36Sopenharmony_ci		pcpu_update_empty_pages(chunk, nr_empty_pages);
104562306a36Sopenharmony_ci
104662306a36Sopenharmony_ci	/*
104762306a36Sopenharmony_ci	 * Refresh chunk metadata when the free makes a block free or spans
104862306a36Sopenharmony_ci	 * across blocks.  The contig_hint may be off by up to a page, but if
104962306a36Sopenharmony_ci	 * the contig_hint is contained in a block, it will be accurate with
105062306a36Sopenharmony_ci	 * the else condition below.
105162306a36Sopenharmony_ci	 */
105262306a36Sopenharmony_ci	if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
105362306a36Sopenharmony_ci		pcpu_chunk_refresh_hint(chunk, true);
105462306a36Sopenharmony_ci	else
105562306a36Sopenharmony_ci		pcpu_block_update(&chunk->chunk_md,
105662306a36Sopenharmony_ci				  pcpu_block_off_to_off(s_index, start),
105762306a36Sopenharmony_ci				  end);
105862306a36Sopenharmony_ci}
105962306a36Sopenharmony_ci
106062306a36Sopenharmony_ci/**
106162306a36Sopenharmony_ci * pcpu_is_populated - determines if the region is populated
106262306a36Sopenharmony_ci * @chunk: chunk of interest
106362306a36Sopenharmony_ci * @bit_off: chunk offset
106462306a36Sopenharmony_ci * @bits: size of area
106562306a36Sopenharmony_ci * @next_off: return value for the next offset to start searching
106662306a36Sopenharmony_ci *
106762306a36Sopenharmony_ci * For atomic allocations, check if the backing pages are populated.
106862306a36Sopenharmony_ci *
106962306a36Sopenharmony_ci * RETURNS:
107062306a36Sopenharmony_ci * Bool if the backing pages are populated.
107162306a36Sopenharmony_ci * next_index is to skip over unpopulated blocks in pcpu_find_block_fit.
107262306a36Sopenharmony_ci */
107362306a36Sopenharmony_cistatic bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
107462306a36Sopenharmony_ci			      int *next_off)
107562306a36Sopenharmony_ci{
107662306a36Sopenharmony_ci	unsigned int start, end;
107762306a36Sopenharmony_ci
107862306a36Sopenharmony_ci	start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
107962306a36Sopenharmony_ci	end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
108062306a36Sopenharmony_ci
108162306a36Sopenharmony_ci	start = find_next_zero_bit(chunk->populated, end, start);
108262306a36Sopenharmony_ci	if (start >= end)
108362306a36Sopenharmony_ci		return true;
108462306a36Sopenharmony_ci
108562306a36Sopenharmony_ci	end = find_next_bit(chunk->populated, end, start + 1);
108662306a36Sopenharmony_ci
108762306a36Sopenharmony_ci	*next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
108862306a36Sopenharmony_ci	return false;
108962306a36Sopenharmony_ci}
109062306a36Sopenharmony_ci
109162306a36Sopenharmony_ci/**
109262306a36Sopenharmony_ci * pcpu_find_block_fit - finds the block index to start searching
109362306a36Sopenharmony_ci * @chunk: chunk of interest
109462306a36Sopenharmony_ci * @alloc_bits: size of request in allocation units
109562306a36Sopenharmony_ci * @align: alignment of area (max PAGE_SIZE bytes)
109662306a36Sopenharmony_ci * @pop_only: use populated regions only
109762306a36Sopenharmony_ci *
109862306a36Sopenharmony_ci * Given a chunk and an allocation spec, find the offset to begin searching
109962306a36Sopenharmony_ci * for a free region.  This iterates over the bitmap metadata blocks to
110062306a36Sopenharmony_ci * find an offset that will be guaranteed to fit the requirements.  It is
110162306a36Sopenharmony_ci * not quite first fit as if the allocation does not fit in the contig hint
110262306a36Sopenharmony_ci * of a block or chunk, it is skipped.  This errs on the side of caution
110362306a36Sopenharmony_ci * to prevent excess iteration.  Poor alignment can cause the allocator to
110462306a36Sopenharmony_ci * skip over blocks and chunks that have valid free areas.
110562306a36Sopenharmony_ci *
110662306a36Sopenharmony_ci * RETURNS:
110762306a36Sopenharmony_ci * The offset in the bitmap to begin searching.
110862306a36Sopenharmony_ci * -1 if no offset is found.
110962306a36Sopenharmony_ci */
111062306a36Sopenharmony_cistatic int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
111162306a36Sopenharmony_ci			       size_t align, bool pop_only)
111262306a36Sopenharmony_ci{
111362306a36Sopenharmony_ci	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
111462306a36Sopenharmony_ci	int bit_off, bits, next_off;
111562306a36Sopenharmony_ci
111662306a36Sopenharmony_ci	/*
111762306a36Sopenharmony_ci	 * This is an optimization to prevent scanning by assuming if the
111862306a36Sopenharmony_ci	 * allocation cannot fit in the global hint, there is memory pressure
111962306a36Sopenharmony_ci	 * and creating a new chunk would happen soon.
112062306a36Sopenharmony_ci	 */
112162306a36Sopenharmony_ci	if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
112262306a36Sopenharmony_ci		return -1;
112362306a36Sopenharmony_ci
112462306a36Sopenharmony_ci	bit_off = pcpu_next_hint(chunk_md, alloc_bits);
112562306a36Sopenharmony_ci	bits = 0;
112662306a36Sopenharmony_ci	pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
112762306a36Sopenharmony_ci		if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
112862306a36Sopenharmony_ci						   &next_off))
112962306a36Sopenharmony_ci			break;
113062306a36Sopenharmony_ci
113162306a36Sopenharmony_ci		bit_off = next_off;
113262306a36Sopenharmony_ci		bits = 0;
113362306a36Sopenharmony_ci	}
113462306a36Sopenharmony_ci
113562306a36Sopenharmony_ci	if (bit_off == pcpu_chunk_map_bits(chunk))
113662306a36Sopenharmony_ci		return -1;
113762306a36Sopenharmony_ci
113862306a36Sopenharmony_ci	return bit_off;
113962306a36Sopenharmony_ci}
114062306a36Sopenharmony_ci
/*
 * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
 * @map: the address to base the search on
 * @size: the bitmap size in bits
 * @start: the bitnumber to start searching at
 * @nr: the number of zeroed bits we're looking for
 * @align_mask: alignment mask for zero area
 * @largest_off: offset of the largest area skipped
 * @largest_bits: size of the largest area skipped
 *
 * The @align_mask should be one less than a power of 2.
 *
 * Behaves like bitmap_find_next_zero_area_off() but additionally records,
 * through @largest_off and @largest_bits, the largest free area that was
 * examined and rejected for being too small.  On equal size the candidate
 * with the better aligned offset is kept.  This is imperfect, but in
 * general is good enough: areas passed over purely because of alignment
 * are not recorded.  pcpu_block_update_scan() does scan backwards to try
 * and recover what was lost to alignment.  While this can cause scanning
 * to miss earlier possible free areas, smaller allocations will eventually
 * fill those holes.
 *
 * Returns the aligned start of a free area of @nr bits, or a value greater
 * than @size if no such area exists.
 */
static unsigned long pcpu_find_zero_area(unsigned long *map,
					 unsigned long size,
					 unsigned long start,
					 unsigned long nr,
					 unsigned long align_mask,
					 unsigned long *largest_off,
					 unsigned long *largest_bits)
{
	unsigned long candidate, limit, busy, skipped_bits;

	for (;;) {
		/* next clear bit at or after @start, rounded up to alignment */
		candidate = find_next_zero_bit(map, size, start);
		candidate = __ALIGN_MASK(candidate, align_mask);

		/* the area would run past the end of the map: no fit */
		limit = candidate + nr;
		if (limit > size)
			return limit;

		/* no set bit inside [candidate, limit): the area is free */
		busy = find_next_bit(map, limit, candidate);
		if (busy >= limit)
			return candidate;

		/* remember largest unused area with best alignment */
		skipped_bits = busy - candidate;
		if (skipped_bits > *largest_bits ||
		    (skipped_bits == *largest_bits && *largest_off &&
		     (!candidate ||
		      __ffs(candidate) > __ffs(*largest_off)))) {
			*largest_off = candidate;
			*largest_bits = skipped_bits;
		}

		/* retry just past the bit that blocked this candidate */
		start = busy + 1;
	}
}
119662306a36Sopenharmony_ci
119762306a36Sopenharmony_ci/**
119862306a36Sopenharmony_ci * pcpu_alloc_area - allocates an area from a pcpu_chunk
119962306a36Sopenharmony_ci * @chunk: chunk of interest
120062306a36Sopenharmony_ci * @alloc_bits: size of request in allocation units
120162306a36Sopenharmony_ci * @align: alignment of area (max PAGE_SIZE)
120262306a36Sopenharmony_ci * @start: bit_off to start searching
120362306a36Sopenharmony_ci *
120462306a36Sopenharmony_ci * This function takes in a @start offset to begin searching to fit an
120562306a36Sopenharmony_ci * allocation of @alloc_bits with alignment @align.  It needs to scan
120662306a36Sopenharmony_ci * the allocation map because if it fits within the block's contig hint,
120762306a36Sopenharmony_ci * @start will be block->first_free. This is an attempt to fill the
120862306a36Sopenharmony_ci * allocation prior to breaking the contig hint.  The allocation and
120962306a36Sopenharmony_ci * boundary maps are updated accordingly if it confirms a valid
121062306a36Sopenharmony_ci * free area.
121162306a36Sopenharmony_ci *
121262306a36Sopenharmony_ci * RETURNS:
121362306a36Sopenharmony_ci * Allocated addr offset in @chunk on success.
121462306a36Sopenharmony_ci * -1 if no matching area is found.
121562306a36Sopenharmony_ci */
121662306a36Sopenharmony_cistatic int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
121762306a36Sopenharmony_ci			   size_t align, int start)
121862306a36Sopenharmony_ci{
121962306a36Sopenharmony_ci	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
122062306a36Sopenharmony_ci	size_t align_mask = (align) ? (align - 1) : 0;
122162306a36Sopenharmony_ci	unsigned long area_off = 0, area_bits = 0;
122262306a36Sopenharmony_ci	int bit_off, end, oslot;
122362306a36Sopenharmony_ci
122462306a36Sopenharmony_ci	lockdep_assert_held(&pcpu_lock);
122562306a36Sopenharmony_ci
122662306a36Sopenharmony_ci	oslot = pcpu_chunk_slot(chunk);
122762306a36Sopenharmony_ci
122862306a36Sopenharmony_ci	/*
122962306a36Sopenharmony_ci	 * Search to find a fit.
123062306a36Sopenharmony_ci	 */
123162306a36Sopenharmony_ci	end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
123262306a36Sopenharmony_ci		    pcpu_chunk_map_bits(chunk));
123362306a36Sopenharmony_ci	bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
123462306a36Sopenharmony_ci				      align_mask, &area_off, &area_bits);
123562306a36Sopenharmony_ci	if (bit_off >= end)
123662306a36Sopenharmony_ci		return -1;
123762306a36Sopenharmony_ci
123862306a36Sopenharmony_ci	if (area_bits)
123962306a36Sopenharmony_ci		pcpu_block_update_scan(chunk, area_off, area_bits);
124062306a36Sopenharmony_ci
124162306a36Sopenharmony_ci	/* update alloc map */
124262306a36Sopenharmony_ci	bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
124362306a36Sopenharmony_ci
124462306a36Sopenharmony_ci	/* update boundary map */
124562306a36Sopenharmony_ci	set_bit(bit_off, chunk->bound_map);
124662306a36Sopenharmony_ci	bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
124762306a36Sopenharmony_ci	set_bit(bit_off + alloc_bits, chunk->bound_map);
124862306a36Sopenharmony_ci
124962306a36Sopenharmony_ci	chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
125062306a36Sopenharmony_ci
125162306a36Sopenharmony_ci	/* update first free bit */
125262306a36Sopenharmony_ci	if (bit_off == chunk_md->first_free)
125362306a36Sopenharmony_ci		chunk_md->first_free = find_next_zero_bit(
125462306a36Sopenharmony_ci					chunk->alloc_map,
125562306a36Sopenharmony_ci					pcpu_chunk_map_bits(chunk),
125662306a36Sopenharmony_ci					bit_off + alloc_bits);
125762306a36Sopenharmony_ci
125862306a36Sopenharmony_ci	pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
125962306a36Sopenharmony_ci
126062306a36Sopenharmony_ci	pcpu_chunk_relocate(chunk, oslot);
126162306a36Sopenharmony_ci
126262306a36Sopenharmony_ci	return bit_off * PCPU_MIN_ALLOC_SIZE;
126362306a36Sopenharmony_ci}
126462306a36Sopenharmony_ci
126562306a36Sopenharmony_ci/**
126662306a36Sopenharmony_ci * pcpu_free_area - frees the corresponding offset
126762306a36Sopenharmony_ci * @chunk: chunk of interest
126862306a36Sopenharmony_ci * @off: addr offset into chunk
126962306a36Sopenharmony_ci *
127062306a36Sopenharmony_ci * This function determines the size of an allocation to free using
127162306a36Sopenharmony_ci * the boundary bitmap and clears the allocation map.
127262306a36Sopenharmony_ci *
127362306a36Sopenharmony_ci * RETURNS:
127462306a36Sopenharmony_ci * Number of freed bytes.
127562306a36Sopenharmony_ci */
127662306a36Sopenharmony_cistatic int pcpu_free_area(struct pcpu_chunk *chunk, int off)
127762306a36Sopenharmony_ci{
127862306a36Sopenharmony_ci	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
127962306a36Sopenharmony_ci	int bit_off, bits, end, oslot, freed;
128062306a36Sopenharmony_ci
128162306a36Sopenharmony_ci	lockdep_assert_held(&pcpu_lock);
128262306a36Sopenharmony_ci	pcpu_stats_area_dealloc(chunk);
128362306a36Sopenharmony_ci
128462306a36Sopenharmony_ci	oslot = pcpu_chunk_slot(chunk);
128562306a36Sopenharmony_ci
128662306a36Sopenharmony_ci	bit_off = off / PCPU_MIN_ALLOC_SIZE;
128762306a36Sopenharmony_ci
128862306a36Sopenharmony_ci	/* find end index */
128962306a36Sopenharmony_ci	end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
129062306a36Sopenharmony_ci			    bit_off + 1);
129162306a36Sopenharmony_ci	bits = end - bit_off;
129262306a36Sopenharmony_ci	bitmap_clear(chunk->alloc_map, bit_off, bits);
129362306a36Sopenharmony_ci
129462306a36Sopenharmony_ci	freed = bits * PCPU_MIN_ALLOC_SIZE;
129562306a36Sopenharmony_ci
129662306a36Sopenharmony_ci	/* update metadata */
129762306a36Sopenharmony_ci	chunk->free_bytes += freed;
129862306a36Sopenharmony_ci
129962306a36Sopenharmony_ci	/* update first free bit */
130062306a36Sopenharmony_ci	chunk_md->first_free = min(chunk_md->first_free, bit_off);
130162306a36Sopenharmony_ci
130262306a36Sopenharmony_ci	pcpu_block_update_hint_free(chunk, bit_off, bits);
130362306a36Sopenharmony_ci
130462306a36Sopenharmony_ci	pcpu_chunk_relocate(chunk, oslot);
130562306a36Sopenharmony_ci
130662306a36Sopenharmony_ci	return freed;
130762306a36Sopenharmony_ci}
130862306a36Sopenharmony_ci
130962306a36Sopenharmony_cistatic void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
131062306a36Sopenharmony_ci{
131162306a36Sopenharmony_ci	block->scan_hint = 0;
131262306a36Sopenharmony_ci	block->contig_hint = nr_bits;
131362306a36Sopenharmony_ci	block->left_free = nr_bits;
131462306a36Sopenharmony_ci	block->right_free = nr_bits;
131562306a36Sopenharmony_ci	block->first_free = 0;
131662306a36Sopenharmony_ci	block->nr_bits = nr_bits;
131762306a36Sopenharmony_ci}
131862306a36Sopenharmony_ci
131962306a36Sopenharmony_cistatic void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
132062306a36Sopenharmony_ci{
132162306a36Sopenharmony_ci	struct pcpu_block_md *md_block;
132262306a36Sopenharmony_ci
132362306a36Sopenharmony_ci	/* init the chunk's block */
132462306a36Sopenharmony_ci	pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));
132562306a36Sopenharmony_ci
132662306a36Sopenharmony_ci	for (md_block = chunk->md_blocks;
132762306a36Sopenharmony_ci	     md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
132862306a36Sopenharmony_ci	     md_block++)
132962306a36Sopenharmony_ci		pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
133062306a36Sopenharmony_ci}
133162306a36Sopenharmony_ci
/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.  The
 * base_addr is page aligned down of @tmp_addr while the region end is page
 * aligned up.  Offsets are kept track of to determine the region served. All
 * this is done to appease the bitmap allocator in avoiding partial blocks.
 *
 * The chunk and its bitmaps are obtained with memblock_alloc(); the function
 * panics if any of those allocations fails.  The returned chunk is marked
 * immutable with all of its pages recorded as populated, and the padding
 * before and after the served region is marked as allocated so that it is
 * never handed out.
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */
static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
							 int map_size)
{
	struct pcpu_chunk *chunk;
	unsigned long aligned_addr;
	int start_offset, offset_bits, region_size, region_bits;
	size_t alloc_size;

	/* region calculations */
	/* base of the chunk: @tmp_addr rounded down to a page boundary */
	aligned_addr = tmp_addr & PAGE_MASK;

	/* bytes of padding in front of the served region */
	start_offset = tmp_addr - aligned_addr;
	/* padding + served region, rounded up to whole pages */
	region_size = ALIGN(start_offset + map_size, PAGE_SIZE);

	/* allocate chunk */
	/* the trailing populated[] bitmap holds one bit per page */
	alloc_size = struct_size(chunk, populated,
				 BITS_TO_LONGS(region_size >> PAGE_SHIFT));
	chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
	if (!chunk)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      alloc_size);

	INIT_LIST_HEAD(&chunk->list);

	chunk->base_addr = (void *)aligned_addr;
	chunk->start_offset = start_offset;
	/* bytes of padding behind the served region */
	chunk->end_offset = region_size - chunk->start_offset - map_size;

	/* nr_pages is set before pcpu_chunk_map_bits() is asked for the map size */
	chunk->nr_pages = region_size >> PAGE_SHIFT;
	region_bits = pcpu_chunk_map_bits(chunk);

	/* allocation bitmap: region_bits bits */
	alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
	chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
	if (!chunk->alloc_map)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      alloc_size);

	/*
	 * boundary bitmap: one bit more than the allocation bitmap, because
	 * bit index region_bits itself is set further down
	 */
	alloc_size =
		BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
	chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
	if (!chunk->bound_map)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      alloc_size);

	/* per-block metadata array */
	alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
	chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
	if (!chunk->md_blocks)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      alloc_size);

#ifdef CONFIG_MEMCG_KMEM
	/* first chunk is free to use */
	chunk->obj_cgroups = NULL;
#endif
	/* metadata starts out fully free; the padding is carved out below */
	pcpu_init_md_blocks(chunk);

	/* manage populated page bitmap */
	chunk->immutable = true;
	/* every page of the region is recorded as populated */
	bitmap_fill(chunk->populated, chunk->nr_pages);
	chunk->nr_populated = chunk->nr_pages;
	chunk->nr_empty_pop_pages = chunk->nr_pages;

	/* only the served region counts as free, not the padding */
	chunk->free_bytes = map_size;

	if (chunk->start_offset) {
		/* hide the beginning of the bitmap */
		offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
		bitmap_set(chunk->alloc_map, 0, offset_bits);
		set_bit(0, chunk->bound_map);
		set_bit(offset_bits, chunk->bound_map);

		/* first free bit is right after the leading padding */
		chunk->chunk_md.first_free = offset_bits;

		pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
	}

	if (chunk->end_offset) {
		/* hide the end of the bitmap */
		offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
		bitmap_set(chunk->alloc_map,
			   pcpu_chunk_map_bits(chunk) - offset_bits,
			   offset_bits);
		/* boundaries at the end of the served region and of the map */
		set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
			chunk->bound_map);
		set_bit(region_bits, chunk->bound_map);

		pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
					     - offset_bits, offset_bits);
	}

	return chunk;
}
143762306a36Sopenharmony_ci
143862306a36Sopenharmony_cistatic struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
143962306a36Sopenharmony_ci{
144062306a36Sopenharmony_ci	struct pcpu_chunk *chunk;
144162306a36Sopenharmony_ci	int region_bits;
144262306a36Sopenharmony_ci
144362306a36Sopenharmony_ci	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
144462306a36Sopenharmony_ci	if (!chunk)
144562306a36Sopenharmony_ci		return NULL;
144662306a36Sopenharmony_ci
144762306a36Sopenharmony_ci	INIT_LIST_HEAD(&chunk->list);
144862306a36Sopenharmony_ci	chunk->nr_pages = pcpu_unit_pages;
144962306a36Sopenharmony_ci	region_bits = pcpu_chunk_map_bits(chunk);
145062306a36Sopenharmony_ci
145162306a36Sopenharmony_ci	chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
145262306a36Sopenharmony_ci					   sizeof(chunk->alloc_map[0]), gfp);
145362306a36Sopenharmony_ci	if (!chunk->alloc_map)
145462306a36Sopenharmony_ci		goto alloc_map_fail;
145562306a36Sopenharmony_ci
145662306a36Sopenharmony_ci	chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
145762306a36Sopenharmony_ci					   sizeof(chunk->bound_map[0]), gfp);
145862306a36Sopenharmony_ci	if (!chunk->bound_map)
145962306a36Sopenharmony_ci		goto bound_map_fail;
146062306a36Sopenharmony_ci
146162306a36Sopenharmony_ci	chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
146262306a36Sopenharmony_ci					   sizeof(chunk->md_blocks[0]), gfp);
146362306a36Sopenharmony_ci	if (!chunk->md_blocks)
146462306a36Sopenharmony_ci		goto md_blocks_fail;
146562306a36Sopenharmony_ci
146662306a36Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
146762306a36Sopenharmony_ci	if (!mem_cgroup_kmem_disabled()) {
146862306a36Sopenharmony_ci		chunk->obj_cgroups =
146962306a36Sopenharmony_ci			pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
147062306a36Sopenharmony_ci					sizeof(struct obj_cgroup *), gfp);
147162306a36Sopenharmony_ci		if (!chunk->obj_cgroups)
147262306a36Sopenharmony_ci			goto objcg_fail;
147362306a36Sopenharmony_ci	}
147462306a36Sopenharmony_ci#endif
147562306a36Sopenharmony_ci
147662306a36Sopenharmony_ci	pcpu_init_md_blocks(chunk);
147762306a36Sopenharmony_ci
147862306a36Sopenharmony_ci	/* init metadata */
147962306a36Sopenharmony_ci	chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
148062306a36Sopenharmony_ci
148162306a36Sopenharmony_ci	return chunk;
148262306a36Sopenharmony_ci
148362306a36Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
148462306a36Sopenharmony_ciobjcg_fail:
148562306a36Sopenharmony_ci	pcpu_mem_free(chunk->md_blocks);
148662306a36Sopenharmony_ci#endif
148762306a36Sopenharmony_cimd_blocks_fail:
148862306a36Sopenharmony_ci	pcpu_mem_free(chunk->bound_map);
148962306a36Sopenharmony_cibound_map_fail:
149062306a36Sopenharmony_ci	pcpu_mem_free(chunk->alloc_map);
149162306a36Sopenharmony_cialloc_map_fail:
149262306a36Sopenharmony_ci	pcpu_mem_free(chunk);
149362306a36Sopenharmony_ci
149462306a36Sopenharmony_ci	return NULL;
149562306a36Sopenharmony_ci}
149662306a36Sopenharmony_ci
149762306a36Sopenharmony_cistatic void pcpu_free_chunk(struct pcpu_chunk *chunk)
149862306a36Sopenharmony_ci{
149962306a36Sopenharmony_ci	if (!chunk)
150062306a36Sopenharmony_ci		return;
150162306a36Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
150262306a36Sopenharmony_ci	pcpu_mem_free(chunk->obj_cgroups);
150362306a36Sopenharmony_ci#endif
150462306a36Sopenharmony_ci	pcpu_mem_free(chunk->md_blocks);
150562306a36Sopenharmony_ci	pcpu_mem_free(chunk->bound_map);
150662306a36Sopenharmony_ci	pcpu_mem_free(chunk->alloc_map);
150762306a36Sopenharmony_ci	pcpu_mem_free(chunk);
150862306a36Sopenharmony_ci}
150962306a36Sopenharmony_ci
151062306a36Sopenharmony_ci/**
151162306a36Sopenharmony_ci * pcpu_chunk_populated - post-population bookkeeping
151262306a36Sopenharmony_ci * @chunk: pcpu_chunk which got populated
151362306a36Sopenharmony_ci * @page_start: the start page
151462306a36Sopenharmony_ci * @page_end: the end page
151562306a36Sopenharmony_ci *
151662306a36Sopenharmony_ci * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
151762306a36Sopenharmony_ci * the bookkeeping information accordingly.  Must be called after each
151862306a36Sopenharmony_ci * successful population.
151962306a36Sopenharmony_ci */
152062306a36Sopenharmony_cistatic void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
152162306a36Sopenharmony_ci				 int page_end)
152262306a36Sopenharmony_ci{
152362306a36Sopenharmony_ci	int nr = page_end - page_start;
152462306a36Sopenharmony_ci
152562306a36Sopenharmony_ci	lockdep_assert_held(&pcpu_lock);
152662306a36Sopenharmony_ci
152762306a36Sopenharmony_ci	bitmap_set(chunk->populated, page_start, nr);
152862306a36Sopenharmony_ci	chunk->nr_populated += nr;
152962306a36Sopenharmony_ci	pcpu_nr_populated += nr;
153062306a36Sopenharmony_ci
153162306a36Sopenharmony_ci	pcpu_update_empty_pages(chunk, nr);
153262306a36Sopenharmony_ci}
153362306a36Sopenharmony_ci
153462306a36Sopenharmony_ci/**
153562306a36Sopenharmony_ci * pcpu_chunk_depopulated - post-depopulation bookkeeping
153662306a36Sopenharmony_ci * @chunk: pcpu_chunk which got depopulated
153762306a36Sopenharmony_ci * @page_start: the start page
153862306a36Sopenharmony_ci * @page_end: the end page
153962306a36Sopenharmony_ci *
154062306a36Sopenharmony_ci * Pages in [@page_start,@page_end) have been depopulated from @chunk.
154162306a36Sopenharmony_ci * Update the bookkeeping information accordingly.  Must be called after
154262306a36Sopenharmony_ci * each successful depopulation.
154362306a36Sopenharmony_ci */
154462306a36Sopenharmony_cistatic void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
154562306a36Sopenharmony_ci				   int page_start, int page_end)
154662306a36Sopenharmony_ci{
154762306a36Sopenharmony_ci	int nr = page_end - page_start;
154862306a36Sopenharmony_ci
154962306a36Sopenharmony_ci	lockdep_assert_held(&pcpu_lock);
155062306a36Sopenharmony_ci
155162306a36Sopenharmony_ci	bitmap_clear(chunk->populated, page_start, nr);
155262306a36Sopenharmony_ci	chunk->nr_populated -= nr;
155362306a36Sopenharmony_ci	pcpu_nr_populated -= nr;
155462306a36Sopenharmony_ci
155562306a36Sopenharmony_ci	pcpu_update_empty_pages(chunk, -nr);
155662306a36Sopenharmony_ci}
155762306a36Sopenharmony_ci
/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk		- populate the specified range of a chunk
 * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
 * pcpu_post_unmap_tlb_flush	- flush tlb for the specified range of a chunk
 * pcpu_create_chunk		- create a new chunk
 * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page		- translate address to its struct page
 * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
			       int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
				  int page_start, int page_end);
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
				      int page_start, int page_end);
static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);

/* the implementation file is selected by CONFIG_NEED_PER_CPU_KM */
#ifdef CONFIG_NEED_PER_CPU_KM
#include "percpu-km.c"
#else
#include "percpu-vm.c"
#endif
159062306a36Sopenharmony_ci
159162306a36Sopenharmony_ci/**
159262306a36Sopenharmony_ci * pcpu_chunk_addr_search - determine chunk containing specified address
159362306a36Sopenharmony_ci * @addr: address for which the chunk needs to be determined.
159462306a36Sopenharmony_ci *
159562306a36Sopenharmony_ci * This is an internal function that handles all but static allocations.
159662306a36Sopenharmony_ci * Static percpu address values should never be passed into the allocator.
159762306a36Sopenharmony_ci *
159862306a36Sopenharmony_ci * RETURNS:
159962306a36Sopenharmony_ci * The address of the found chunk.
160062306a36Sopenharmony_ci */
160162306a36Sopenharmony_cistatic struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
160262306a36Sopenharmony_ci{
160362306a36Sopenharmony_ci	/* is it in the dynamic region (first chunk)? */
160462306a36Sopenharmony_ci	if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
160562306a36Sopenharmony_ci		return pcpu_first_chunk;
160662306a36Sopenharmony_ci
160762306a36Sopenharmony_ci	/* is it in the reserved region? */
160862306a36Sopenharmony_ci	if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
160962306a36Sopenharmony_ci		return pcpu_reserved_chunk;
161062306a36Sopenharmony_ci
161162306a36Sopenharmony_ci	/*
161262306a36Sopenharmony_ci	 * The address is relative to unit0 which might be unused and
161362306a36Sopenharmony_ci	 * thus unmapped.  Offset the address to the unit space of the
161462306a36Sopenharmony_ci	 * current processor before looking it up in the vmalloc
161562306a36Sopenharmony_ci	 * space.  Note that any possible cpu id can be used here, so
161662306a36Sopenharmony_ci	 * there's no need to worry about preemption or cpu hotplug.
161762306a36Sopenharmony_ci	 */
161862306a36Sopenharmony_ci	addr += pcpu_unit_offsets[raw_smp_processor_id()];
161962306a36Sopenharmony_ci	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
162062306a36Sopenharmony_ci}
162162306a36Sopenharmony_ci
162262306a36Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
162362306a36Sopenharmony_cistatic bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
162462306a36Sopenharmony_ci				      struct obj_cgroup **objcgp)
162562306a36Sopenharmony_ci{
162662306a36Sopenharmony_ci	struct obj_cgroup *objcg;
162762306a36Sopenharmony_ci
162862306a36Sopenharmony_ci	if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT))
162962306a36Sopenharmony_ci		return true;
163062306a36Sopenharmony_ci
163162306a36Sopenharmony_ci	objcg = get_obj_cgroup_from_current();
163262306a36Sopenharmony_ci	if (!objcg)
163362306a36Sopenharmony_ci		return true;
163462306a36Sopenharmony_ci
163562306a36Sopenharmony_ci	if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) {
163662306a36Sopenharmony_ci		obj_cgroup_put(objcg);
163762306a36Sopenharmony_ci		return false;
163862306a36Sopenharmony_ci	}
163962306a36Sopenharmony_ci
164062306a36Sopenharmony_ci	*objcgp = objcg;
164162306a36Sopenharmony_ci	return true;
164262306a36Sopenharmony_ci}
164362306a36Sopenharmony_ci
164462306a36Sopenharmony_cistatic void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
164562306a36Sopenharmony_ci				       struct pcpu_chunk *chunk, int off,
164662306a36Sopenharmony_ci				       size_t size)
164762306a36Sopenharmony_ci{
164862306a36Sopenharmony_ci	if (!objcg)
164962306a36Sopenharmony_ci		return;
165062306a36Sopenharmony_ci
165162306a36Sopenharmony_ci	if (likely(chunk && chunk->obj_cgroups)) {
165262306a36Sopenharmony_ci		chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
165362306a36Sopenharmony_ci
165462306a36Sopenharmony_ci		rcu_read_lock();
165562306a36Sopenharmony_ci		mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
165662306a36Sopenharmony_ci				pcpu_obj_full_size(size));
165762306a36Sopenharmony_ci		rcu_read_unlock();
165862306a36Sopenharmony_ci	} else {
165962306a36Sopenharmony_ci		obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
166062306a36Sopenharmony_ci		obj_cgroup_put(objcg);
166162306a36Sopenharmony_ci	}
166262306a36Sopenharmony_ci}
166362306a36Sopenharmony_ci
166462306a36Sopenharmony_cistatic void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
166562306a36Sopenharmony_ci{
166662306a36Sopenharmony_ci	struct obj_cgroup *objcg;
166762306a36Sopenharmony_ci
166862306a36Sopenharmony_ci	if (unlikely(!chunk->obj_cgroups))
166962306a36Sopenharmony_ci		return;
167062306a36Sopenharmony_ci
167162306a36Sopenharmony_ci	objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
167262306a36Sopenharmony_ci	if (!objcg)
167362306a36Sopenharmony_ci		return;
167462306a36Sopenharmony_ci	chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
167562306a36Sopenharmony_ci
167662306a36Sopenharmony_ci	obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
167762306a36Sopenharmony_ci
167862306a36Sopenharmony_ci	rcu_read_lock();
167962306a36Sopenharmony_ci	mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
168062306a36Sopenharmony_ci			-pcpu_obj_full_size(size));
168162306a36Sopenharmony_ci	rcu_read_unlock();
168262306a36Sopenharmony_ci
168362306a36Sopenharmony_ci	obj_cgroup_put(objcg);
168462306a36Sopenharmony_ci}
168562306a36Sopenharmony_ci
168662306a36Sopenharmony_ci#else /* CONFIG_MEMCG_KMEM */
168762306a36Sopenharmony_cistatic bool
168862306a36Sopenharmony_cipcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
168962306a36Sopenharmony_ci{
169062306a36Sopenharmony_ci	return true;
169162306a36Sopenharmony_ci}
169262306a36Sopenharmony_ci
/* !CONFIG_MEMCG_KMEM: no accounting to finish or roll back. */
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
				       struct pcpu_chunk *chunk,
				       int off, size_t size)
{
}
169862306a36Sopenharmony_ci
/* !CONFIG_MEMCG_KMEM: no accounting to undo on free. */
static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off,
				 size_t size)
{
}
170262306a36Sopenharmony_ci#endif /* CONFIG_MEMCG_KMEM */
170362306a36Sopenharmony_ci
170462306a36Sopenharmony_ci/**
170562306a36Sopenharmony_ci * pcpu_alloc - the percpu allocator
170662306a36Sopenharmony_ci * @size: size of area to allocate in bytes
170762306a36Sopenharmony_ci * @align: alignment of area (max PAGE_SIZE)
170862306a36Sopenharmony_ci * @reserved: allocate from the reserved chunk if available
170962306a36Sopenharmony_ci * @gfp: allocation flags
171062306a36Sopenharmony_ci *
171162306a36Sopenharmony_ci * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
171262306a36Sopenharmony_ci * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
171362306a36Sopenharmony_ci * then no warning will be triggered on invalid or failed allocation
171462306a36Sopenharmony_ci * requests.
171562306a36Sopenharmony_ci *
171662306a36Sopenharmony_ci * RETURNS:
171762306a36Sopenharmony_ci * Percpu pointer to the allocated area on success, NULL on failure.
171862306a36Sopenharmony_ci */
171962306a36Sopenharmony_cistatic void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
172062306a36Sopenharmony_ci				 gfp_t gfp)
172162306a36Sopenharmony_ci{
172262306a36Sopenharmony_ci	gfp_t pcpu_gfp;
172362306a36Sopenharmony_ci	bool is_atomic;
172462306a36Sopenharmony_ci	bool do_warn;
172562306a36Sopenharmony_ci	struct obj_cgroup *objcg = NULL;
172662306a36Sopenharmony_ci	static int warn_limit = 10;
172762306a36Sopenharmony_ci	struct pcpu_chunk *chunk, *next;
172862306a36Sopenharmony_ci	const char *err;
172962306a36Sopenharmony_ci	int slot, off, cpu, ret;
173062306a36Sopenharmony_ci	unsigned long flags;
173162306a36Sopenharmony_ci	void __percpu *ptr;
173262306a36Sopenharmony_ci	size_t bits, bit_align;
173362306a36Sopenharmony_ci
173462306a36Sopenharmony_ci	gfp = current_gfp_context(gfp);
173562306a36Sopenharmony_ci	/* whitelisted flags that can be passed to the backing allocators */
173662306a36Sopenharmony_ci	pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
173762306a36Sopenharmony_ci	is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
173862306a36Sopenharmony_ci	do_warn = !(gfp & __GFP_NOWARN);
173962306a36Sopenharmony_ci
174062306a36Sopenharmony_ci	/*
174162306a36Sopenharmony_ci	 * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
174262306a36Sopenharmony_ci	 * therefore alignment must be a minimum of that many bytes.
174362306a36Sopenharmony_ci	 * An allocation may have internal fragmentation from rounding up
174462306a36Sopenharmony_ci	 * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
174562306a36Sopenharmony_ci	 */
174662306a36Sopenharmony_ci	if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
174762306a36Sopenharmony_ci		align = PCPU_MIN_ALLOC_SIZE;
174862306a36Sopenharmony_ci
174962306a36Sopenharmony_ci	size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
175062306a36Sopenharmony_ci	bits = size >> PCPU_MIN_ALLOC_SHIFT;
175162306a36Sopenharmony_ci	bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
175262306a36Sopenharmony_ci
175362306a36Sopenharmony_ci	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
175462306a36Sopenharmony_ci		     !is_power_of_2(align))) {
175562306a36Sopenharmony_ci		WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
175662306a36Sopenharmony_ci		     size, align);
175762306a36Sopenharmony_ci		return NULL;
175862306a36Sopenharmony_ci	}
175962306a36Sopenharmony_ci
176062306a36Sopenharmony_ci	if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg)))
176162306a36Sopenharmony_ci		return NULL;
176262306a36Sopenharmony_ci
176362306a36Sopenharmony_ci	if (!is_atomic) {
176462306a36Sopenharmony_ci		/*
176562306a36Sopenharmony_ci		 * pcpu_balance_workfn() allocates memory under this mutex,
176662306a36Sopenharmony_ci		 * and it may wait for memory reclaim. Allow current task
176762306a36Sopenharmony_ci		 * to become OOM victim, in case of memory pressure.
176862306a36Sopenharmony_ci		 */
176962306a36Sopenharmony_ci		if (gfp & __GFP_NOFAIL) {
177062306a36Sopenharmony_ci			mutex_lock(&pcpu_alloc_mutex);
177162306a36Sopenharmony_ci		} else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
177262306a36Sopenharmony_ci			pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
177362306a36Sopenharmony_ci			return NULL;
177462306a36Sopenharmony_ci		}
177562306a36Sopenharmony_ci	}
177662306a36Sopenharmony_ci
177762306a36Sopenharmony_ci	spin_lock_irqsave(&pcpu_lock, flags);
177862306a36Sopenharmony_ci
177962306a36Sopenharmony_ci	/* serve reserved allocations from the reserved chunk if available */
178062306a36Sopenharmony_ci	if (reserved && pcpu_reserved_chunk) {
178162306a36Sopenharmony_ci		chunk = pcpu_reserved_chunk;
178262306a36Sopenharmony_ci
178362306a36Sopenharmony_ci		off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
178462306a36Sopenharmony_ci		if (off < 0) {
178562306a36Sopenharmony_ci			err = "alloc from reserved chunk failed";
178662306a36Sopenharmony_ci			goto fail_unlock;
178762306a36Sopenharmony_ci		}
178862306a36Sopenharmony_ci
178962306a36Sopenharmony_ci		off = pcpu_alloc_area(chunk, bits, bit_align, off);
179062306a36Sopenharmony_ci		if (off >= 0)
179162306a36Sopenharmony_ci			goto area_found;
179262306a36Sopenharmony_ci
179362306a36Sopenharmony_ci		err = "alloc from reserved chunk failed";
179462306a36Sopenharmony_ci		goto fail_unlock;
179562306a36Sopenharmony_ci	}
179662306a36Sopenharmony_ci
179762306a36Sopenharmony_cirestart:
179862306a36Sopenharmony_ci	/* search through normal chunks */
179962306a36Sopenharmony_ci	for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
180062306a36Sopenharmony_ci		list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
180162306a36Sopenharmony_ci					 list) {
180262306a36Sopenharmony_ci			off = pcpu_find_block_fit(chunk, bits, bit_align,
180362306a36Sopenharmony_ci						  is_atomic);
180462306a36Sopenharmony_ci			if (off < 0) {
180562306a36Sopenharmony_ci				if (slot < PCPU_SLOT_FAIL_THRESHOLD)
180662306a36Sopenharmony_ci					pcpu_chunk_move(chunk, 0);
180762306a36Sopenharmony_ci				continue;
180862306a36Sopenharmony_ci			}
180962306a36Sopenharmony_ci
181062306a36Sopenharmony_ci			off = pcpu_alloc_area(chunk, bits, bit_align, off);
181162306a36Sopenharmony_ci			if (off >= 0) {
181262306a36Sopenharmony_ci				pcpu_reintegrate_chunk(chunk);
181362306a36Sopenharmony_ci				goto area_found;
181462306a36Sopenharmony_ci			}
181562306a36Sopenharmony_ci		}
181662306a36Sopenharmony_ci	}
181762306a36Sopenharmony_ci
181862306a36Sopenharmony_ci	spin_unlock_irqrestore(&pcpu_lock, flags);
181962306a36Sopenharmony_ci
182062306a36Sopenharmony_ci	if (is_atomic) {
182162306a36Sopenharmony_ci		err = "atomic alloc failed, no space left";
182262306a36Sopenharmony_ci		goto fail;
182362306a36Sopenharmony_ci	}
182462306a36Sopenharmony_ci
182562306a36Sopenharmony_ci	/* No space left.  Create a new chunk. */
182662306a36Sopenharmony_ci	if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
182762306a36Sopenharmony_ci		chunk = pcpu_create_chunk(pcpu_gfp);
182862306a36Sopenharmony_ci		if (!chunk) {
182962306a36Sopenharmony_ci			err = "failed to allocate new chunk";
183062306a36Sopenharmony_ci			goto fail;
183162306a36Sopenharmony_ci		}
183262306a36Sopenharmony_ci
183362306a36Sopenharmony_ci		spin_lock_irqsave(&pcpu_lock, flags);
183462306a36Sopenharmony_ci		pcpu_chunk_relocate(chunk, -1);
183562306a36Sopenharmony_ci	} else {
183662306a36Sopenharmony_ci		spin_lock_irqsave(&pcpu_lock, flags);
183762306a36Sopenharmony_ci	}
183862306a36Sopenharmony_ci
183962306a36Sopenharmony_ci	goto restart;
184062306a36Sopenharmony_ci
184162306a36Sopenharmony_ciarea_found:
184262306a36Sopenharmony_ci	pcpu_stats_area_alloc(chunk, size);
184362306a36Sopenharmony_ci	spin_unlock_irqrestore(&pcpu_lock, flags);
184462306a36Sopenharmony_ci
184562306a36Sopenharmony_ci	/* populate if not all pages are already there */
184662306a36Sopenharmony_ci	if (!is_atomic) {
184762306a36Sopenharmony_ci		unsigned int page_end, rs, re;
184862306a36Sopenharmony_ci
184962306a36Sopenharmony_ci		rs = PFN_DOWN(off);
185062306a36Sopenharmony_ci		page_end = PFN_UP(off + size);
185162306a36Sopenharmony_ci
185262306a36Sopenharmony_ci		for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) {
185362306a36Sopenharmony_ci			WARN_ON(chunk->immutable);
185462306a36Sopenharmony_ci
185562306a36Sopenharmony_ci			ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
185662306a36Sopenharmony_ci
185762306a36Sopenharmony_ci			spin_lock_irqsave(&pcpu_lock, flags);
185862306a36Sopenharmony_ci			if (ret) {
185962306a36Sopenharmony_ci				pcpu_free_area(chunk, off);
186062306a36Sopenharmony_ci				err = "failed to populate";
186162306a36Sopenharmony_ci				goto fail_unlock;
186262306a36Sopenharmony_ci			}
186362306a36Sopenharmony_ci			pcpu_chunk_populated(chunk, rs, re);
186462306a36Sopenharmony_ci			spin_unlock_irqrestore(&pcpu_lock, flags);
186562306a36Sopenharmony_ci		}
186662306a36Sopenharmony_ci
186762306a36Sopenharmony_ci		mutex_unlock(&pcpu_alloc_mutex);
186862306a36Sopenharmony_ci	}
186962306a36Sopenharmony_ci
187062306a36Sopenharmony_ci	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
187162306a36Sopenharmony_ci		pcpu_schedule_balance_work();
187262306a36Sopenharmony_ci
187362306a36Sopenharmony_ci	/* clear the areas and return address relative to base address */
187462306a36Sopenharmony_ci	for_each_possible_cpu(cpu)
187562306a36Sopenharmony_ci		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
187662306a36Sopenharmony_ci
187762306a36Sopenharmony_ci	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
187862306a36Sopenharmony_ci	kmemleak_alloc_percpu(ptr, size, gfp);
187962306a36Sopenharmony_ci
188062306a36Sopenharmony_ci	trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align,
188162306a36Sopenharmony_ci				  chunk->base_addr, off, ptr,
188262306a36Sopenharmony_ci				  pcpu_obj_full_size(size), gfp);
188362306a36Sopenharmony_ci
188462306a36Sopenharmony_ci	pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
188562306a36Sopenharmony_ci
188662306a36Sopenharmony_ci	return ptr;
188762306a36Sopenharmony_ci
188862306a36Sopenharmony_cifail_unlock:
188962306a36Sopenharmony_ci	spin_unlock_irqrestore(&pcpu_lock, flags);
189062306a36Sopenharmony_cifail:
189162306a36Sopenharmony_ci	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
189262306a36Sopenharmony_ci
189362306a36Sopenharmony_ci	if (do_warn && warn_limit) {
189462306a36Sopenharmony_ci		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
189562306a36Sopenharmony_ci			size, align, is_atomic, err);
189662306a36Sopenharmony_ci		if (!is_atomic)
189762306a36Sopenharmony_ci			dump_stack();
189862306a36Sopenharmony_ci		if (!--warn_limit)
189962306a36Sopenharmony_ci			pr_info("limit reached, disable warning\n");
190062306a36Sopenharmony_ci	}
190162306a36Sopenharmony_ci
190262306a36Sopenharmony_ci	if (is_atomic) {
190362306a36Sopenharmony_ci		/* see the flag handling in pcpu_balance_workfn() */
190462306a36Sopenharmony_ci		pcpu_atomic_alloc_failed = true;
190562306a36Sopenharmony_ci		pcpu_schedule_balance_work();
190662306a36Sopenharmony_ci	} else {
190762306a36Sopenharmony_ci		mutex_unlock(&pcpu_alloc_mutex);
190862306a36Sopenharmony_ci	}
190962306a36Sopenharmony_ci
191062306a36Sopenharmony_ci	pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
191162306a36Sopenharmony_ci
191262306a36Sopenharmony_ci	return NULL;
191362306a36Sopenharmony_ci}
191462306a36Sopenharmony_ci
191562306a36Sopenharmony_ci/**
191662306a36Sopenharmony_ci * __alloc_percpu_gfp - allocate dynamic percpu area
191762306a36Sopenharmony_ci * @size: size of area to allocate in bytes
191862306a36Sopenharmony_ci * @align: alignment of area (max PAGE_SIZE)
191962306a36Sopenharmony_ci * @gfp: allocation flags
192062306a36Sopenharmony_ci *
192162306a36Sopenharmony_ci * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
192262306a36Sopenharmony_ci * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
192362306a36Sopenharmony_ci * be called from any context but is a lot more likely to fail. If @gfp
192462306a36Sopenharmony_ci * has __GFP_NOWARN then no warning will be triggered on invalid or failed
192562306a36Sopenharmony_ci * allocation requests.
192662306a36Sopenharmony_ci *
192762306a36Sopenharmony_ci * RETURNS:
192862306a36Sopenharmony_ci * Percpu pointer to the allocated area on success, NULL on failure.
192962306a36Sopenharmony_ci */
193062306a36Sopenharmony_civoid __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
193162306a36Sopenharmony_ci{
193262306a36Sopenharmony_ci	return pcpu_alloc(size, align, false, gfp);
193362306a36Sopenharmony_ci}
193462306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
193562306a36Sopenharmony_ci
193662306a36Sopenharmony_ci/**
193762306a36Sopenharmony_ci * __alloc_percpu - allocate dynamic percpu area
193862306a36Sopenharmony_ci * @size: size of area to allocate in bytes
193962306a36Sopenharmony_ci * @align: alignment of area (max PAGE_SIZE)
194062306a36Sopenharmony_ci *
194162306a36Sopenharmony_ci * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
194262306a36Sopenharmony_ci */
194362306a36Sopenharmony_civoid __percpu *__alloc_percpu(size_t size, size_t align)
194462306a36Sopenharmony_ci{
194562306a36Sopenharmony_ci	return pcpu_alloc(size, align, false, GFP_KERNEL);
194662306a36Sopenharmony_ci}
194762306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(__alloc_percpu);
194862306a36Sopenharmony_ci
194962306a36Sopenharmony_ci/**
195062306a36Sopenharmony_ci * __alloc_reserved_percpu - allocate reserved percpu area
195162306a36Sopenharmony_ci * @size: size of area to allocate in bytes
195262306a36Sopenharmony_ci * @align: alignment of area (max PAGE_SIZE)
195362306a36Sopenharmony_ci *
195462306a36Sopenharmony_ci * Allocate zero-filled percpu area of @size bytes aligned at @align
195562306a36Sopenharmony_ci * from reserved percpu area if arch has set it up; otherwise,
195662306a36Sopenharmony_ci * allocation is served from the same dynamic area.  Might sleep.
195762306a36Sopenharmony_ci * Might trigger writeouts.
195862306a36Sopenharmony_ci *
195962306a36Sopenharmony_ci * CONTEXT:
196062306a36Sopenharmony_ci * Does GFP_KERNEL allocation.
196162306a36Sopenharmony_ci *
196262306a36Sopenharmony_ci * RETURNS:
196362306a36Sopenharmony_ci * Percpu pointer to the allocated area on success, NULL on failure.
196462306a36Sopenharmony_ci */
196562306a36Sopenharmony_civoid __percpu *__alloc_reserved_percpu(size_t size, size_t align)
196662306a36Sopenharmony_ci{
196762306a36Sopenharmony_ci	return pcpu_alloc(size, align, true, GFP_KERNEL);
196862306a36Sopenharmony_ci}
196962306a36Sopenharmony_ci
197062306a36Sopenharmony_ci/**
197162306a36Sopenharmony_ci * pcpu_balance_free - manage the amount of free chunks
197262306a36Sopenharmony_ci * @empty_only: free chunks only if there are no populated pages
197362306a36Sopenharmony_ci *
197462306a36Sopenharmony_ci * If empty_only is %false, reclaim all fully free chunks regardless of the
197562306a36Sopenharmony_ci * number of populated pages.  Otherwise, only reclaim chunks that have no
197662306a36Sopenharmony_ci * populated pages.
197762306a36Sopenharmony_ci *
197862306a36Sopenharmony_ci * CONTEXT:
197962306a36Sopenharmony_ci * pcpu_lock (can be dropped temporarily)
198062306a36Sopenharmony_ci */
198162306a36Sopenharmony_cistatic void pcpu_balance_free(bool empty_only)
198262306a36Sopenharmony_ci{
198362306a36Sopenharmony_ci	LIST_HEAD(to_free);
198462306a36Sopenharmony_ci	struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
198562306a36Sopenharmony_ci	struct pcpu_chunk *chunk, *next;
198662306a36Sopenharmony_ci
198762306a36Sopenharmony_ci	lockdep_assert_held(&pcpu_lock);
198862306a36Sopenharmony_ci
198962306a36Sopenharmony_ci	/*
199062306a36Sopenharmony_ci	 * There's no reason to keep around multiple unused chunks and VM
199162306a36Sopenharmony_ci	 * areas can be scarce.  Destroy all free chunks except for one.
199262306a36Sopenharmony_ci	 */
199362306a36Sopenharmony_ci	list_for_each_entry_safe(chunk, next, free_head, list) {
199462306a36Sopenharmony_ci		WARN_ON(chunk->immutable);
199562306a36Sopenharmony_ci
199662306a36Sopenharmony_ci		/* spare the first one */
199762306a36Sopenharmony_ci		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
199862306a36Sopenharmony_ci			continue;
199962306a36Sopenharmony_ci
200062306a36Sopenharmony_ci		if (!empty_only || chunk->nr_empty_pop_pages == 0)
200162306a36Sopenharmony_ci			list_move(&chunk->list, &to_free);
200262306a36Sopenharmony_ci	}
200362306a36Sopenharmony_ci
200462306a36Sopenharmony_ci	if (list_empty(&to_free))
200562306a36Sopenharmony_ci		return;
200662306a36Sopenharmony_ci
200762306a36Sopenharmony_ci	spin_unlock_irq(&pcpu_lock);
200862306a36Sopenharmony_ci	list_for_each_entry_safe(chunk, next, &to_free, list) {
200962306a36Sopenharmony_ci		unsigned int rs, re;
201062306a36Sopenharmony_ci
201162306a36Sopenharmony_ci		for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
201262306a36Sopenharmony_ci			pcpu_depopulate_chunk(chunk, rs, re);
201362306a36Sopenharmony_ci			spin_lock_irq(&pcpu_lock);
201462306a36Sopenharmony_ci			pcpu_chunk_depopulated(chunk, rs, re);
201562306a36Sopenharmony_ci			spin_unlock_irq(&pcpu_lock);
201662306a36Sopenharmony_ci		}
201762306a36Sopenharmony_ci		pcpu_destroy_chunk(chunk);
201862306a36Sopenharmony_ci		cond_resched();
201962306a36Sopenharmony_ci	}
202062306a36Sopenharmony_ci	spin_lock_irq(&pcpu_lock);
202162306a36Sopenharmony_ci}
202262306a36Sopenharmony_ci
202362306a36Sopenharmony_ci/**
202462306a36Sopenharmony_ci * pcpu_balance_populated - manage the amount of populated pages
202562306a36Sopenharmony_ci *
202662306a36Sopenharmony_ci * Maintain a certain amount of populated pages to satisfy atomic allocations.
202762306a36Sopenharmony_ci * It is possible that this is called when physical memory is scarce causing
202862306a36Sopenharmony_ci * OOM killer to be triggered.  We should avoid doing so until an actual
202962306a36Sopenharmony_ci * allocation causes the failure as it is possible that requests can be
203062306a36Sopenharmony_ci * serviced from already backed regions.
203162306a36Sopenharmony_ci *
203262306a36Sopenharmony_ci * CONTEXT:
203362306a36Sopenharmony_ci * pcpu_lock (can be dropped temporarily)
203462306a36Sopenharmony_ci */
203562306a36Sopenharmony_cistatic void pcpu_balance_populated(void)
203662306a36Sopenharmony_ci{
203762306a36Sopenharmony_ci	/* gfp flags passed to underlying allocators */
203862306a36Sopenharmony_ci	const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
203962306a36Sopenharmony_ci	struct pcpu_chunk *chunk;
204062306a36Sopenharmony_ci	int slot, nr_to_pop, ret;
204162306a36Sopenharmony_ci
204262306a36Sopenharmony_ci	lockdep_assert_held(&pcpu_lock);
204362306a36Sopenharmony_ci
204462306a36Sopenharmony_ci	/*
204562306a36Sopenharmony_ci	 * Ensure there are certain number of free populated pages for
204662306a36Sopenharmony_ci	 * atomic allocs.  Fill up from the most packed so that atomic
204762306a36Sopenharmony_ci	 * allocs don't increase fragmentation.  If atomic allocation
204862306a36Sopenharmony_ci	 * failed previously, always populate the maximum amount.  This
204962306a36Sopenharmony_ci	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
205062306a36Sopenharmony_ci	 * failing indefinitely; however, large atomic allocs are not
205162306a36Sopenharmony_ci	 * something we support properly and can be highly unreliable and
205262306a36Sopenharmony_ci	 * inefficient.
205362306a36Sopenharmony_ci	 */
205462306a36Sopenharmony_ciretry_pop:
205562306a36Sopenharmony_ci	if (pcpu_atomic_alloc_failed) {
205662306a36Sopenharmony_ci		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
205762306a36Sopenharmony_ci		/* best effort anyway, don't worry about synchronization */
205862306a36Sopenharmony_ci		pcpu_atomic_alloc_failed = false;
205962306a36Sopenharmony_ci	} else {
206062306a36Sopenharmony_ci		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
206162306a36Sopenharmony_ci				  pcpu_nr_empty_pop_pages,
206262306a36Sopenharmony_ci				  0, PCPU_EMPTY_POP_PAGES_HIGH);
206362306a36Sopenharmony_ci	}
206462306a36Sopenharmony_ci
206562306a36Sopenharmony_ci	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) {
206662306a36Sopenharmony_ci		unsigned int nr_unpop = 0, rs, re;
206762306a36Sopenharmony_ci
206862306a36Sopenharmony_ci		if (!nr_to_pop)
206962306a36Sopenharmony_ci			break;
207062306a36Sopenharmony_ci
207162306a36Sopenharmony_ci		list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
207262306a36Sopenharmony_ci			nr_unpop = chunk->nr_pages - chunk->nr_populated;
207362306a36Sopenharmony_ci			if (nr_unpop)
207462306a36Sopenharmony_ci				break;
207562306a36Sopenharmony_ci		}
207662306a36Sopenharmony_ci
207762306a36Sopenharmony_ci		if (!nr_unpop)
207862306a36Sopenharmony_ci			continue;
207962306a36Sopenharmony_ci
208062306a36Sopenharmony_ci		/* @chunk can't go away while pcpu_alloc_mutex is held */
208162306a36Sopenharmony_ci		for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
208262306a36Sopenharmony_ci			int nr = min_t(int, re - rs, nr_to_pop);
208362306a36Sopenharmony_ci
208462306a36Sopenharmony_ci			spin_unlock_irq(&pcpu_lock);
208562306a36Sopenharmony_ci			ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
208662306a36Sopenharmony_ci			cond_resched();
208762306a36Sopenharmony_ci			spin_lock_irq(&pcpu_lock);
208862306a36Sopenharmony_ci			if (!ret) {
208962306a36Sopenharmony_ci				nr_to_pop -= nr;
209062306a36Sopenharmony_ci				pcpu_chunk_populated(chunk, rs, rs + nr);
209162306a36Sopenharmony_ci			} else {
209262306a36Sopenharmony_ci				nr_to_pop = 0;
209362306a36Sopenharmony_ci			}
209462306a36Sopenharmony_ci
209562306a36Sopenharmony_ci			if (!nr_to_pop)
209662306a36Sopenharmony_ci				break;
209762306a36Sopenharmony_ci		}
209862306a36Sopenharmony_ci	}
209962306a36Sopenharmony_ci
210062306a36Sopenharmony_ci	if (nr_to_pop) {
210162306a36Sopenharmony_ci		/* ran out of chunks to populate, create a new one and retry */
210262306a36Sopenharmony_ci		spin_unlock_irq(&pcpu_lock);
210362306a36Sopenharmony_ci		chunk = pcpu_create_chunk(gfp);
210462306a36Sopenharmony_ci		cond_resched();
210562306a36Sopenharmony_ci		spin_lock_irq(&pcpu_lock);
210662306a36Sopenharmony_ci		if (chunk) {
210762306a36Sopenharmony_ci			pcpu_chunk_relocate(chunk, -1);
210862306a36Sopenharmony_ci			goto retry_pop;
210962306a36Sopenharmony_ci		}
211062306a36Sopenharmony_ci	}
211162306a36Sopenharmony_ci}
211262306a36Sopenharmony_ci
211362306a36Sopenharmony_ci/**
211462306a36Sopenharmony_ci * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
211562306a36Sopenharmony_ci *
211662306a36Sopenharmony_ci * Scan over chunks in the depopulate list and try to release unused populated
211762306a36Sopenharmony_ci * pages back to the system.  Depopulated chunks are sidelined to prevent
211862306a36Sopenharmony_ci * repopulating these pages unless required.  Fully free chunks are reintegrated
211962306a36Sopenharmony_ci * and freed accordingly (1 is kept around).  If we drop below the empty
212062306a36Sopenharmony_ci * populated pages threshold, reintegrate the chunk if it has empty free pages.
212162306a36Sopenharmony_ci * Each chunk is scanned in the reverse order to keep populated pages close to
212262306a36Sopenharmony_ci * the beginning of the chunk.
212362306a36Sopenharmony_ci *
212462306a36Sopenharmony_ci * CONTEXT:
212562306a36Sopenharmony_ci * pcpu_lock (can be dropped temporarily)
212662306a36Sopenharmony_ci *
212762306a36Sopenharmony_ci */
212862306a36Sopenharmony_cistatic void pcpu_reclaim_populated(void)
212962306a36Sopenharmony_ci{
213062306a36Sopenharmony_ci	struct pcpu_chunk *chunk;
213162306a36Sopenharmony_ci	struct pcpu_block_md *block;
213262306a36Sopenharmony_ci	int freed_page_start, freed_page_end;
213362306a36Sopenharmony_ci	int i, end;
213462306a36Sopenharmony_ci	bool reintegrate;
213562306a36Sopenharmony_ci
213662306a36Sopenharmony_ci	lockdep_assert_held(&pcpu_lock);
213762306a36Sopenharmony_ci
213862306a36Sopenharmony_ci	/*
213962306a36Sopenharmony_ci	 * Once a chunk is isolated to the to_depopulate list, the chunk is no
214062306a36Sopenharmony_ci	 * longer discoverable to allocations whom may populate pages.  The only
214162306a36Sopenharmony_ci	 * other accessor is the free path which only returns area back to the
214262306a36Sopenharmony_ci	 * allocator not touching the populated bitmap.
214362306a36Sopenharmony_ci	 */
214462306a36Sopenharmony_ci	while ((chunk = list_first_entry_or_null(
214562306a36Sopenharmony_ci			&pcpu_chunk_lists[pcpu_to_depopulate_slot],
214662306a36Sopenharmony_ci			struct pcpu_chunk, list))) {
214762306a36Sopenharmony_ci		WARN_ON(chunk->immutable);
214862306a36Sopenharmony_ci
214962306a36Sopenharmony_ci		/*
215062306a36Sopenharmony_ci		 * Scan chunk's pages in the reverse order to keep populated
215162306a36Sopenharmony_ci		 * pages close to the beginning of the chunk.
215262306a36Sopenharmony_ci		 */
215362306a36Sopenharmony_ci		freed_page_start = chunk->nr_pages;
215462306a36Sopenharmony_ci		freed_page_end = 0;
215562306a36Sopenharmony_ci		reintegrate = false;
215662306a36Sopenharmony_ci		for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
215762306a36Sopenharmony_ci			/* no more work to do */
215862306a36Sopenharmony_ci			if (chunk->nr_empty_pop_pages == 0)
215962306a36Sopenharmony_ci				break;
216062306a36Sopenharmony_ci
216162306a36Sopenharmony_ci			/* reintegrate chunk to prevent atomic alloc failures */
216262306a36Sopenharmony_ci			if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
216362306a36Sopenharmony_ci				reintegrate = true;
216462306a36Sopenharmony_ci				break;
216562306a36Sopenharmony_ci			}
216662306a36Sopenharmony_ci
216762306a36Sopenharmony_ci			/*
216862306a36Sopenharmony_ci			 * If the page is empty and populated, start or
216962306a36Sopenharmony_ci			 * extend the (i, end) range.  If i == 0, decrease
217062306a36Sopenharmony_ci			 * i and perform the depopulation to cover the last
217162306a36Sopenharmony_ci			 * (first) page in the chunk.
217262306a36Sopenharmony_ci			 */
217362306a36Sopenharmony_ci			block = chunk->md_blocks + i;
217462306a36Sopenharmony_ci			if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
217562306a36Sopenharmony_ci			    test_bit(i, chunk->populated)) {
217662306a36Sopenharmony_ci				if (end == -1)
217762306a36Sopenharmony_ci					end = i;
217862306a36Sopenharmony_ci				if (i > 0)
217962306a36Sopenharmony_ci					continue;
218062306a36Sopenharmony_ci				i--;
218162306a36Sopenharmony_ci			}
218262306a36Sopenharmony_ci
218362306a36Sopenharmony_ci			/* depopulate if there is an active range */
218462306a36Sopenharmony_ci			if (end == -1)
218562306a36Sopenharmony_ci				continue;
218662306a36Sopenharmony_ci
218762306a36Sopenharmony_ci			spin_unlock_irq(&pcpu_lock);
218862306a36Sopenharmony_ci			pcpu_depopulate_chunk(chunk, i + 1, end + 1);
218962306a36Sopenharmony_ci			cond_resched();
219062306a36Sopenharmony_ci			spin_lock_irq(&pcpu_lock);
219162306a36Sopenharmony_ci
219262306a36Sopenharmony_ci			pcpu_chunk_depopulated(chunk, i + 1, end + 1);
219362306a36Sopenharmony_ci			freed_page_start = min(freed_page_start, i + 1);
219462306a36Sopenharmony_ci			freed_page_end = max(freed_page_end, end + 1);
219562306a36Sopenharmony_ci
219662306a36Sopenharmony_ci			/* reset the range and continue */
219762306a36Sopenharmony_ci			end = -1;
219862306a36Sopenharmony_ci		}
219962306a36Sopenharmony_ci
220062306a36Sopenharmony_ci		/* batch tlb flush per chunk to amortize cost */
220162306a36Sopenharmony_ci		if (freed_page_start < freed_page_end) {
220262306a36Sopenharmony_ci			spin_unlock_irq(&pcpu_lock);
220362306a36Sopenharmony_ci			pcpu_post_unmap_tlb_flush(chunk,
220462306a36Sopenharmony_ci						  freed_page_start,
220562306a36Sopenharmony_ci						  freed_page_end);
220662306a36Sopenharmony_ci			cond_resched();
220762306a36Sopenharmony_ci			spin_lock_irq(&pcpu_lock);
220862306a36Sopenharmony_ci		}
220962306a36Sopenharmony_ci
221062306a36Sopenharmony_ci		if (reintegrate || chunk->free_bytes == pcpu_unit_size)
221162306a36Sopenharmony_ci			pcpu_reintegrate_chunk(chunk);
221262306a36Sopenharmony_ci		else
221362306a36Sopenharmony_ci			list_move_tail(&chunk->list,
221462306a36Sopenharmony_ci				       &pcpu_chunk_lists[pcpu_sidelined_slot]);
221562306a36Sopenharmony_ci	}
221662306a36Sopenharmony_ci}
221762306a36Sopenharmony_ci
221862306a36Sopenharmony_ci/**
221962306a36Sopenharmony_ci * pcpu_balance_workfn - manage the amount of free chunks and populated pages
222062306a36Sopenharmony_ci * @work: unused
222162306a36Sopenharmony_ci *
222262306a36Sopenharmony_ci * For each chunk type, manage the number of fully free chunks and the number of
222362306a36Sopenharmony_ci * populated pages.  An important thing to consider is when pages are freed and
222462306a36Sopenharmony_ci * how they contribute to the global counts.
222562306a36Sopenharmony_ci */
222662306a36Sopenharmony_cistatic void pcpu_balance_workfn(struct work_struct *work)
222762306a36Sopenharmony_ci{
222862306a36Sopenharmony_ci	/*
222962306a36Sopenharmony_ci	 * pcpu_balance_free() is called twice because the first time we may
223062306a36Sopenharmony_ci	 * trim pages in the active pcpu_nr_empty_pop_pages which may cause us
223162306a36Sopenharmony_ci	 * to grow other chunks.  This then gives pcpu_reclaim_populated() time
223262306a36Sopenharmony_ci	 * to move fully free chunks to the active list to be freed if
223362306a36Sopenharmony_ci	 * appropriate.
223462306a36Sopenharmony_ci	 */
223562306a36Sopenharmony_ci	mutex_lock(&pcpu_alloc_mutex);
223662306a36Sopenharmony_ci	spin_lock_irq(&pcpu_lock);
223762306a36Sopenharmony_ci
223862306a36Sopenharmony_ci	pcpu_balance_free(false);
223962306a36Sopenharmony_ci	pcpu_reclaim_populated();
224062306a36Sopenharmony_ci	pcpu_balance_populated();
224162306a36Sopenharmony_ci	pcpu_balance_free(true);
224262306a36Sopenharmony_ci
224362306a36Sopenharmony_ci	spin_unlock_irq(&pcpu_lock);
224462306a36Sopenharmony_ci	mutex_unlock(&pcpu_alloc_mutex);
224562306a36Sopenharmony_ci}
224662306a36Sopenharmony_ci
224762306a36Sopenharmony_ci/**
224862306a36Sopenharmony_ci * free_percpu - free percpu area
224962306a36Sopenharmony_ci * @ptr: pointer to area to free
225062306a36Sopenharmony_ci *
225162306a36Sopenharmony_ci * Free percpu area @ptr.
225262306a36Sopenharmony_ci *
225362306a36Sopenharmony_ci * CONTEXT:
225462306a36Sopenharmony_ci * Can be called from atomic context.
225562306a36Sopenharmony_ci */
225662306a36Sopenharmony_civoid free_percpu(void __percpu *ptr)
225762306a36Sopenharmony_ci{
225862306a36Sopenharmony_ci	void *addr;
225962306a36Sopenharmony_ci	struct pcpu_chunk *chunk;
226062306a36Sopenharmony_ci	unsigned long flags;
226162306a36Sopenharmony_ci	int size, off;
226262306a36Sopenharmony_ci	bool need_balance = false;
226362306a36Sopenharmony_ci
226462306a36Sopenharmony_ci	if (!ptr)
226562306a36Sopenharmony_ci		return;
226662306a36Sopenharmony_ci
226762306a36Sopenharmony_ci	kmemleak_free_percpu(ptr);
226862306a36Sopenharmony_ci
226962306a36Sopenharmony_ci	addr = __pcpu_ptr_to_addr(ptr);
227062306a36Sopenharmony_ci
227162306a36Sopenharmony_ci	spin_lock_irqsave(&pcpu_lock, flags);
227262306a36Sopenharmony_ci
227362306a36Sopenharmony_ci	chunk = pcpu_chunk_addr_search(addr);
227462306a36Sopenharmony_ci	off = addr - chunk->base_addr;
227562306a36Sopenharmony_ci
227662306a36Sopenharmony_ci	size = pcpu_free_area(chunk, off);
227762306a36Sopenharmony_ci
227862306a36Sopenharmony_ci	pcpu_memcg_free_hook(chunk, off, size);
227962306a36Sopenharmony_ci
228062306a36Sopenharmony_ci	/*
228162306a36Sopenharmony_ci	 * If there are more than one fully free chunks, wake up grim reaper.
228262306a36Sopenharmony_ci	 * If the chunk is isolated, it may be in the process of being
228362306a36Sopenharmony_ci	 * reclaimed.  Let reclaim manage cleaning up of that chunk.
228462306a36Sopenharmony_ci	 */
228562306a36Sopenharmony_ci	if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
228662306a36Sopenharmony_ci		struct pcpu_chunk *pos;
228762306a36Sopenharmony_ci
228862306a36Sopenharmony_ci		list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list)
228962306a36Sopenharmony_ci			if (pos != chunk) {
229062306a36Sopenharmony_ci				need_balance = true;
229162306a36Sopenharmony_ci				break;
229262306a36Sopenharmony_ci			}
229362306a36Sopenharmony_ci	} else if (pcpu_should_reclaim_chunk(chunk)) {
229462306a36Sopenharmony_ci		pcpu_isolate_chunk(chunk);
229562306a36Sopenharmony_ci		need_balance = true;
229662306a36Sopenharmony_ci	}
229762306a36Sopenharmony_ci
229862306a36Sopenharmony_ci	trace_percpu_free_percpu(chunk->base_addr, off, ptr);
229962306a36Sopenharmony_ci
230062306a36Sopenharmony_ci	spin_unlock_irqrestore(&pcpu_lock, flags);
230162306a36Sopenharmony_ci
230262306a36Sopenharmony_ci	if (need_balance)
230362306a36Sopenharmony_ci		pcpu_schedule_balance_work();
230462306a36Sopenharmony_ci}
230562306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(free_percpu);
230662306a36Sopenharmony_ci
230762306a36Sopenharmony_cibool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
230862306a36Sopenharmony_ci{
230962306a36Sopenharmony_ci#ifdef CONFIG_SMP
231062306a36Sopenharmony_ci	const size_t static_size = __per_cpu_end - __per_cpu_start;
231162306a36Sopenharmony_ci	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
231262306a36Sopenharmony_ci	unsigned int cpu;
231362306a36Sopenharmony_ci
231462306a36Sopenharmony_ci	for_each_possible_cpu(cpu) {
231562306a36Sopenharmony_ci		void *start = per_cpu_ptr(base, cpu);
231662306a36Sopenharmony_ci		void *va = (void *)addr;
231762306a36Sopenharmony_ci
231862306a36Sopenharmony_ci		if (va >= start && va < start + static_size) {
231962306a36Sopenharmony_ci			if (can_addr) {
232062306a36Sopenharmony_ci				*can_addr = (unsigned long) (va - start);
232162306a36Sopenharmony_ci				*can_addr += (unsigned long)
232262306a36Sopenharmony_ci					per_cpu_ptr(base, get_boot_cpu_id());
232362306a36Sopenharmony_ci			}
232462306a36Sopenharmony_ci			return true;
232562306a36Sopenharmony_ci		}
232662306a36Sopenharmony_ci	}
232762306a36Sopenharmony_ci#endif
232862306a36Sopenharmony_ci	/* on UP, can't distinguish from other static vars, always false */
232962306a36Sopenharmony_ci	return false;
233062306a36Sopenharmony_ci}
233162306a36Sopenharmony_ci
233262306a36Sopenharmony_ci/**
233362306a36Sopenharmony_ci * is_kernel_percpu_address - test whether address is from static percpu area
233462306a36Sopenharmony_ci * @addr: address to test
233562306a36Sopenharmony_ci *
233662306a36Sopenharmony_ci * Test whether @addr belongs to in-kernel static percpu area.  Module
233762306a36Sopenharmony_ci * static percpu areas are not considered.  For those, use
233862306a36Sopenharmony_ci * is_module_percpu_address().
233962306a36Sopenharmony_ci *
234062306a36Sopenharmony_ci * RETURNS:
234162306a36Sopenharmony_ci * %true if @addr is from in-kernel static percpu area, %false otherwise.
234262306a36Sopenharmony_ci */
234362306a36Sopenharmony_cibool is_kernel_percpu_address(unsigned long addr)
234462306a36Sopenharmony_ci{
234562306a36Sopenharmony_ci	return __is_kernel_percpu_address(addr, NULL);
234662306a36Sopenharmony_ci}
234762306a36Sopenharmony_ci
234862306a36Sopenharmony_ci/**
234962306a36Sopenharmony_ci * per_cpu_ptr_to_phys - convert translated percpu address to physical address
235062306a36Sopenharmony_ci * @addr: the address to be converted to physical address
235162306a36Sopenharmony_ci *
235262306a36Sopenharmony_ci * Given @addr which is dereferenceable address obtained via one of
235362306a36Sopenharmony_ci * percpu access macros, this function translates it into its physical
235462306a36Sopenharmony_ci * address.  The caller is responsible for ensuring @addr stays valid
235562306a36Sopenharmony_ci * until this function finishes.
235662306a36Sopenharmony_ci *
235762306a36Sopenharmony_ci * percpu allocator has special setup for the first chunk, which currently
235862306a36Sopenharmony_ci * supports either embedding in linear address space or vmalloc mapping,
235962306a36Sopenharmony_ci * and, from the second one, the backing allocator (currently either vm or
236062306a36Sopenharmony_ci * km) provides translation.
236162306a36Sopenharmony_ci *
236262306a36Sopenharmony_ci * The addr can be translated simply without checking if it falls into the
236362306a36Sopenharmony_ci * first chunk. But the current code reflects better how percpu allocator
236462306a36Sopenharmony_ci * actually works, and the verification can discover both bugs in percpu
236562306a36Sopenharmony_ci * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
236662306a36Sopenharmony_ci * code.
236762306a36Sopenharmony_ci *
236862306a36Sopenharmony_ci * RETURNS:
236962306a36Sopenharmony_ci * The physical address for @addr.
237062306a36Sopenharmony_ci */
237162306a36Sopenharmony_ciphys_addr_t per_cpu_ptr_to_phys(void *addr)
237262306a36Sopenharmony_ci{
237362306a36Sopenharmony_ci	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
237462306a36Sopenharmony_ci	bool in_first_chunk = false;
237562306a36Sopenharmony_ci	unsigned long first_low, first_high;
237662306a36Sopenharmony_ci	unsigned int cpu;
237762306a36Sopenharmony_ci
237862306a36Sopenharmony_ci	/*
237962306a36Sopenharmony_ci	 * The following test on unit_low/high isn't strictly
238062306a36Sopenharmony_ci	 * necessary but will speed up lookups of addresses which
238162306a36Sopenharmony_ci	 * aren't in the first chunk.
238262306a36Sopenharmony_ci	 *
238362306a36Sopenharmony_ci	 * The address check is against full chunk sizes.  pcpu_base_addr
238462306a36Sopenharmony_ci	 * points to the beginning of the first chunk including the
238562306a36Sopenharmony_ci	 * static region.  Assumes good intent as the first chunk may
238662306a36Sopenharmony_ci	 * not be full (ie. < pcpu_unit_pages in size).
238762306a36Sopenharmony_ci	 */
238862306a36Sopenharmony_ci	first_low = (unsigned long)pcpu_base_addr +
238962306a36Sopenharmony_ci		    pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
239062306a36Sopenharmony_ci	first_high = (unsigned long)pcpu_base_addr +
239162306a36Sopenharmony_ci		     pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
239262306a36Sopenharmony_ci	if ((unsigned long)addr >= first_low &&
239362306a36Sopenharmony_ci	    (unsigned long)addr < first_high) {
239462306a36Sopenharmony_ci		for_each_possible_cpu(cpu) {
239562306a36Sopenharmony_ci			void *start = per_cpu_ptr(base, cpu);
239662306a36Sopenharmony_ci
239762306a36Sopenharmony_ci			if (addr >= start && addr < start + pcpu_unit_size) {
239862306a36Sopenharmony_ci				in_first_chunk = true;
239962306a36Sopenharmony_ci				break;
240062306a36Sopenharmony_ci			}
240162306a36Sopenharmony_ci		}
240262306a36Sopenharmony_ci	}
240362306a36Sopenharmony_ci
240462306a36Sopenharmony_ci	if (in_first_chunk) {
240562306a36Sopenharmony_ci		if (!is_vmalloc_addr(addr))
240662306a36Sopenharmony_ci			return __pa(addr);
240762306a36Sopenharmony_ci		else
240862306a36Sopenharmony_ci			return page_to_phys(vmalloc_to_page(addr)) +
240962306a36Sopenharmony_ci			       offset_in_page(addr);
241062306a36Sopenharmony_ci	} else
241162306a36Sopenharmony_ci		return page_to_phys(pcpu_addr_to_page(addr)) +
241262306a36Sopenharmony_ci		       offset_in_page(addr);
241362306a36Sopenharmony_ci}
241462306a36Sopenharmony_ci
241562306a36Sopenharmony_ci/**
241662306a36Sopenharmony_ci * pcpu_alloc_alloc_info - allocate percpu allocation info
241762306a36Sopenharmony_ci * @nr_groups: the number of groups
241862306a36Sopenharmony_ci * @nr_units: the number of units
241962306a36Sopenharmony_ci *
242062306a36Sopenharmony_ci * Allocate ai which is large enough for @nr_groups groups containing
242162306a36Sopenharmony_ci * @nr_units units.  The returned ai's groups[0].cpu_map points to the
242262306a36Sopenharmony_ci * cpu_map array which is long enough for @nr_units and filled with
242362306a36Sopenharmony_ci * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
242462306a36Sopenharmony_ci * pointer of other groups.
242562306a36Sopenharmony_ci *
242662306a36Sopenharmony_ci * RETURNS:
242762306a36Sopenharmony_ci * Pointer to the allocated pcpu_alloc_info on success, NULL on
242862306a36Sopenharmony_ci * failure.
242962306a36Sopenharmony_ci */
243062306a36Sopenharmony_cistruct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
243162306a36Sopenharmony_ci						      int nr_units)
243262306a36Sopenharmony_ci{
243362306a36Sopenharmony_ci	struct pcpu_alloc_info *ai;
243462306a36Sopenharmony_ci	size_t base_size, ai_size;
243562306a36Sopenharmony_ci	void *ptr;
243662306a36Sopenharmony_ci	int unit;
243762306a36Sopenharmony_ci
243862306a36Sopenharmony_ci	base_size = ALIGN(struct_size(ai, groups, nr_groups),
243962306a36Sopenharmony_ci			  __alignof__(ai->groups[0].cpu_map[0]));
244062306a36Sopenharmony_ci	ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
244162306a36Sopenharmony_ci
244262306a36Sopenharmony_ci	ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
244362306a36Sopenharmony_ci	if (!ptr)
244462306a36Sopenharmony_ci		return NULL;
244562306a36Sopenharmony_ci	ai = ptr;
244662306a36Sopenharmony_ci	ptr += base_size;
244762306a36Sopenharmony_ci
244862306a36Sopenharmony_ci	ai->groups[0].cpu_map = ptr;
244962306a36Sopenharmony_ci
245062306a36Sopenharmony_ci	for (unit = 0; unit < nr_units; unit++)
245162306a36Sopenharmony_ci		ai->groups[0].cpu_map[unit] = NR_CPUS;
245262306a36Sopenharmony_ci
245362306a36Sopenharmony_ci	ai->nr_groups = nr_groups;
245462306a36Sopenharmony_ci	ai->__ai_size = PFN_ALIGN(ai_size);
245562306a36Sopenharmony_ci
245662306a36Sopenharmony_ci	return ai;
245762306a36Sopenharmony_ci}
245862306a36Sopenharmony_ci
245962306a36Sopenharmony_ci/**
246062306a36Sopenharmony_ci * pcpu_free_alloc_info - free percpu allocation info
246162306a36Sopenharmony_ci * @ai: pcpu_alloc_info to free
246262306a36Sopenharmony_ci *
246362306a36Sopenharmony_ci * Free @ai which was allocated by pcpu_alloc_alloc_info().
246462306a36Sopenharmony_ci */
246562306a36Sopenharmony_civoid __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
246662306a36Sopenharmony_ci{
246762306a36Sopenharmony_ci	memblock_free(ai, ai->__ai_size);
246862306a36Sopenharmony_ci}
246962306a36Sopenharmony_ci
247062306a36Sopenharmony_ci/**
247162306a36Sopenharmony_ci * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
247262306a36Sopenharmony_ci * @lvl: loglevel
247362306a36Sopenharmony_ci * @ai: allocation info to dump
247462306a36Sopenharmony_ci *
247562306a36Sopenharmony_ci * Print out information about @ai using loglevel @lvl.
247662306a36Sopenharmony_ci */
247762306a36Sopenharmony_cistatic void pcpu_dump_alloc_info(const char *lvl,
247862306a36Sopenharmony_ci				 const struct pcpu_alloc_info *ai)
247962306a36Sopenharmony_ci{
248062306a36Sopenharmony_ci	int group_width = 1, cpu_width = 1, width;
248162306a36Sopenharmony_ci	char empty_str[] = "--------";
248262306a36Sopenharmony_ci	int alloc = 0, alloc_end = 0;
248362306a36Sopenharmony_ci	int group, v;
248462306a36Sopenharmony_ci	int upa, apl;	/* units per alloc, allocs per line */
248562306a36Sopenharmony_ci
248662306a36Sopenharmony_ci	v = ai->nr_groups;
248762306a36Sopenharmony_ci	while (v /= 10)
248862306a36Sopenharmony_ci		group_width++;
248962306a36Sopenharmony_ci
249062306a36Sopenharmony_ci	v = num_possible_cpus();
249162306a36Sopenharmony_ci	while (v /= 10)
249262306a36Sopenharmony_ci		cpu_width++;
249362306a36Sopenharmony_ci	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
249462306a36Sopenharmony_ci
249562306a36Sopenharmony_ci	upa = ai->alloc_size / ai->unit_size;
249662306a36Sopenharmony_ci	width = upa * (cpu_width + 1) + group_width + 3;
249762306a36Sopenharmony_ci	apl = rounddown_pow_of_two(max(60 / width, 1));
249862306a36Sopenharmony_ci
249962306a36Sopenharmony_ci	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
250062306a36Sopenharmony_ci	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
250162306a36Sopenharmony_ci	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
250262306a36Sopenharmony_ci
250362306a36Sopenharmony_ci	for (group = 0; group < ai->nr_groups; group++) {
250462306a36Sopenharmony_ci		const struct pcpu_group_info *gi = &ai->groups[group];
250562306a36Sopenharmony_ci		int unit = 0, unit_end = 0;
250662306a36Sopenharmony_ci
250762306a36Sopenharmony_ci		BUG_ON(gi->nr_units % upa);
250862306a36Sopenharmony_ci		for (alloc_end += gi->nr_units / upa;
250962306a36Sopenharmony_ci		     alloc < alloc_end; alloc++) {
251062306a36Sopenharmony_ci			if (!(alloc % apl)) {
251162306a36Sopenharmony_ci				pr_cont("\n");
251262306a36Sopenharmony_ci				printk("%spcpu-alloc: ", lvl);
251362306a36Sopenharmony_ci			}
251462306a36Sopenharmony_ci			pr_cont("[%0*d] ", group_width, group);
251562306a36Sopenharmony_ci
251662306a36Sopenharmony_ci			for (unit_end += upa; unit < unit_end; unit++)
251762306a36Sopenharmony_ci				if (gi->cpu_map[unit] != NR_CPUS)
251862306a36Sopenharmony_ci					pr_cont("%0*d ",
251962306a36Sopenharmony_ci						cpu_width, gi->cpu_map[unit]);
252062306a36Sopenharmony_ci				else
252162306a36Sopenharmony_ci					pr_cont("%s ", empty_str);
252262306a36Sopenharmony_ci		}
252362306a36Sopenharmony_ci	}
252462306a36Sopenharmony_ci	pr_cont("\n");
252562306a36Sopenharmony_ci}
252662306a36Sopenharmony_ci
252762306a36Sopenharmony_ci/**
252862306a36Sopenharmony_ci * pcpu_setup_first_chunk - initialize the first percpu chunk
252962306a36Sopenharmony_ci * @ai: pcpu_alloc_info describing how to percpu area is shaped
253062306a36Sopenharmony_ci * @base_addr: mapped address
253162306a36Sopenharmony_ci *
253262306a36Sopenharmony_ci * Initialize the first percpu chunk which contains the kernel static
253362306a36Sopenharmony_ci * percpu area.  This function is to be called from arch percpu area
253462306a36Sopenharmony_ci * setup path.
253562306a36Sopenharmony_ci *
253662306a36Sopenharmony_ci * @ai contains all information necessary to initialize the first
253762306a36Sopenharmony_ci * chunk and prime the dynamic percpu allocator.
253862306a36Sopenharmony_ci *
253962306a36Sopenharmony_ci * @ai->static_size is the size of static percpu area.
254062306a36Sopenharmony_ci *
254162306a36Sopenharmony_ci * @ai->reserved_size, if non-zero, specifies the amount of bytes to
254262306a36Sopenharmony_ci * reserve after the static area in the first chunk.  This reserves
254362306a36Sopenharmony_ci * the first chunk such that it's available only through reserved
254462306a36Sopenharmony_ci * percpu allocation.  This is primarily used to serve module percpu
254562306a36Sopenharmony_ci * static areas on architectures where the addressing model has
254662306a36Sopenharmony_ci * limited offset range for symbol relocations to guarantee module
254762306a36Sopenharmony_ci * percpu symbols fall inside the relocatable range.
254862306a36Sopenharmony_ci *
254962306a36Sopenharmony_ci * @ai->dyn_size determines the number of bytes available for dynamic
255062306a36Sopenharmony_ci * allocation in the first chunk.  The area between @ai->static_size +
255162306a36Sopenharmony_ci * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
255262306a36Sopenharmony_ci *
255362306a36Sopenharmony_ci * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
255462306a36Sopenharmony_ci * and equal to or larger than @ai->static_size + @ai->reserved_size +
255562306a36Sopenharmony_ci * @ai->dyn_size.
255662306a36Sopenharmony_ci *
255762306a36Sopenharmony_ci * @ai->atom_size is the allocation atom size and used as alignment
255862306a36Sopenharmony_ci * for vm areas.
255962306a36Sopenharmony_ci *
256062306a36Sopenharmony_ci * @ai->alloc_size is the allocation size and always multiple of
256162306a36Sopenharmony_ci * @ai->atom_size.  This is larger than @ai->atom_size if
256262306a36Sopenharmony_ci * @ai->unit_size is larger than @ai->atom_size.
256362306a36Sopenharmony_ci *
256462306a36Sopenharmony_ci * @ai->nr_groups and @ai->groups describe virtual memory layout of
256562306a36Sopenharmony_ci * percpu areas.  Units which should be colocated are put into the
256662306a36Sopenharmony_ci * same group.  Dynamic VM areas will be allocated according to these
256762306a36Sopenharmony_ci * groupings.  If @ai->nr_groups is zero, a single group containing
256862306a36Sopenharmony_ci * all units is assumed.
256962306a36Sopenharmony_ci *
257062306a36Sopenharmony_ci * The caller should have mapped the first chunk at @base_addr and
257162306a36Sopenharmony_ci * copied static data to each unit.
257262306a36Sopenharmony_ci *
257362306a36Sopenharmony_ci * The first chunk will always contain a static and a dynamic region.
257462306a36Sopenharmony_ci * However, the static region is not managed by any chunk.  If the first
257562306a36Sopenharmony_ci * chunk also contains a reserved region, it is served by two chunks -
257662306a36Sopenharmony_ci * one for the reserved region and one for the dynamic region.  They
257762306a36Sopenharmony_ci * share the same vm, but use offset regions in the area allocation map.
257862306a36Sopenharmony_ci * The chunk serving the dynamic region is circulated in the chunk slots
257962306a36Sopenharmony_ci * and available for dynamic allocation like any other chunk.
258062306a36Sopenharmony_ci */
258162306a36Sopenharmony_civoid __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
258262306a36Sopenharmony_ci				   void *base_addr)
258362306a36Sopenharmony_ci{
258462306a36Sopenharmony_ci	size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
258562306a36Sopenharmony_ci	size_t static_size, dyn_size;
258662306a36Sopenharmony_ci	unsigned long *group_offsets;
258762306a36Sopenharmony_ci	size_t *group_sizes;
258862306a36Sopenharmony_ci	unsigned long *unit_off;
258962306a36Sopenharmony_ci	unsigned int cpu;
259062306a36Sopenharmony_ci	int *unit_map;
259162306a36Sopenharmony_ci	int group, unit, i;
259262306a36Sopenharmony_ci	unsigned long tmp_addr;
259362306a36Sopenharmony_ci	size_t alloc_size;
259462306a36Sopenharmony_ci
259562306a36Sopenharmony_ci#define PCPU_SETUP_BUG_ON(cond)	do {					\
259662306a36Sopenharmony_ci	if (unlikely(cond)) {						\
259762306a36Sopenharmony_ci		pr_emerg("failed to initialize, %s\n", #cond);		\
259862306a36Sopenharmony_ci		pr_emerg("cpu_possible_mask=%*pb\n",			\
259962306a36Sopenharmony_ci			 cpumask_pr_args(cpu_possible_mask));		\
260062306a36Sopenharmony_ci		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
260162306a36Sopenharmony_ci		BUG();							\
260262306a36Sopenharmony_ci	}								\
260362306a36Sopenharmony_ci} while (0)
260462306a36Sopenharmony_ci
260562306a36Sopenharmony_ci	/* sanity checks */
260662306a36Sopenharmony_ci	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
260762306a36Sopenharmony_ci#ifdef CONFIG_SMP
260862306a36Sopenharmony_ci	PCPU_SETUP_BUG_ON(!ai->static_size);
260962306a36Sopenharmony_ci	PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
261062306a36Sopenharmony_ci#endif
261162306a36Sopenharmony_ci	PCPU_SETUP_BUG_ON(!base_addr);
261262306a36Sopenharmony_ci	PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
261362306a36Sopenharmony_ci	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
261462306a36Sopenharmony_ci	PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
261562306a36Sopenharmony_ci	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
261662306a36Sopenharmony_ci	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
261762306a36Sopenharmony_ci	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
261862306a36Sopenharmony_ci	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
261962306a36Sopenharmony_ci	PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
262062306a36Sopenharmony_ci			    IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
262162306a36Sopenharmony_ci	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
262262306a36Sopenharmony_ci
262362306a36Sopenharmony_ci	/* process group information and build config tables accordingly */
262462306a36Sopenharmony_ci	alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
262562306a36Sopenharmony_ci	group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
262662306a36Sopenharmony_ci	if (!group_offsets)
262762306a36Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
262862306a36Sopenharmony_ci		      alloc_size);
262962306a36Sopenharmony_ci
263062306a36Sopenharmony_ci	alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
263162306a36Sopenharmony_ci	group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
263262306a36Sopenharmony_ci	if (!group_sizes)
263362306a36Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
263462306a36Sopenharmony_ci		      alloc_size);
263562306a36Sopenharmony_ci
263662306a36Sopenharmony_ci	alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
263762306a36Sopenharmony_ci	unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
263862306a36Sopenharmony_ci	if (!unit_map)
263962306a36Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
264062306a36Sopenharmony_ci		      alloc_size);
264162306a36Sopenharmony_ci
264262306a36Sopenharmony_ci	alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
264362306a36Sopenharmony_ci	unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
264462306a36Sopenharmony_ci	if (!unit_off)
264562306a36Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
264662306a36Sopenharmony_ci		      alloc_size);
264762306a36Sopenharmony_ci
264862306a36Sopenharmony_ci	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
264962306a36Sopenharmony_ci		unit_map[cpu] = UINT_MAX;
265062306a36Sopenharmony_ci
265162306a36Sopenharmony_ci	pcpu_low_unit_cpu = NR_CPUS;
265262306a36Sopenharmony_ci	pcpu_high_unit_cpu = NR_CPUS;
265362306a36Sopenharmony_ci
265462306a36Sopenharmony_ci	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
265562306a36Sopenharmony_ci		const struct pcpu_group_info *gi = &ai->groups[group];
265662306a36Sopenharmony_ci
265762306a36Sopenharmony_ci		group_offsets[group] = gi->base_offset;
265862306a36Sopenharmony_ci		group_sizes[group] = gi->nr_units * ai->unit_size;
265962306a36Sopenharmony_ci
266062306a36Sopenharmony_ci		for (i = 0; i < gi->nr_units; i++) {
266162306a36Sopenharmony_ci			cpu = gi->cpu_map[i];
266262306a36Sopenharmony_ci			if (cpu == NR_CPUS)
266362306a36Sopenharmony_ci				continue;
266462306a36Sopenharmony_ci
266562306a36Sopenharmony_ci			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
266662306a36Sopenharmony_ci			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
266762306a36Sopenharmony_ci			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
266862306a36Sopenharmony_ci
266962306a36Sopenharmony_ci			unit_map[cpu] = unit + i;
267062306a36Sopenharmony_ci			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
267162306a36Sopenharmony_ci
267262306a36Sopenharmony_ci			/* determine low/high unit_cpu */
267362306a36Sopenharmony_ci			if (pcpu_low_unit_cpu == NR_CPUS ||
267462306a36Sopenharmony_ci			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
267562306a36Sopenharmony_ci				pcpu_low_unit_cpu = cpu;
267662306a36Sopenharmony_ci			if (pcpu_high_unit_cpu == NR_CPUS ||
267762306a36Sopenharmony_ci			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
267862306a36Sopenharmony_ci				pcpu_high_unit_cpu = cpu;
267962306a36Sopenharmony_ci		}
268062306a36Sopenharmony_ci	}
268162306a36Sopenharmony_ci	pcpu_nr_units = unit;
268262306a36Sopenharmony_ci
268362306a36Sopenharmony_ci	for_each_possible_cpu(cpu)
268462306a36Sopenharmony_ci		PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
268562306a36Sopenharmony_ci
268662306a36Sopenharmony_ci	/* we're done parsing the input, undefine BUG macro and dump config */
268762306a36Sopenharmony_ci#undef PCPU_SETUP_BUG_ON
268862306a36Sopenharmony_ci	pcpu_dump_alloc_info(KERN_DEBUG, ai);
268962306a36Sopenharmony_ci
269062306a36Sopenharmony_ci	pcpu_nr_groups = ai->nr_groups;
269162306a36Sopenharmony_ci	pcpu_group_offsets = group_offsets;
269262306a36Sopenharmony_ci	pcpu_group_sizes = group_sizes;
269362306a36Sopenharmony_ci	pcpu_unit_map = unit_map;
269462306a36Sopenharmony_ci	pcpu_unit_offsets = unit_off;
269562306a36Sopenharmony_ci
269662306a36Sopenharmony_ci	/* determine basic parameters */
269762306a36Sopenharmony_ci	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
269862306a36Sopenharmony_ci	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
269962306a36Sopenharmony_ci	pcpu_atom_size = ai->atom_size;
270062306a36Sopenharmony_ci	pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated,
270162306a36Sopenharmony_ci					     BITS_TO_LONGS(pcpu_unit_pages));
270262306a36Sopenharmony_ci
270362306a36Sopenharmony_ci	pcpu_stats_save_ai(ai);
270462306a36Sopenharmony_ci
270562306a36Sopenharmony_ci	/*
270662306a36Sopenharmony_ci	 * Allocate chunk slots.  The slots after the active slots are:
270762306a36Sopenharmony_ci	 *   sidelined_slot - isolated, depopulated chunks
270862306a36Sopenharmony_ci	 *   free_slot - fully free chunks
270962306a36Sopenharmony_ci	 *   to_depopulate_slot - isolated, chunks to depopulate
271062306a36Sopenharmony_ci	 */
271162306a36Sopenharmony_ci	pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
271262306a36Sopenharmony_ci	pcpu_free_slot = pcpu_sidelined_slot + 1;
271362306a36Sopenharmony_ci	pcpu_to_depopulate_slot = pcpu_free_slot + 1;
271462306a36Sopenharmony_ci	pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
271562306a36Sopenharmony_ci	pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
271662306a36Sopenharmony_ci					  sizeof(pcpu_chunk_lists[0]),
271762306a36Sopenharmony_ci					  SMP_CACHE_BYTES);
271862306a36Sopenharmony_ci	if (!pcpu_chunk_lists)
271962306a36Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
272062306a36Sopenharmony_ci		      pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]));
272162306a36Sopenharmony_ci
272262306a36Sopenharmony_ci	for (i = 0; i < pcpu_nr_slots; i++)
272362306a36Sopenharmony_ci		INIT_LIST_HEAD(&pcpu_chunk_lists[i]);
272462306a36Sopenharmony_ci
272562306a36Sopenharmony_ci	/*
272662306a36Sopenharmony_ci	 * The end of the static region needs to be aligned with the
272762306a36Sopenharmony_ci	 * minimum allocation size as this offsets the reserved and
272862306a36Sopenharmony_ci	 * dynamic region.  The first chunk ends page aligned by
272962306a36Sopenharmony_ci	 * expanding the dynamic region, therefore the dynamic region
273062306a36Sopenharmony_ci	 * can be shrunk to compensate while still staying above the
273162306a36Sopenharmony_ci	 * configured sizes.
273262306a36Sopenharmony_ci	 */
273362306a36Sopenharmony_ci	static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
273462306a36Sopenharmony_ci	dyn_size = ai->dyn_size - (static_size - ai->static_size);
273562306a36Sopenharmony_ci
273662306a36Sopenharmony_ci	/*
273762306a36Sopenharmony_ci	 * Initialize first chunk:
273862306a36Sopenharmony_ci	 * This chunk is broken up into 3 parts:
273962306a36Sopenharmony_ci	 *		< static | [reserved] | dynamic >
274062306a36Sopenharmony_ci	 * - static - there is no backing chunk because these allocations can
274162306a36Sopenharmony_ci	 *   never be freed.
274262306a36Sopenharmony_ci	 * - reserved (pcpu_reserved_chunk) - exists primarily to serve
274362306a36Sopenharmony_ci	 *   allocations from module load.
274462306a36Sopenharmony_ci	 * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first
274562306a36Sopenharmony_ci	 *   chunk.
274662306a36Sopenharmony_ci	 */
274762306a36Sopenharmony_ci	tmp_addr = (unsigned long)base_addr + static_size;
274862306a36Sopenharmony_ci	if (ai->reserved_size)
274962306a36Sopenharmony_ci		pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr,
275062306a36Sopenharmony_ci						ai->reserved_size);
275162306a36Sopenharmony_ci	tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size;
275262306a36Sopenharmony_ci	pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size);
275362306a36Sopenharmony_ci
275462306a36Sopenharmony_ci	pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
275562306a36Sopenharmony_ci	pcpu_chunk_relocate(pcpu_first_chunk, -1);
275662306a36Sopenharmony_ci
275762306a36Sopenharmony_ci	/* include all regions of the first chunk */
275862306a36Sopenharmony_ci	pcpu_nr_populated += PFN_DOWN(size_sum);
275962306a36Sopenharmony_ci
276062306a36Sopenharmony_ci	pcpu_stats_chunk_alloc();
276162306a36Sopenharmony_ci	trace_percpu_create_chunk(base_addr);
276262306a36Sopenharmony_ci
276362306a36Sopenharmony_ci	/* we're done */
276462306a36Sopenharmony_ci	pcpu_base_addr = base_addr;
276562306a36Sopenharmony_ci}
276662306a36Sopenharmony_ci
276762306a36Sopenharmony_ci#ifdef CONFIG_SMP
276862306a36Sopenharmony_ci
276962306a36Sopenharmony_ciconst char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
277062306a36Sopenharmony_ci	[PCPU_FC_AUTO]	= "auto",
277162306a36Sopenharmony_ci	[PCPU_FC_EMBED]	= "embed",
277262306a36Sopenharmony_ci	[PCPU_FC_PAGE]	= "page",
277362306a36Sopenharmony_ci};
277462306a36Sopenharmony_ci
277562306a36Sopenharmony_cienum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
277662306a36Sopenharmony_ci
277762306a36Sopenharmony_cistatic int __init percpu_alloc_setup(char *str)
277862306a36Sopenharmony_ci{
277962306a36Sopenharmony_ci	if (!str)
278062306a36Sopenharmony_ci		return -EINVAL;
278162306a36Sopenharmony_ci
278262306a36Sopenharmony_ci	if (0)
278362306a36Sopenharmony_ci		/* nada */;
278462306a36Sopenharmony_ci#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
278562306a36Sopenharmony_ci	else if (!strcmp(str, "embed"))
278662306a36Sopenharmony_ci		pcpu_chosen_fc = PCPU_FC_EMBED;
278762306a36Sopenharmony_ci#endif
278862306a36Sopenharmony_ci#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
278962306a36Sopenharmony_ci	else if (!strcmp(str, "page"))
279062306a36Sopenharmony_ci		pcpu_chosen_fc = PCPU_FC_PAGE;
279162306a36Sopenharmony_ci#endif
279262306a36Sopenharmony_ci	else
279362306a36Sopenharmony_ci		pr_warn("unknown allocator %s specified\n", str);
279462306a36Sopenharmony_ci
279562306a36Sopenharmony_ci	return 0;
279662306a36Sopenharmony_ci}
279762306a36Sopenharmony_ciearly_param("percpu_alloc", percpu_alloc_setup);
279862306a36Sopenharmony_ci
/*
 * pcpu_embed_first_chunk() is called by the generic setup_per_cpu_areas(),
 * which is only built when the arch does not provide its own
 * (!CONFIG_HAVE_SETUP_PER_CPU_AREA).  Build the embed helper if the arch
 * config asks for it or the generic setup is going to be used.
 */
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
#define BUILD_EMBED_FIRST_CHUNK
#endif

/* build pcpu_page_first_chunk() if and only if the arch config needs it */
#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
#define BUILD_PAGE_FIRST_CHUNK
#endif
281362306a36Sopenharmony_ci
281462306a36Sopenharmony_ci/* pcpu_build_alloc_info() is used by both embed and page first chunk */
281562306a36Sopenharmony_ci#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
281662306a36Sopenharmony_ci/**
281762306a36Sopenharmony_ci * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
281862306a36Sopenharmony_ci * @reserved_size: the size of reserved percpu area in bytes
281962306a36Sopenharmony_ci * @dyn_size: minimum free size for dynamic allocation in bytes
282062306a36Sopenharmony_ci * @atom_size: allocation atom size
282162306a36Sopenharmony_ci * @cpu_distance_fn: callback to determine distance between cpus, optional
282262306a36Sopenharmony_ci *
282362306a36Sopenharmony_ci * This function determines grouping of units, their mappings to cpus
282462306a36Sopenharmony_ci * and other parameters considering needed percpu size, allocation
282562306a36Sopenharmony_ci * atom size and distances between CPUs.
282662306a36Sopenharmony_ci *
282762306a36Sopenharmony_ci * Groups are always multiples of atom size and CPUs which are of
282862306a36Sopenharmony_ci * LOCAL_DISTANCE both ways are grouped together and share space for
282962306a36Sopenharmony_ci * units in the same group.  The returned configuration is guaranteed
283062306a36Sopenharmony_ci * to have CPUs on different nodes on different groups and >=75% usage
283162306a36Sopenharmony_ci * of allocated virtual address space.
283262306a36Sopenharmony_ci *
283362306a36Sopenharmony_ci * RETURNS:
283462306a36Sopenharmony_ci * On success, pointer to the new allocation_info is returned.  On
283562306a36Sopenharmony_ci * failure, ERR_PTR value is returned.
283662306a36Sopenharmony_ci */
283762306a36Sopenharmony_cistatic struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
283862306a36Sopenharmony_ci				size_t reserved_size, size_t dyn_size,
283962306a36Sopenharmony_ci				size_t atom_size,
284062306a36Sopenharmony_ci				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
284162306a36Sopenharmony_ci{
284262306a36Sopenharmony_ci	static int group_map[NR_CPUS] __initdata;
284362306a36Sopenharmony_ci	static int group_cnt[NR_CPUS] __initdata;
284462306a36Sopenharmony_ci	static struct cpumask mask __initdata;
284562306a36Sopenharmony_ci	const size_t static_size = __per_cpu_end - __per_cpu_start;
284662306a36Sopenharmony_ci	int nr_groups = 1, nr_units = 0;
284762306a36Sopenharmony_ci	size_t size_sum, min_unit_size, alloc_size;
284862306a36Sopenharmony_ci	int upa, max_upa, best_upa;	/* units_per_alloc */
284962306a36Sopenharmony_ci	int last_allocs, group, unit;
285062306a36Sopenharmony_ci	unsigned int cpu, tcpu;
285162306a36Sopenharmony_ci	struct pcpu_alloc_info *ai;
285262306a36Sopenharmony_ci	unsigned int *cpu_map;
285362306a36Sopenharmony_ci
285462306a36Sopenharmony_ci	/* this function may be called multiple times */
285562306a36Sopenharmony_ci	memset(group_map, 0, sizeof(group_map));
285662306a36Sopenharmony_ci	memset(group_cnt, 0, sizeof(group_cnt));
285762306a36Sopenharmony_ci	cpumask_clear(&mask);
285862306a36Sopenharmony_ci
285962306a36Sopenharmony_ci	/* calculate size_sum and ensure dyn_size is enough for early alloc */
286062306a36Sopenharmony_ci	size_sum = PFN_ALIGN(static_size + reserved_size +
286162306a36Sopenharmony_ci			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
286262306a36Sopenharmony_ci	dyn_size = size_sum - static_size - reserved_size;
286362306a36Sopenharmony_ci
286462306a36Sopenharmony_ci	/*
286562306a36Sopenharmony_ci	 * Determine min_unit_size, alloc_size and max_upa such that
286662306a36Sopenharmony_ci	 * alloc_size is multiple of atom_size and is the smallest
286762306a36Sopenharmony_ci	 * which can accommodate 4k aligned segments which are equal to
286862306a36Sopenharmony_ci	 * or larger than min_unit_size.
286962306a36Sopenharmony_ci	 */
287062306a36Sopenharmony_ci	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
287162306a36Sopenharmony_ci
287262306a36Sopenharmony_ci	/* determine the maximum # of units that can fit in an allocation */
287362306a36Sopenharmony_ci	alloc_size = roundup(min_unit_size, atom_size);
287462306a36Sopenharmony_ci	upa = alloc_size / min_unit_size;
287562306a36Sopenharmony_ci	while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
287662306a36Sopenharmony_ci		upa--;
287762306a36Sopenharmony_ci	max_upa = upa;
287862306a36Sopenharmony_ci
287962306a36Sopenharmony_ci	cpumask_copy(&mask, cpu_possible_mask);
288062306a36Sopenharmony_ci
288162306a36Sopenharmony_ci	/* group cpus according to their proximity */
288262306a36Sopenharmony_ci	for (group = 0; !cpumask_empty(&mask); group++) {
288362306a36Sopenharmony_ci		/* pop the group's first cpu */
288462306a36Sopenharmony_ci		cpu = cpumask_first(&mask);
288562306a36Sopenharmony_ci		group_map[cpu] = group;
288662306a36Sopenharmony_ci		group_cnt[group]++;
288762306a36Sopenharmony_ci		cpumask_clear_cpu(cpu, &mask);
288862306a36Sopenharmony_ci
288962306a36Sopenharmony_ci		for_each_cpu(tcpu, &mask) {
289062306a36Sopenharmony_ci			if (!cpu_distance_fn ||
289162306a36Sopenharmony_ci			    (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE &&
289262306a36Sopenharmony_ci			     cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) {
289362306a36Sopenharmony_ci				group_map[tcpu] = group;
289462306a36Sopenharmony_ci				group_cnt[group]++;
289562306a36Sopenharmony_ci				cpumask_clear_cpu(tcpu, &mask);
289662306a36Sopenharmony_ci			}
289762306a36Sopenharmony_ci		}
289862306a36Sopenharmony_ci	}
289962306a36Sopenharmony_ci	nr_groups = group;
290062306a36Sopenharmony_ci
290162306a36Sopenharmony_ci	/*
290262306a36Sopenharmony_ci	 * Wasted space is caused by a ratio imbalance of upa to group_cnt.
290362306a36Sopenharmony_ci	 * Expand the unit_size until we use >= 75% of the units allocated.
290462306a36Sopenharmony_ci	 * Related to atom_size, which could be much larger than the unit_size.
290562306a36Sopenharmony_ci	 */
290662306a36Sopenharmony_ci	last_allocs = INT_MAX;
290762306a36Sopenharmony_ci	best_upa = 0;
290862306a36Sopenharmony_ci	for (upa = max_upa; upa; upa--) {
290962306a36Sopenharmony_ci		int allocs = 0, wasted = 0;
291062306a36Sopenharmony_ci
291162306a36Sopenharmony_ci		if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
291262306a36Sopenharmony_ci			continue;
291362306a36Sopenharmony_ci
291462306a36Sopenharmony_ci		for (group = 0; group < nr_groups; group++) {
291562306a36Sopenharmony_ci			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
291662306a36Sopenharmony_ci			allocs += this_allocs;
291762306a36Sopenharmony_ci			wasted += this_allocs * upa - group_cnt[group];
291862306a36Sopenharmony_ci		}
291962306a36Sopenharmony_ci
292062306a36Sopenharmony_ci		/*
292162306a36Sopenharmony_ci		 * Don't accept if wastage is over 1/3.  The
292262306a36Sopenharmony_ci		 * greater-than comparison ensures upa==1 always
292362306a36Sopenharmony_ci		 * passes the following check.
292462306a36Sopenharmony_ci		 */
292562306a36Sopenharmony_ci		if (wasted > num_possible_cpus() / 3)
292662306a36Sopenharmony_ci			continue;
292762306a36Sopenharmony_ci
292862306a36Sopenharmony_ci		/* and then don't consume more memory */
292962306a36Sopenharmony_ci		if (allocs > last_allocs)
293062306a36Sopenharmony_ci			break;
293162306a36Sopenharmony_ci		last_allocs = allocs;
293262306a36Sopenharmony_ci		best_upa = upa;
293362306a36Sopenharmony_ci	}
293462306a36Sopenharmony_ci	BUG_ON(!best_upa);
293562306a36Sopenharmony_ci	upa = best_upa;
293662306a36Sopenharmony_ci
293762306a36Sopenharmony_ci	/* allocate and fill alloc_info */
293862306a36Sopenharmony_ci	for (group = 0; group < nr_groups; group++)
293962306a36Sopenharmony_ci		nr_units += roundup(group_cnt[group], upa);
294062306a36Sopenharmony_ci
294162306a36Sopenharmony_ci	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
294262306a36Sopenharmony_ci	if (!ai)
294362306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
294462306a36Sopenharmony_ci	cpu_map = ai->groups[0].cpu_map;
294562306a36Sopenharmony_ci
294662306a36Sopenharmony_ci	for (group = 0; group < nr_groups; group++) {
294762306a36Sopenharmony_ci		ai->groups[group].cpu_map = cpu_map;
294862306a36Sopenharmony_ci		cpu_map += roundup(group_cnt[group], upa);
294962306a36Sopenharmony_ci	}
295062306a36Sopenharmony_ci
295162306a36Sopenharmony_ci	ai->static_size = static_size;
295262306a36Sopenharmony_ci	ai->reserved_size = reserved_size;
295362306a36Sopenharmony_ci	ai->dyn_size = dyn_size;
295462306a36Sopenharmony_ci	ai->unit_size = alloc_size / upa;
295562306a36Sopenharmony_ci	ai->atom_size = atom_size;
295662306a36Sopenharmony_ci	ai->alloc_size = alloc_size;
295762306a36Sopenharmony_ci
295862306a36Sopenharmony_ci	for (group = 0, unit = 0; group < nr_groups; group++) {
295962306a36Sopenharmony_ci		struct pcpu_group_info *gi = &ai->groups[group];
296062306a36Sopenharmony_ci
296162306a36Sopenharmony_ci		/*
296262306a36Sopenharmony_ci		 * Initialize base_offset as if all groups are located
296362306a36Sopenharmony_ci		 * back-to-back.  The caller should update this to
296462306a36Sopenharmony_ci		 * reflect actual allocation.
296562306a36Sopenharmony_ci		 */
296662306a36Sopenharmony_ci		gi->base_offset = unit * ai->unit_size;
296762306a36Sopenharmony_ci
296862306a36Sopenharmony_ci		for_each_possible_cpu(cpu)
296962306a36Sopenharmony_ci			if (group_map[cpu] == group)
297062306a36Sopenharmony_ci				gi->cpu_map[gi->nr_units++] = cpu;
297162306a36Sopenharmony_ci		gi->nr_units = roundup(gi->nr_units, upa);
297262306a36Sopenharmony_ci		unit += gi->nr_units;
297362306a36Sopenharmony_ci	}
297462306a36Sopenharmony_ci	BUG_ON(unit != nr_units);
297562306a36Sopenharmony_ci
297662306a36Sopenharmony_ci	return ai;
297762306a36Sopenharmony_ci}
297862306a36Sopenharmony_ci
297962306a36Sopenharmony_cistatic void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align,
298062306a36Sopenharmony_ci				   pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
298162306a36Sopenharmony_ci{
298262306a36Sopenharmony_ci	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
298362306a36Sopenharmony_ci#ifdef CONFIG_NUMA
298462306a36Sopenharmony_ci	int node = NUMA_NO_NODE;
298562306a36Sopenharmony_ci	void *ptr;
298662306a36Sopenharmony_ci
298762306a36Sopenharmony_ci	if (cpu_to_nd_fn)
298862306a36Sopenharmony_ci		node = cpu_to_nd_fn(cpu);
298962306a36Sopenharmony_ci
299062306a36Sopenharmony_ci	if (node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) {
299162306a36Sopenharmony_ci		ptr = memblock_alloc_from(size, align, goal);
299262306a36Sopenharmony_ci		pr_info("cpu %d has no node %d or node-local memory\n",
299362306a36Sopenharmony_ci			cpu, node);
299462306a36Sopenharmony_ci		pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n",
299562306a36Sopenharmony_ci			 cpu, size, (u64)__pa(ptr));
299662306a36Sopenharmony_ci	} else {
299762306a36Sopenharmony_ci		ptr = memblock_alloc_try_nid(size, align, goal,
299862306a36Sopenharmony_ci					     MEMBLOCK_ALLOC_ACCESSIBLE,
299962306a36Sopenharmony_ci					     node);
300062306a36Sopenharmony_ci
300162306a36Sopenharmony_ci		pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n",
300262306a36Sopenharmony_ci			 cpu, size, node, (u64)__pa(ptr));
300362306a36Sopenharmony_ci	}
300462306a36Sopenharmony_ci	return ptr;
300562306a36Sopenharmony_ci#else
300662306a36Sopenharmony_ci	return memblock_alloc_from(size, align, goal);
300762306a36Sopenharmony_ci#endif
300862306a36Sopenharmony_ci}
300962306a36Sopenharmony_ci
/**
 * pcpu_fc_free - give first chunk boot memory back to memblock
 * @ptr: start of the region to release
 * @size: size of the region in bytes
 *
 * Counterpart of pcpu_fc_alloc().  Used to return unused units, the
 * unused tail of a unit and, on failure paths, whole allocations.
 */
static void __init pcpu_fc_free(void *ptr, size_t size)
{
	memblock_free(ptr, size);
}
301462306a36Sopenharmony_ci#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
301562306a36Sopenharmony_ci
301662306a36Sopenharmony_ci#if defined(BUILD_EMBED_FIRST_CHUNK)
301762306a36Sopenharmony_ci/**
301862306a36Sopenharmony_ci * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
301962306a36Sopenharmony_ci * @reserved_size: the size of reserved percpu area in bytes
302062306a36Sopenharmony_ci * @dyn_size: minimum free size for dynamic allocation in bytes
302162306a36Sopenharmony_ci * @atom_size: allocation atom size
302262306a36Sopenharmony_ci * @cpu_distance_fn: callback to determine distance between cpus, optional
302362306a36Sopenharmony_ci * @cpu_to_nd_fn: callback to convert cpu to it's node, optional
302462306a36Sopenharmony_ci *
302562306a36Sopenharmony_ci * This is a helper to ease setting up embedded first percpu chunk and
302662306a36Sopenharmony_ci * can be called where pcpu_setup_first_chunk() is expected.
302762306a36Sopenharmony_ci *
302862306a36Sopenharmony_ci * If this function is used to setup the first chunk, it is allocated
302962306a36Sopenharmony_ci * by calling pcpu_fc_alloc and used as-is without being mapped into
303062306a36Sopenharmony_ci * vmalloc area.  Allocations are always whole multiples of @atom_size
303162306a36Sopenharmony_ci * aligned to @atom_size.
303262306a36Sopenharmony_ci *
303362306a36Sopenharmony_ci * This enables the first chunk to piggy back on the linear physical
303462306a36Sopenharmony_ci * mapping which often uses larger page size.  Please note that this
303562306a36Sopenharmony_ci * can result in very sparse cpu->unit mapping on NUMA machines thus
303662306a36Sopenharmony_ci * requiring large vmalloc address space.  Don't use this allocator if
303762306a36Sopenharmony_ci * vmalloc space is not orders of magnitude larger than distances
303862306a36Sopenharmony_ci * between node memory addresses (ie. 32bit NUMA machines).
303962306a36Sopenharmony_ci *
304062306a36Sopenharmony_ci * @dyn_size specifies the minimum dynamic area size.
304162306a36Sopenharmony_ci *
304262306a36Sopenharmony_ci * If the needed size is smaller than the minimum or specified unit
304362306a36Sopenharmony_ci * size, the leftover is returned using pcpu_fc_free.
304462306a36Sopenharmony_ci *
304562306a36Sopenharmony_ci * RETURNS:
304662306a36Sopenharmony_ci * 0 on success, -errno on failure.
304762306a36Sopenharmony_ci */
304862306a36Sopenharmony_ciint __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
304962306a36Sopenharmony_ci				  size_t atom_size,
305062306a36Sopenharmony_ci				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
305162306a36Sopenharmony_ci				  pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
305262306a36Sopenharmony_ci{
305362306a36Sopenharmony_ci	void *base = (void *)ULONG_MAX;
305462306a36Sopenharmony_ci	void **areas = NULL;
305562306a36Sopenharmony_ci	struct pcpu_alloc_info *ai;
305662306a36Sopenharmony_ci	size_t size_sum, areas_size;
305762306a36Sopenharmony_ci	unsigned long max_distance;
305862306a36Sopenharmony_ci	int group, i, highest_group, rc = 0;
305962306a36Sopenharmony_ci
306062306a36Sopenharmony_ci	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
306162306a36Sopenharmony_ci				   cpu_distance_fn);
306262306a36Sopenharmony_ci	if (IS_ERR(ai))
306362306a36Sopenharmony_ci		return PTR_ERR(ai);
306462306a36Sopenharmony_ci
306562306a36Sopenharmony_ci	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
306662306a36Sopenharmony_ci	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
306762306a36Sopenharmony_ci
306862306a36Sopenharmony_ci	areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
306962306a36Sopenharmony_ci	if (!areas) {
307062306a36Sopenharmony_ci		rc = -ENOMEM;
307162306a36Sopenharmony_ci		goto out_free;
307262306a36Sopenharmony_ci	}
307362306a36Sopenharmony_ci
307462306a36Sopenharmony_ci	/* allocate, copy and determine base address & max_distance */
307562306a36Sopenharmony_ci	highest_group = 0;
307662306a36Sopenharmony_ci	for (group = 0; group < ai->nr_groups; group++) {
307762306a36Sopenharmony_ci		struct pcpu_group_info *gi = &ai->groups[group];
307862306a36Sopenharmony_ci		unsigned int cpu = NR_CPUS;
307962306a36Sopenharmony_ci		void *ptr;
308062306a36Sopenharmony_ci
308162306a36Sopenharmony_ci		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
308262306a36Sopenharmony_ci			cpu = gi->cpu_map[i];
308362306a36Sopenharmony_ci		BUG_ON(cpu == NR_CPUS);
308462306a36Sopenharmony_ci
308562306a36Sopenharmony_ci		/* allocate space for the whole group */
308662306a36Sopenharmony_ci		ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn);
308762306a36Sopenharmony_ci		if (!ptr) {
308862306a36Sopenharmony_ci			rc = -ENOMEM;
308962306a36Sopenharmony_ci			goto out_free_areas;
309062306a36Sopenharmony_ci		}
309162306a36Sopenharmony_ci		/* kmemleak tracks the percpu allocations separately */
309262306a36Sopenharmony_ci		kmemleak_ignore_phys(__pa(ptr));
309362306a36Sopenharmony_ci		areas[group] = ptr;
309462306a36Sopenharmony_ci
309562306a36Sopenharmony_ci		base = min(ptr, base);
309662306a36Sopenharmony_ci		if (ptr > areas[highest_group])
309762306a36Sopenharmony_ci			highest_group = group;
309862306a36Sopenharmony_ci	}
309962306a36Sopenharmony_ci	max_distance = areas[highest_group] - base;
310062306a36Sopenharmony_ci	max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
310162306a36Sopenharmony_ci
310262306a36Sopenharmony_ci	/* warn if maximum distance is further than 75% of vmalloc space */
310362306a36Sopenharmony_ci	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
310462306a36Sopenharmony_ci		pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
310562306a36Sopenharmony_ci				max_distance, VMALLOC_TOTAL);
310662306a36Sopenharmony_ci#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
310762306a36Sopenharmony_ci		/* and fail if we have fallback */
310862306a36Sopenharmony_ci		rc = -EINVAL;
310962306a36Sopenharmony_ci		goto out_free_areas;
311062306a36Sopenharmony_ci#endif
311162306a36Sopenharmony_ci	}
311262306a36Sopenharmony_ci
311362306a36Sopenharmony_ci	/*
311462306a36Sopenharmony_ci	 * Copy data and free unused parts.  This should happen after all
311562306a36Sopenharmony_ci	 * allocations are complete; otherwise, we may end up with
311662306a36Sopenharmony_ci	 * overlapping groups.
311762306a36Sopenharmony_ci	 */
311862306a36Sopenharmony_ci	for (group = 0; group < ai->nr_groups; group++) {
311962306a36Sopenharmony_ci		struct pcpu_group_info *gi = &ai->groups[group];
312062306a36Sopenharmony_ci		void *ptr = areas[group];
312162306a36Sopenharmony_ci
312262306a36Sopenharmony_ci		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
312362306a36Sopenharmony_ci			if (gi->cpu_map[i] == NR_CPUS) {
312462306a36Sopenharmony_ci				/* unused unit, free whole */
312562306a36Sopenharmony_ci				pcpu_fc_free(ptr, ai->unit_size);
312662306a36Sopenharmony_ci				continue;
312762306a36Sopenharmony_ci			}
312862306a36Sopenharmony_ci			/* copy and return the unused part */
312962306a36Sopenharmony_ci			memcpy(ptr, __per_cpu_load, ai->static_size);
313062306a36Sopenharmony_ci			pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
313162306a36Sopenharmony_ci		}
313262306a36Sopenharmony_ci	}
313362306a36Sopenharmony_ci
313462306a36Sopenharmony_ci	/* base address is now known, determine group base offsets */
313562306a36Sopenharmony_ci	for (group = 0; group < ai->nr_groups; group++) {
313662306a36Sopenharmony_ci		ai->groups[group].base_offset = areas[group] - base;
313762306a36Sopenharmony_ci	}
313862306a36Sopenharmony_ci
313962306a36Sopenharmony_ci	pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
314062306a36Sopenharmony_ci		PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
314162306a36Sopenharmony_ci		ai->dyn_size, ai->unit_size);
314262306a36Sopenharmony_ci
314362306a36Sopenharmony_ci	pcpu_setup_first_chunk(ai, base);
314462306a36Sopenharmony_ci	goto out_free;
314562306a36Sopenharmony_ci
314662306a36Sopenharmony_ciout_free_areas:
314762306a36Sopenharmony_ci	for (group = 0; group < ai->nr_groups; group++)
314862306a36Sopenharmony_ci		if (areas[group])
314962306a36Sopenharmony_ci			pcpu_fc_free(areas[group],
315062306a36Sopenharmony_ci				ai->groups[group].nr_units * ai->unit_size);
315162306a36Sopenharmony_ciout_free:
315262306a36Sopenharmony_ci	pcpu_free_alloc_info(ai);
315362306a36Sopenharmony_ci	if (areas)
315462306a36Sopenharmony_ci		memblock_free(areas, areas_size);
315562306a36Sopenharmony_ci	return rc;
315662306a36Sopenharmony_ci}
315762306a36Sopenharmony_ci#endif /* BUILD_EMBED_FIRST_CHUNK */
315862306a36Sopenharmony_ci
315962306a36Sopenharmony_ci#ifdef BUILD_PAGE_FIRST_CHUNK
316062306a36Sopenharmony_ci#include <asm/pgalloc.h>
316162306a36Sopenharmony_ci
/*
 * Fall back to PAGE_SIZE for any page table size that has not been
 * defined by this point.  pcpu_populate_pte() below uses these values
 * as both the size and the alignment of the tables it allocates.
 */
#ifndef P4D_TABLE_SIZE
#define P4D_TABLE_SIZE PAGE_SIZE
#endif

#ifndef PUD_TABLE_SIZE
#define PUD_TABLE_SIZE PAGE_SIZE
#endif

#ifndef PMD_TABLE_SIZE
#define PMD_TABLE_SIZE PAGE_SIZE
#endif

#ifndef PTE_TABLE_SIZE
#define PTE_TABLE_SIZE PAGE_SIZE
#endif
317762306a36Sopenharmony_civoid __init __weak pcpu_populate_pte(unsigned long addr)
317862306a36Sopenharmony_ci{
317962306a36Sopenharmony_ci	pgd_t *pgd = pgd_offset_k(addr);
318062306a36Sopenharmony_ci	p4d_t *p4d;
318162306a36Sopenharmony_ci	pud_t *pud;
318262306a36Sopenharmony_ci	pmd_t *pmd;
318362306a36Sopenharmony_ci
318462306a36Sopenharmony_ci	if (pgd_none(*pgd)) {
318562306a36Sopenharmony_ci		p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
318662306a36Sopenharmony_ci		if (!p4d)
318762306a36Sopenharmony_ci			goto err_alloc;
318862306a36Sopenharmony_ci		pgd_populate(&init_mm, pgd, p4d);
318962306a36Sopenharmony_ci	}
319062306a36Sopenharmony_ci
319162306a36Sopenharmony_ci	p4d = p4d_offset(pgd, addr);
319262306a36Sopenharmony_ci	if (p4d_none(*p4d)) {
319362306a36Sopenharmony_ci		pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
319462306a36Sopenharmony_ci		if (!pud)
319562306a36Sopenharmony_ci			goto err_alloc;
319662306a36Sopenharmony_ci		p4d_populate(&init_mm, p4d, pud);
319762306a36Sopenharmony_ci	}
319862306a36Sopenharmony_ci
319962306a36Sopenharmony_ci	pud = pud_offset(p4d, addr);
320062306a36Sopenharmony_ci	if (pud_none(*pud)) {
320162306a36Sopenharmony_ci		pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
320262306a36Sopenharmony_ci		if (!pmd)
320362306a36Sopenharmony_ci			goto err_alloc;
320462306a36Sopenharmony_ci		pud_populate(&init_mm, pud, pmd);
320562306a36Sopenharmony_ci	}
320662306a36Sopenharmony_ci
320762306a36Sopenharmony_ci	pmd = pmd_offset(pud, addr);
320862306a36Sopenharmony_ci	if (!pmd_present(*pmd)) {
320962306a36Sopenharmony_ci		pte_t *new;
321062306a36Sopenharmony_ci
321162306a36Sopenharmony_ci		new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
321262306a36Sopenharmony_ci		if (!new)
321362306a36Sopenharmony_ci			goto err_alloc;
321462306a36Sopenharmony_ci		pmd_populate_kernel(&init_mm, pmd, new);
321562306a36Sopenharmony_ci	}
321662306a36Sopenharmony_ci
321762306a36Sopenharmony_ci	return;
321862306a36Sopenharmony_ci
321962306a36Sopenharmony_cierr_alloc:
322062306a36Sopenharmony_ci	panic("%s: Failed to allocate memory\n", __func__);
322162306a36Sopenharmony_ci}
322262306a36Sopenharmony_ci
322362306a36Sopenharmony_ci/**
322462306a36Sopenharmony_ci * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
322562306a36Sopenharmony_ci * @reserved_size: the size of reserved percpu area in bytes
322662306a36Sopenharmony_ci * @cpu_to_nd_fn: callback to convert cpu to it's node, optional
322762306a36Sopenharmony_ci *
322862306a36Sopenharmony_ci * This is a helper to ease setting up page-remapped first percpu
322962306a36Sopenharmony_ci * chunk and can be called where pcpu_setup_first_chunk() is expected.
323062306a36Sopenharmony_ci *
323162306a36Sopenharmony_ci * This is the basic allocator.  Static percpu area is allocated
323262306a36Sopenharmony_ci * page-by-page into vmalloc area.
323362306a36Sopenharmony_ci *
323462306a36Sopenharmony_ci * RETURNS:
323562306a36Sopenharmony_ci * 0 on success, -errno on failure.
323662306a36Sopenharmony_ci */
323762306a36Sopenharmony_ciint __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
323862306a36Sopenharmony_ci{
323962306a36Sopenharmony_ci	static struct vm_struct vm;
324062306a36Sopenharmony_ci	struct pcpu_alloc_info *ai;
324162306a36Sopenharmony_ci	char psize_str[16];
324262306a36Sopenharmony_ci	int unit_pages;
324362306a36Sopenharmony_ci	size_t pages_size;
324462306a36Sopenharmony_ci	struct page **pages;
324562306a36Sopenharmony_ci	int unit, i, j, rc = 0;
324662306a36Sopenharmony_ci	int upa;
324762306a36Sopenharmony_ci	int nr_g0_units;
324862306a36Sopenharmony_ci
324962306a36Sopenharmony_ci	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
325062306a36Sopenharmony_ci
325162306a36Sopenharmony_ci	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
325262306a36Sopenharmony_ci	if (IS_ERR(ai))
325362306a36Sopenharmony_ci		return PTR_ERR(ai);
325462306a36Sopenharmony_ci	BUG_ON(ai->nr_groups != 1);
325562306a36Sopenharmony_ci	upa = ai->alloc_size/ai->unit_size;
325662306a36Sopenharmony_ci	nr_g0_units = roundup(num_possible_cpus(), upa);
325762306a36Sopenharmony_ci	if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
325862306a36Sopenharmony_ci		pcpu_free_alloc_info(ai);
325962306a36Sopenharmony_ci		return -EINVAL;
326062306a36Sopenharmony_ci	}
326162306a36Sopenharmony_ci
326262306a36Sopenharmony_ci	unit_pages = ai->unit_size >> PAGE_SHIFT;
326362306a36Sopenharmony_ci
326462306a36Sopenharmony_ci	/* unaligned allocations can't be freed, round up to page size */
326562306a36Sopenharmony_ci	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
326662306a36Sopenharmony_ci			       sizeof(pages[0]));
326762306a36Sopenharmony_ci	pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
326862306a36Sopenharmony_ci	if (!pages)
326962306a36Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
327062306a36Sopenharmony_ci		      pages_size);
327162306a36Sopenharmony_ci
327262306a36Sopenharmony_ci	/* allocate pages */
327362306a36Sopenharmony_ci	j = 0;
327462306a36Sopenharmony_ci	for (unit = 0; unit < num_possible_cpus(); unit++) {
327562306a36Sopenharmony_ci		unsigned int cpu = ai->groups[0].cpu_map[unit];
327662306a36Sopenharmony_ci		for (i = 0; i < unit_pages; i++) {
327762306a36Sopenharmony_ci			void *ptr;
327862306a36Sopenharmony_ci
327962306a36Sopenharmony_ci			ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn);
328062306a36Sopenharmony_ci			if (!ptr) {
328162306a36Sopenharmony_ci				pr_warn("failed to allocate %s page for cpu%u\n",
328262306a36Sopenharmony_ci						psize_str, cpu);
328362306a36Sopenharmony_ci				goto enomem;
328462306a36Sopenharmony_ci			}
328562306a36Sopenharmony_ci			/* kmemleak tracks the percpu allocations separately */
328662306a36Sopenharmony_ci			kmemleak_ignore_phys(__pa(ptr));
328762306a36Sopenharmony_ci			pages[j++] = virt_to_page(ptr);
328862306a36Sopenharmony_ci		}
328962306a36Sopenharmony_ci	}
329062306a36Sopenharmony_ci
329162306a36Sopenharmony_ci	/* allocate vm area, map the pages and copy static data */
329262306a36Sopenharmony_ci	vm.flags = VM_ALLOC;
329362306a36Sopenharmony_ci	vm.size = num_possible_cpus() * ai->unit_size;
329462306a36Sopenharmony_ci	vm_area_register_early(&vm, PAGE_SIZE);
329562306a36Sopenharmony_ci
329662306a36Sopenharmony_ci	for (unit = 0; unit < num_possible_cpus(); unit++) {
329762306a36Sopenharmony_ci		unsigned long unit_addr =
329862306a36Sopenharmony_ci			(unsigned long)vm.addr + unit * ai->unit_size;
329962306a36Sopenharmony_ci
330062306a36Sopenharmony_ci		for (i = 0; i < unit_pages; i++)
330162306a36Sopenharmony_ci			pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT));
330262306a36Sopenharmony_ci
330362306a36Sopenharmony_ci		/* pte already populated, the following shouldn't fail */
330462306a36Sopenharmony_ci		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
330562306a36Sopenharmony_ci				      unit_pages);
330662306a36Sopenharmony_ci		if (rc < 0)
330762306a36Sopenharmony_ci			panic("failed to map percpu area, err=%d\n", rc);
330862306a36Sopenharmony_ci
330962306a36Sopenharmony_ci		flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size);
331062306a36Sopenharmony_ci
331162306a36Sopenharmony_ci		/* copy static data */
331262306a36Sopenharmony_ci		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
331362306a36Sopenharmony_ci	}
331462306a36Sopenharmony_ci
331562306a36Sopenharmony_ci	/* we're ready, commit */
331662306a36Sopenharmony_ci	pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
331762306a36Sopenharmony_ci		unit_pages, psize_str, ai->static_size,
331862306a36Sopenharmony_ci		ai->reserved_size, ai->dyn_size);
331962306a36Sopenharmony_ci
332062306a36Sopenharmony_ci	pcpu_setup_first_chunk(ai, vm.addr);
332162306a36Sopenharmony_ci	goto out_free_ar;
332262306a36Sopenharmony_ci
332362306a36Sopenharmony_cienomem:
332462306a36Sopenharmony_ci	while (--j >= 0)
332562306a36Sopenharmony_ci		pcpu_fc_free(page_address(pages[j]), PAGE_SIZE);
332662306a36Sopenharmony_ci	rc = -ENOMEM;
332762306a36Sopenharmony_ciout_free_ar:
332862306a36Sopenharmony_ci	memblock_free(pages, pages_size);
332962306a36Sopenharmony_ci	pcpu_free_alloc_info(ai);
333062306a36Sopenharmony_ci	return rc;
333162306a36Sopenharmony_ci}
333262306a36Sopenharmony_ci#endif /* BUILD_PAGE_FIRST_CHUNK */
333362306a36Sopenharmony_ci
333462306a36Sopenharmony_ci#ifndef	CONFIG_HAVE_SETUP_PER_CPU_AREA
333562306a36Sopenharmony_ci/*
333662306a36Sopenharmony_ci * Generic SMP percpu area setup.
333762306a36Sopenharmony_ci *
333862306a36Sopenharmony_ci * The embedding helper is used because its behavior closely resembles
333962306a36Sopenharmony_ci * the original non-dynamic generic percpu area setup.  This is
334062306a36Sopenharmony_ci * important because many archs have addressing restrictions and might
334162306a36Sopenharmony_ci * fail if the percpu area is located far away from the previous
334262306a36Sopenharmony_ci * location.  As an added bonus, in non-NUMA cases, embedding is
334362306a36Sopenharmony_ci * generally a good idea TLB-wise because percpu area can piggy back
334462306a36Sopenharmony_ci * on the physical linear memory mapping which uses large page
334562306a36Sopenharmony_ci * mappings on applicable archs.
334662306a36Sopenharmony_ci */
/*
 * Per-cpu offset table of the generic setup.  setup_per_cpu_areas()
 * fills in, for every possible cpu, the distance from __per_cpu_start
 * to that cpu's unit in the first chunk.
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
334962306a36Sopenharmony_ci
335062306a36Sopenharmony_civoid __init setup_per_cpu_areas(void)
335162306a36Sopenharmony_ci{
335262306a36Sopenharmony_ci	unsigned long delta;
335362306a36Sopenharmony_ci	unsigned int cpu;
335462306a36Sopenharmony_ci	int rc;
335562306a36Sopenharmony_ci
335662306a36Sopenharmony_ci	/*
335762306a36Sopenharmony_ci	 * Always reserve area for module percpu variables.  That's
335862306a36Sopenharmony_ci	 * what the legacy allocator did.
335962306a36Sopenharmony_ci	 */
336062306a36Sopenharmony_ci	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE,
336162306a36Sopenharmony_ci				    PAGE_SIZE, NULL, NULL);
336262306a36Sopenharmony_ci	if (rc < 0)
336362306a36Sopenharmony_ci		panic("Failed to initialize percpu areas.");
336462306a36Sopenharmony_ci
336562306a36Sopenharmony_ci	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
336662306a36Sopenharmony_ci	for_each_possible_cpu(cpu)
336762306a36Sopenharmony_ci		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
336862306a36Sopenharmony_ci}
336962306a36Sopenharmony_ci#endif	/* CONFIG_HAVE_SETUP_PER_CPU_AREA */
337062306a36Sopenharmony_ci
337162306a36Sopenharmony_ci#else	/* CONFIG_SMP */
337262306a36Sopenharmony_ci
337362306a36Sopenharmony_ci/*
337462306a36Sopenharmony_ci * UP percpu area setup.
337562306a36Sopenharmony_ci *
337662306a36Sopenharmony_ci * UP always uses km-based percpu allocator with identity mapping.
337762306a36Sopenharmony_ci * Static percpu variables are indistinguishable from the usual static
337862306a36Sopenharmony_ci * variables and don't require any special preparation.
337962306a36Sopenharmony_ci */
338062306a36Sopenharmony_civoid __init setup_per_cpu_areas(void)
338162306a36Sopenharmony_ci{
338262306a36Sopenharmony_ci	const size_t unit_size =
338362306a36Sopenharmony_ci		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
338462306a36Sopenharmony_ci					 PERCPU_DYNAMIC_RESERVE));
338562306a36Sopenharmony_ci	struct pcpu_alloc_info *ai;
338662306a36Sopenharmony_ci	void *fc;
338762306a36Sopenharmony_ci
338862306a36Sopenharmony_ci	ai = pcpu_alloc_alloc_info(1, 1);
338962306a36Sopenharmony_ci	fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
339062306a36Sopenharmony_ci	if (!ai || !fc)
339162306a36Sopenharmony_ci		panic("Failed to allocate memory for percpu areas.");
339262306a36Sopenharmony_ci	/* kmemleak tracks the percpu allocations separately */
339362306a36Sopenharmony_ci	kmemleak_ignore_phys(__pa(fc));
339462306a36Sopenharmony_ci
339562306a36Sopenharmony_ci	ai->dyn_size = unit_size;
339662306a36Sopenharmony_ci	ai->unit_size = unit_size;
339762306a36Sopenharmony_ci	ai->atom_size = unit_size;
339862306a36Sopenharmony_ci	ai->alloc_size = unit_size;
339962306a36Sopenharmony_ci	ai->groups[0].nr_units = 1;
340062306a36Sopenharmony_ci	ai->groups[0].cpu_map[0] = 0;
340162306a36Sopenharmony_ci
340262306a36Sopenharmony_ci	pcpu_setup_first_chunk(ai, fc);
340362306a36Sopenharmony_ci	pcpu_free_alloc_info(ai);
340462306a36Sopenharmony_ci}
340562306a36Sopenharmony_ci
340662306a36Sopenharmony_ci#endif	/* CONFIG_SMP */
340762306a36Sopenharmony_ci
340862306a36Sopenharmony_ci/*
340962306a36Sopenharmony_ci * pcpu_nr_pages - calculate total number of populated backing pages
341062306a36Sopenharmony_ci *
341162306a36Sopenharmony_ci * This reflects the number of pages populated to back chunks.  Metadata is
341262306a36Sopenharmony_ci * excluded in the number exposed in meminfo as the number of backing pages
341362306a36Sopenharmony_ci * scales with the number of cpus and can quickly outweigh the memory used for
341462306a36Sopenharmony_ci * metadata.  It also keeps this calculation nice and simple.
341562306a36Sopenharmony_ci *
341662306a36Sopenharmony_ci * RETURNS:
341762306a36Sopenharmony_ci * Total number of populated backing pages in use by the allocator.
341862306a36Sopenharmony_ci */
341962306a36Sopenharmony_ciunsigned long pcpu_nr_pages(void)
342062306a36Sopenharmony_ci{
342162306a36Sopenharmony_ci	return pcpu_nr_populated * pcpu_nr_units;
342262306a36Sopenharmony_ci}
342362306a36Sopenharmony_ci
342462306a36Sopenharmony_ci/*
342562306a36Sopenharmony_ci * Percpu allocator is initialized early during boot when neither slab or
342662306a36Sopenharmony_ci * workqueue is available.  Plug async management until everything is up
342762306a36Sopenharmony_ci * and running.
342862306a36Sopenharmony_ci */
342962306a36Sopenharmony_cistatic int __init percpu_enable_async(void)
343062306a36Sopenharmony_ci{
343162306a36Sopenharmony_ci	pcpu_async_enabled = true;
343262306a36Sopenharmony_ci	return 0;
343362306a36Sopenharmony_ci}
343462306a36Sopenharmony_cisubsys_initcall(percpu_enable_async);
3435