18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * mm/percpu.c - percpu memory allocator
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci * Copyright (C) 2009		SUSE Linux Products GmbH
68c2ecf20Sopenharmony_ci * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
78c2ecf20Sopenharmony_ci *
88c2ecf20Sopenharmony_ci * Copyright (C) 2017		Facebook Inc.
98c2ecf20Sopenharmony_ci * Copyright (C) 2017		Dennis Zhou <dennis@kernel.org>
108c2ecf20Sopenharmony_ci *
118c2ecf20Sopenharmony_ci * The percpu allocator handles both static and dynamic areas.  Percpu
128c2ecf20Sopenharmony_ci * areas are allocated in chunks which are divided into units.  There is
138c2ecf20Sopenharmony_ci * a 1-to-1 mapping for units to possible cpus.  These units are grouped
148c2ecf20Sopenharmony_ci * based on NUMA properties of the machine.
158c2ecf20Sopenharmony_ci *
168c2ecf20Sopenharmony_ci *  c0                           c1                         c2
178c2ecf20Sopenharmony_ci *  -------------------          -------------------        ------------
188c2ecf20Sopenharmony_ci * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
198c2ecf20Sopenharmony_ci *  -------------------  ......  -------------------  ....  ------------
208c2ecf20Sopenharmony_ci *
218c2ecf20Sopenharmony_ci * Allocation is done by offsets into a unit's address space.  Ie., an
228c2ecf20Sopenharmony_ci * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
238c2ecf20Sopenharmony_ci * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
248c2ecf20Sopenharmony_ci * and even sparse.  Access is handled by configuring percpu base
258c2ecf20Sopenharmony_ci * registers according to the cpu to unit mappings and offsetting the
268c2ecf20Sopenharmony_ci * base address using pcpu_unit_size.
278c2ecf20Sopenharmony_ci *
288c2ecf20Sopenharmony_ci * There is special consideration for the first chunk which must handle
298c2ecf20Sopenharmony_ci * the static percpu variables in the kernel image as allocation services
308c2ecf20Sopenharmony_ci * are not online yet.  In short, the first chunk is structured like so:
318c2ecf20Sopenharmony_ci *
328c2ecf20Sopenharmony_ci *                  <Static | [Reserved] | Dynamic>
338c2ecf20Sopenharmony_ci *
348c2ecf20Sopenharmony_ci * The static data is copied from the original section managed by the
358c2ecf20Sopenharmony_ci * linker.  The reserved section, if non-zero, primarily manages static
368c2ecf20Sopenharmony_ci * percpu variables from kernel modules.  Finally, the dynamic section
378c2ecf20Sopenharmony_ci * takes care of normal allocations.
388c2ecf20Sopenharmony_ci *
398c2ecf20Sopenharmony_ci * The allocator organizes chunks into lists according to free size and
408c2ecf20Sopenharmony_ci * memcg-awareness.  To make a percpu allocation memcg-aware the __GFP_ACCOUNT
418c2ecf20Sopenharmony_ci * flag should be passed.  All memcg-aware allocations are sharing one set
428c2ecf20Sopenharmony_ci * of chunks and all unaccounted allocations and allocations performed
438c2ecf20Sopenharmony_ci * by processes belonging to the root memory cgroup are using the second set.
448c2ecf20Sopenharmony_ci *
458c2ecf20Sopenharmony_ci * The allocator tries to allocate from the fullest chunk first. Each chunk
468c2ecf20Sopenharmony_ci * is managed by a bitmap with metadata blocks.  The allocation map is updated
478c2ecf20Sopenharmony_ci * on every allocation and free to reflect the current state while the boundary
488c2ecf20Sopenharmony_ci * map is only updated on allocation.  Each metadata block contains
498c2ecf20Sopenharmony_ci * information to help mitigate the need to iterate over large portions
508c2ecf20Sopenharmony_ci * of the bitmap.  The reverse mapping from page to chunk is stored in
518c2ecf20Sopenharmony_ci * the page's index.  Lastly, units are lazily backed and grow in unison.
528c2ecf20Sopenharmony_ci *
538c2ecf20Sopenharmony_ci * There is a unique conversion that goes on here between bytes and bits.
548c2ecf20Sopenharmony_ci * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
558c2ecf20Sopenharmony_ci * tracks the number of pages it is responsible for in nr_pages.  Helper
568c2ecf20Sopenharmony_ci * functions are used to convert from between the bytes, bits, and blocks.
578c2ecf20Sopenharmony_ci * All hints are managed in bits unless explicitly stated.
588c2ecf20Sopenharmony_ci *
598c2ecf20Sopenharmony_ci * To use this allocator, arch code should do the following:
608c2ecf20Sopenharmony_ci *
618c2ecf20Sopenharmony_ci * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
628c2ecf20Sopenharmony_ci *   regular address to percpu pointer and back if they need to be
638c2ecf20Sopenharmony_ci *   different from the default
648c2ecf20Sopenharmony_ci *
658c2ecf20Sopenharmony_ci * - use pcpu_setup_first_chunk() during percpu area initialization to
668c2ecf20Sopenharmony_ci *   setup the first chunk containing the kernel static percpu area
678c2ecf20Sopenharmony_ci */
688c2ecf20Sopenharmony_ci
698c2ecf20Sopenharmony_ci#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
708c2ecf20Sopenharmony_ci
718c2ecf20Sopenharmony_ci#include <linux/bitmap.h>
728c2ecf20Sopenharmony_ci#include <linux/memblock.h>
738c2ecf20Sopenharmony_ci#include <linux/err.h>
748c2ecf20Sopenharmony_ci#include <linux/lcm.h>
758c2ecf20Sopenharmony_ci#include <linux/list.h>
768c2ecf20Sopenharmony_ci#include <linux/log2.h>
778c2ecf20Sopenharmony_ci#include <linux/mm.h>
788c2ecf20Sopenharmony_ci#include <linux/module.h>
798c2ecf20Sopenharmony_ci#include <linux/mutex.h>
808c2ecf20Sopenharmony_ci#include <linux/percpu.h>
818c2ecf20Sopenharmony_ci#include <linux/pfn.h>
828c2ecf20Sopenharmony_ci#include <linux/slab.h>
838c2ecf20Sopenharmony_ci#include <linux/spinlock.h>
848c2ecf20Sopenharmony_ci#include <linux/vmalloc.h>
858c2ecf20Sopenharmony_ci#include <linux/workqueue.h>
868c2ecf20Sopenharmony_ci#include <linux/kmemleak.h>
878c2ecf20Sopenharmony_ci#include <linux/sched.h>
888c2ecf20Sopenharmony_ci#include <linux/sched/mm.h>
898c2ecf20Sopenharmony_ci#include <linux/memcontrol.h>
908c2ecf20Sopenharmony_ci
918c2ecf20Sopenharmony_ci#include <asm/cacheflush.h>
928c2ecf20Sopenharmony_ci#include <asm/sections.h>
938c2ecf20Sopenharmony_ci#include <asm/tlbflush.h>
948c2ecf20Sopenharmony_ci#include <asm/io.h>
958c2ecf20Sopenharmony_ci
968c2ecf20Sopenharmony_ci#define CREATE_TRACE_POINTS
978c2ecf20Sopenharmony_ci#include <trace/events/percpu.h>
988c2ecf20Sopenharmony_ci
998c2ecf20Sopenharmony_ci#include "percpu-internal.h"
1008c2ecf20Sopenharmony_ci
1018c2ecf20Sopenharmony_ci/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
1028c2ecf20Sopenharmony_ci#define PCPU_SLOT_BASE_SHIFT		5
1038c2ecf20Sopenharmony_ci/* chunks in slots below this are subject to being sidelined on failed alloc */
1048c2ecf20Sopenharmony_ci#define PCPU_SLOT_FAIL_THRESHOLD	3
1058c2ecf20Sopenharmony_ci
1068c2ecf20Sopenharmony_ci#define PCPU_EMPTY_POP_PAGES_LOW	2
1078c2ecf20Sopenharmony_ci#define PCPU_EMPTY_POP_PAGES_HIGH	4
1088c2ecf20Sopenharmony_ci
1098c2ecf20Sopenharmony_ci#ifdef CONFIG_SMP
1108c2ecf20Sopenharmony_ci/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
1118c2ecf20Sopenharmony_ci#ifndef __addr_to_pcpu_ptr
1128c2ecf20Sopenharmony_ci#define __addr_to_pcpu_ptr(addr)					\
1138c2ecf20Sopenharmony_ci	(void __percpu *)((unsigned long)(addr) -			\
1148c2ecf20Sopenharmony_ci			  (unsigned long)pcpu_base_addr	+		\
1158c2ecf20Sopenharmony_ci			  (unsigned long)__per_cpu_start)
1168c2ecf20Sopenharmony_ci#endif
1178c2ecf20Sopenharmony_ci#ifndef __pcpu_ptr_to_addr
1188c2ecf20Sopenharmony_ci#define __pcpu_ptr_to_addr(ptr)						\
1198c2ecf20Sopenharmony_ci	(void __force *)((unsigned long)(ptr) +				\
1208c2ecf20Sopenharmony_ci			 (unsigned long)pcpu_base_addr -		\
1218c2ecf20Sopenharmony_ci			 (unsigned long)__per_cpu_start)
1228c2ecf20Sopenharmony_ci#endif
1238c2ecf20Sopenharmony_ci#else	/* CONFIG_SMP */
1248c2ecf20Sopenharmony_ci/* on UP, it's always identity mapped */
1258c2ecf20Sopenharmony_ci#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
1268c2ecf20Sopenharmony_ci#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
1278c2ecf20Sopenharmony_ci#endif	/* CONFIG_SMP */
1288c2ecf20Sopenharmony_ci
1298c2ecf20Sopenharmony_cistatic int pcpu_unit_pages __ro_after_init;
1308c2ecf20Sopenharmony_cistatic int pcpu_unit_size __ro_after_init;
1318c2ecf20Sopenharmony_cistatic int pcpu_nr_units __ro_after_init;
1328c2ecf20Sopenharmony_cistatic int pcpu_atom_size __ro_after_init;
1338c2ecf20Sopenharmony_ciint pcpu_nr_slots __ro_after_init;
1348c2ecf20Sopenharmony_cistatic size_t pcpu_chunk_struct_size __ro_after_init;
1358c2ecf20Sopenharmony_ci
1368c2ecf20Sopenharmony_ci/* cpus with the lowest and highest unit addresses */
1378c2ecf20Sopenharmony_cistatic unsigned int pcpu_low_unit_cpu __ro_after_init;
1388c2ecf20Sopenharmony_cistatic unsigned int pcpu_high_unit_cpu __ro_after_init;
1398c2ecf20Sopenharmony_ci
1408c2ecf20Sopenharmony_ci/* the address of the first chunk which starts with the kernel static area */
1418c2ecf20Sopenharmony_civoid *pcpu_base_addr __ro_after_init;
1428c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(pcpu_base_addr);
1438c2ecf20Sopenharmony_ci
1448c2ecf20Sopenharmony_cistatic const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
1458c2ecf20Sopenharmony_ciconst unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */
1468c2ecf20Sopenharmony_ci
1478c2ecf20Sopenharmony_ci/* group information, used for vm allocation */
1488c2ecf20Sopenharmony_cistatic int pcpu_nr_groups __ro_after_init;
1498c2ecf20Sopenharmony_cistatic const unsigned long *pcpu_group_offsets __ro_after_init;
1508c2ecf20Sopenharmony_cistatic const size_t *pcpu_group_sizes __ro_after_init;
1518c2ecf20Sopenharmony_ci
1528c2ecf20Sopenharmony_ci/*
1538c2ecf20Sopenharmony_ci * The first chunk which always exists.  Note that unlike other
1548c2ecf20Sopenharmony_ci * chunks, this one can be allocated and mapped in several different
1558c2ecf20Sopenharmony_ci * ways and thus often doesn't live in the vmalloc area.
1568c2ecf20Sopenharmony_ci */
1578c2ecf20Sopenharmony_cistruct pcpu_chunk *pcpu_first_chunk __ro_after_init;
1588c2ecf20Sopenharmony_ci
1598c2ecf20Sopenharmony_ci/*
1608c2ecf20Sopenharmony_ci * Optional reserved chunk.  This chunk reserves part of the first
1618c2ecf20Sopenharmony_ci * chunk and serves it for reserved allocations.  When the reserved
1628c2ecf20Sopenharmony_ci * region doesn't exist, the following variable is NULL.
1638c2ecf20Sopenharmony_ci */
1648c2ecf20Sopenharmony_cistruct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
1658c2ecf20Sopenharmony_ci
1668c2ecf20Sopenharmony_ciDEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
1678c2ecf20Sopenharmony_cistatic DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */
1688c2ecf20Sopenharmony_ci
1698c2ecf20Sopenharmony_cistruct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
1708c2ecf20Sopenharmony_ci
1718c2ecf20Sopenharmony_ci/* chunks which need their map areas extended, protected by pcpu_lock */
1728c2ecf20Sopenharmony_cistatic LIST_HEAD(pcpu_map_extend_chunks);
1738c2ecf20Sopenharmony_ci
1748c2ecf20Sopenharmony_ci/*
1758c2ecf20Sopenharmony_ci * The number of empty populated pages by chunk type, protected by pcpu_lock.
1768c2ecf20Sopenharmony_ci * The reserved chunk doesn't contribute to the count.
1778c2ecf20Sopenharmony_ci */
1788c2ecf20Sopenharmony_ciint pcpu_nr_empty_pop_pages[PCPU_NR_CHUNK_TYPES];
1798c2ecf20Sopenharmony_ci
1808c2ecf20Sopenharmony_ci/*
1818c2ecf20Sopenharmony_ci * The number of populated pages in use by the allocator, protected by
1828c2ecf20Sopenharmony_ci * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
1838c2ecf20Sopenharmony_ci * allocated/deallocated, it is allocated/deallocated in all units of a chunk
1848c2ecf20Sopenharmony_ci * and increments/decrements this count by 1).
1858c2ecf20Sopenharmony_ci */
1868c2ecf20Sopenharmony_cistatic unsigned long pcpu_nr_populated;
1878c2ecf20Sopenharmony_ci
1888c2ecf20Sopenharmony_ci/*
1898c2ecf20Sopenharmony_ci * Balance work is used to populate or destroy chunks asynchronously.  We
1908c2ecf20Sopenharmony_ci * try to keep the number of populated free pages between
1918c2ecf20Sopenharmony_ci * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
1928c2ecf20Sopenharmony_ci * empty chunk.
1938c2ecf20Sopenharmony_ci */
1948c2ecf20Sopenharmony_cistatic void pcpu_balance_workfn(struct work_struct *work);
1958c2ecf20Sopenharmony_cistatic DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
1968c2ecf20Sopenharmony_cistatic bool pcpu_async_enabled __read_mostly;
1978c2ecf20Sopenharmony_cistatic bool pcpu_atomic_alloc_failed;
1988c2ecf20Sopenharmony_ci
1998c2ecf20Sopenharmony_cistatic void pcpu_schedule_balance_work(void)
2008c2ecf20Sopenharmony_ci{
2018c2ecf20Sopenharmony_ci	if (pcpu_async_enabled)
2028c2ecf20Sopenharmony_ci		schedule_work(&pcpu_balance_work);
2038c2ecf20Sopenharmony_ci}
2048c2ecf20Sopenharmony_ci
2058c2ecf20Sopenharmony_ci/**
2068c2ecf20Sopenharmony_ci * pcpu_addr_in_chunk - check if the address is served from this chunk
2078c2ecf20Sopenharmony_ci * @chunk: chunk of interest
2088c2ecf20Sopenharmony_ci * @addr: percpu address
2098c2ecf20Sopenharmony_ci *
2108c2ecf20Sopenharmony_ci * RETURNS:
2118c2ecf20Sopenharmony_ci * True if the address is served from this chunk.
2128c2ecf20Sopenharmony_ci */
2138c2ecf20Sopenharmony_cistatic bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
2148c2ecf20Sopenharmony_ci{
2158c2ecf20Sopenharmony_ci	void *start_addr, *end_addr;
2168c2ecf20Sopenharmony_ci
2178c2ecf20Sopenharmony_ci	if (!chunk)
2188c2ecf20Sopenharmony_ci		return false;
2198c2ecf20Sopenharmony_ci
2208c2ecf20Sopenharmony_ci	start_addr = chunk->base_addr + chunk->start_offset;
2218c2ecf20Sopenharmony_ci	end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
2228c2ecf20Sopenharmony_ci		   chunk->end_offset;
2238c2ecf20Sopenharmony_ci
2248c2ecf20Sopenharmony_ci	return addr >= start_addr && addr < end_addr;
2258c2ecf20Sopenharmony_ci}
2268c2ecf20Sopenharmony_ci
2278c2ecf20Sopenharmony_cistatic int __pcpu_size_to_slot(int size)
2288c2ecf20Sopenharmony_ci{
2298c2ecf20Sopenharmony_ci	int highbit = fls(size);	/* size is in bytes */
2308c2ecf20Sopenharmony_ci	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
2318c2ecf20Sopenharmony_ci}
2328c2ecf20Sopenharmony_ci
2338c2ecf20Sopenharmony_cistatic int pcpu_size_to_slot(int size)
2348c2ecf20Sopenharmony_ci{
2358c2ecf20Sopenharmony_ci	if (size == pcpu_unit_size)
2368c2ecf20Sopenharmony_ci		return pcpu_nr_slots - 1;
2378c2ecf20Sopenharmony_ci	return __pcpu_size_to_slot(size);
2388c2ecf20Sopenharmony_ci}
2398c2ecf20Sopenharmony_ci
2408c2ecf20Sopenharmony_cistatic int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
2418c2ecf20Sopenharmony_ci{
2428c2ecf20Sopenharmony_ci	const struct pcpu_block_md *chunk_md = &chunk->chunk_md;
2438c2ecf20Sopenharmony_ci
2448c2ecf20Sopenharmony_ci	if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE ||
2458c2ecf20Sopenharmony_ci	    chunk_md->contig_hint == 0)
2468c2ecf20Sopenharmony_ci		return 0;
2478c2ecf20Sopenharmony_ci
2488c2ecf20Sopenharmony_ci	return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
2498c2ecf20Sopenharmony_ci}
2508c2ecf20Sopenharmony_ci
/*
 * set the pointer to a chunk in a page struct
 *
 * The page's otherwise-unused index field carries the reverse mapping
 * from page to owning chunk (see the file header comment).
 */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
	page->index = (unsigned long)pcpu;
}
2568c2ecf20Sopenharmony_ci
/*
 * obtain pointer to a chunk from a page struct
 *
 * Inverse of pcpu_set_page_chunk(): the owning chunk was stashed in
 * page->index when the page was associated with the chunk.
 */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
	return (struct pcpu_chunk *)page->index;
}
2628c2ecf20Sopenharmony_ci
/* linear page index of @cpu's unit's @page_idx'th page within a chunk */
static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}
2678c2ecf20Sopenharmony_ci
/* byte offset of @cpu's unit's @page_idx'th page from the chunk base */
static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
{
	return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
}
2728c2ecf20Sopenharmony_ci
/* address of the @page_idx'th page of @cpu's unit within @chunk */
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
				     unsigned int cpu, int page_idx)
{
	return (unsigned long)chunk->base_addr +
	       pcpu_unit_page_offset(cpu, page_idx);
}
2798c2ecf20Sopenharmony_ci
2808c2ecf20Sopenharmony_ci/*
2818c2ecf20Sopenharmony_ci * The following are helper functions to help access bitmaps and convert
2828c2ecf20Sopenharmony_ci * between bitmap offsets to address offsets.
2838c2ecf20Sopenharmony_ci */
/* pointer into the allocation bitmap at the start of block @index */
static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
	return chunk->alloc_map +
	       (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
}
2898c2ecf20Sopenharmony_ci
/* index of the metadata block containing chunk bit offset @off */
static unsigned long pcpu_off_to_block_index(int off)
{
	return off / PCPU_BITMAP_BLOCK_BITS;
}
2948c2ecf20Sopenharmony_ci
/* bit offset within its metadata block for chunk bit offset @off */
static unsigned long pcpu_off_to_block_off(int off)
{
	return off & (PCPU_BITMAP_BLOCK_BITS - 1);
}
2998c2ecf20Sopenharmony_ci
/* convert a block index/offset pair back to a chunk-relative bit offset */
static unsigned long pcpu_block_off_to_off(int index, int off)
{
	return index * PCPU_BITMAP_BLOCK_BITS + off;
}
3048c2ecf20Sopenharmony_ci
3058c2ecf20Sopenharmony_ci/*
3068c2ecf20Sopenharmony_ci * pcpu_next_hint - determine which hint to use
3078c2ecf20Sopenharmony_ci * @block: block of interest
3088c2ecf20Sopenharmony_ci * @alloc_bits: size of allocation
3098c2ecf20Sopenharmony_ci *
3108c2ecf20Sopenharmony_ci * This determines if we should scan based on the scan_hint or first_free.
3118c2ecf20Sopenharmony_ci * In general, we want to scan from first_free to fulfill allocations by
3128c2ecf20Sopenharmony_ci * first fit.  However, if we know a scan_hint at position scan_hint_start
3138c2ecf20Sopenharmony_ci * cannot fulfill an allocation, we can begin scanning from there knowing
3148c2ecf20Sopenharmony_ci * the contig_hint will be our fallback.
3158c2ecf20Sopenharmony_ci */
3168c2ecf20Sopenharmony_cistatic int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
3178c2ecf20Sopenharmony_ci{
3188c2ecf20Sopenharmony_ci	/*
3198c2ecf20Sopenharmony_ci	 * The three conditions below determine if we can skip past the
3208c2ecf20Sopenharmony_ci	 * scan_hint.  First, does the scan hint exist.  Second, is the
3218c2ecf20Sopenharmony_ci	 * contig_hint after the scan_hint (possibly not true iff
3228c2ecf20Sopenharmony_ci	 * contig_hint == scan_hint).  Third, is the allocation request
3238c2ecf20Sopenharmony_ci	 * larger than the scan_hint.
3248c2ecf20Sopenharmony_ci	 */
3258c2ecf20Sopenharmony_ci	if (block->scan_hint &&
3268c2ecf20Sopenharmony_ci	    block->contig_hint_start > block->scan_hint_start &&
3278c2ecf20Sopenharmony_ci	    alloc_bits > block->scan_hint)
3288c2ecf20Sopenharmony_ci		return block->scan_hint_start + block->scan_hint;
3298c2ecf20Sopenharmony_ci
3308c2ecf20Sopenharmony_ci	return block->first_free;
3318c2ecf20Sopenharmony_ci}
3328c2ecf20Sopenharmony_ci
3338c2ecf20Sopenharmony_ci/**
3348c2ecf20Sopenharmony_ci * pcpu_next_md_free_region - finds the next hint free area
3358c2ecf20Sopenharmony_ci * @chunk: chunk of interest
3368c2ecf20Sopenharmony_ci * @bit_off: chunk offset
3378c2ecf20Sopenharmony_ci * @bits: size of free area
3388c2ecf20Sopenharmony_ci *
3398c2ecf20Sopenharmony_ci * Helper function for pcpu_for_each_md_free_region.  It checks
3408c2ecf20Sopenharmony_ci * block->contig_hint and performs aggregation across blocks to find the
3418c2ecf20Sopenharmony_ci * next hint.  It modifies bit_off and bits in-place to be consumed in the
3428c2ecf20Sopenharmony_ci * loop.
3438c2ecf20Sopenharmony_ci */
static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
				     int *bits)
{
	int i = pcpu_off_to_block_index(*bit_off);
	int block_off = pcpu_off_to_block_off(*bit_off);
	struct pcpu_block_md *block;

	*bits = 0;
	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
	     block++, i++) {
		/* handles contig area across blocks */
		if (*bits) {
			*bits += block->left_free;
			/* an entirely free block lets the area keep growing */
			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
				continue;
			return;
		}

		/*
		 * This checks three things.  First is there a contig_hint to
		 * check.  Second, have we checked this hint before by
		 * comparing the block_off.  Third, is this the same as the
		 * right contig hint.  In the last case, it spills over into
		 * the next block and should be handled by the contig area
		 * across blocks code.
		 */
		*bits = block->contig_hint;
		if (*bits && block->contig_hint_start >= block_off &&
		    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
			/* hint lies entirely within this block - report it */
			*bit_off = pcpu_block_off_to_off(i,
					block->contig_hint_start);
			return;
		}
		/* reset to satisfy the second predicate above */
		block_off = 0;

		/* carry this block's trailing free area into the next pass */
		*bits = block->right_free;
		*bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
	}
}
3848c2ecf20Sopenharmony_ci
3858c2ecf20Sopenharmony_ci/**
3868c2ecf20Sopenharmony_ci * pcpu_next_fit_region - finds fit areas for a given allocation request
3878c2ecf20Sopenharmony_ci * @chunk: chunk of interest
3888c2ecf20Sopenharmony_ci * @alloc_bits: size of allocation
3898c2ecf20Sopenharmony_ci * @align: alignment of area (max PAGE_SIZE)
3908c2ecf20Sopenharmony_ci * @bit_off: chunk offset
3918c2ecf20Sopenharmony_ci * @bits: size of free area
3928c2ecf20Sopenharmony_ci *
3938c2ecf20Sopenharmony_ci * Finds the next free region that is viable for use with a given size and
3948c2ecf20Sopenharmony_ci * alignment.  This only returns if there is a valid area to be used for this
3958c2ecf20Sopenharmony_ci * allocation.  block->first_free is returned if the allocation request fits
3968c2ecf20Sopenharmony_ci * within the block to see if the request can be fulfilled prior to the contig
3978c2ecf20Sopenharmony_ci * hint.
3988c2ecf20Sopenharmony_ci */
static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
				 int align, int *bit_off, int *bits)
{
	int i = pcpu_off_to_block_index(*bit_off);
	int block_off = pcpu_off_to_block_off(*bit_off);
	struct pcpu_block_md *block;

	*bits = 0;
	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
	     block++, i++) {
		/* handles contig area across blocks */
		if (*bits) {
			*bits += block->left_free;
			if (*bits >= alloc_bits)
				return;
			/* an entirely free block lets the area keep growing */
			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
				continue;
		}

		/* check block->contig_hint */
		/* bits of padding needed to align the contig_hint start */
		*bits = ALIGN(block->contig_hint_start, align) -
			block->contig_hint_start;
		/*
		 * This uses the block offset to determine if this has been
		 * checked in the prior iteration.
		 */
		if (block->contig_hint &&
		    block->contig_hint_start >= block_off &&
		    block->contig_hint >= *bits + alloc_bits) {
			int start = pcpu_next_hint(block, alloc_bits);

			/* area spans from start through the hint region */
			*bits += alloc_bits + block->contig_hint_start -
				 start;
			*bit_off = pcpu_block_off_to_off(i, start);
			return;
		}
		/* reset to satisfy the second predicate above */
		block_off = 0;

		/* fall back to the aligned trailing free area of the block */
		*bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
				 align);
		*bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
		*bit_off = pcpu_block_off_to_off(i, *bit_off);
		if (*bits >= alloc_bits)
			return;
	}

	/* no valid offsets were found - fail condition */
	*bit_off = pcpu_chunk_map_bits(chunk);
}
4498c2ecf20Sopenharmony_ci
4508c2ecf20Sopenharmony_ci/*
4518c2ecf20Sopenharmony_ci * Metadata free area iterators.  These perform aggregation of free areas
4528c2ecf20Sopenharmony_ci * based on the metadata blocks and return the offset @bit_off and size in
4538c2ecf20Sopenharmony_ci * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
4548c2ecf20Sopenharmony_ci * a fit is found for the allocation request.
4558c2ecf20Sopenharmony_ci */
/* iterate over every free region reported by the metadata blocks */
#define pcpu_for_each_md_free_region(chunk, bit_off, bits)		\
	for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));	\
	     (bit_off) < pcpu_chunk_map_bits((chunk));			\
	     (bit_off) += (bits) + 1,					\
	     pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))

/* iterate only over regions that can fit an alloc_bits/align request */
#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
	for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
				  &(bits));				      \
	     (bit_off) < pcpu_chunk_map_bits((chunk));			      \
	     (bit_off) += (bits),					      \
	     pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
				  &(bits)))
4698c2ecf20Sopenharmony_ci
4708c2ecf20Sopenharmony_ci/**
4718c2ecf20Sopenharmony_ci * pcpu_mem_zalloc - allocate memory
4728c2ecf20Sopenharmony_ci * @size: bytes to allocate
4738c2ecf20Sopenharmony_ci * @gfp: allocation flags
4748c2ecf20Sopenharmony_ci *
4758c2ecf20Sopenharmony_ci * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
4768c2ecf20Sopenharmony_ci * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
4778c2ecf20Sopenharmony_ci * This is to facilitate passing through whitelisted flags.  The
4788c2ecf20Sopenharmony_ci * returned memory is always zeroed.
4798c2ecf20Sopenharmony_ci *
4808c2ecf20Sopenharmony_ci * RETURNS:
4818c2ecf20Sopenharmony_ci * Pointer to the allocated area on success, NULL on failure.
4828c2ecf20Sopenharmony_ci */
4838c2ecf20Sopenharmony_cistatic void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
4848c2ecf20Sopenharmony_ci{
4858c2ecf20Sopenharmony_ci	if (WARN_ON_ONCE(!slab_is_available()))
4868c2ecf20Sopenharmony_ci		return NULL;
4878c2ecf20Sopenharmony_ci
4888c2ecf20Sopenharmony_ci	if (size <= PAGE_SIZE)
4898c2ecf20Sopenharmony_ci		return kzalloc(size, gfp);
4908c2ecf20Sopenharmony_ci	else
4918c2ecf20Sopenharmony_ci		return __vmalloc(size, gfp | __GFP_ZERO);
4928c2ecf20Sopenharmony_ci}
4938c2ecf20Sopenharmony_ci
/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 * kvfree() handles both the kzalloc()- and vmalloc()-backed cases.
 */
static void pcpu_mem_free(void *ptr)
{
	kvfree(ptr);
}
5048c2ecf20Sopenharmony_ci
5058c2ecf20Sopenharmony_cistatic void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
5068c2ecf20Sopenharmony_ci			      bool move_front)
5078c2ecf20Sopenharmony_ci{
5088c2ecf20Sopenharmony_ci	if (chunk != pcpu_reserved_chunk) {
5098c2ecf20Sopenharmony_ci		struct list_head *pcpu_slot;
5108c2ecf20Sopenharmony_ci
5118c2ecf20Sopenharmony_ci		pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
5128c2ecf20Sopenharmony_ci		if (move_front)
5138c2ecf20Sopenharmony_ci			list_move(&chunk->list, &pcpu_slot[slot]);
5148c2ecf20Sopenharmony_ci		else
5158c2ecf20Sopenharmony_ci			list_move_tail(&chunk->list, &pcpu_slot[slot]);
5168c2ecf20Sopenharmony_ci	}
5178c2ecf20Sopenharmony_ci}
5188c2ecf20Sopenharmony_ci
/* move @chunk to the front of @slot's list (no-op for the reserved chunk) */
static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
{
	__pcpu_chunk_move(chunk, slot, true);
}
5238c2ecf20Sopenharmony_ci
5248c2ecf20Sopenharmony_ci/**
5258c2ecf20Sopenharmony_ci * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
5268c2ecf20Sopenharmony_ci * @chunk: chunk of interest
5278c2ecf20Sopenharmony_ci * @oslot: the previous slot it was on
5288c2ecf20Sopenharmony_ci *
5298c2ecf20Sopenharmony_ci * This function is called after an allocation or free changed @chunk.
5308c2ecf20Sopenharmony_ci * New slot according to the changed state is determined and @chunk is
5318c2ecf20Sopenharmony_ci * moved to the slot.  Note that the reserved chunk is never put on
5328c2ecf20Sopenharmony_ci * chunk slots.
5338c2ecf20Sopenharmony_ci *
5348c2ecf20Sopenharmony_ci * CONTEXT:
5358c2ecf20Sopenharmony_ci * pcpu_lock.
5368c2ecf20Sopenharmony_ci */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
	int nslot = pcpu_chunk_slot(chunk);

	/* nothing to do when the chunk already sits on the right list */
	if (oslot == nslot)
		return;

	/* moving up (more free space) puts the chunk at the list front */
	__pcpu_chunk_move(chunk, nslot, oslot < nslot);
}
5448c2ecf20Sopenharmony_ci
5458c2ecf20Sopenharmony_ci/*
5468c2ecf20Sopenharmony_ci * pcpu_update_empty_pages - update empty page counters
5478c2ecf20Sopenharmony_ci * @chunk: chunk of interest
5488c2ecf20Sopenharmony_ci * @nr: nr of empty pages
5498c2ecf20Sopenharmony_ci *
5508c2ecf20Sopenharmony_ci * This is used to keep track of the empty pages now based on the premise
5518c2ecf20Sopenharmony_ci * a md_block covers a page.  The hint update functions recognize if a block
5528c2ecf20Sopenharmony_ci * is made full or broken to calculate deltas for keeping track of free pages.
5538c2ecf20Sopenharmony_ci */
static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
	/* @nr may be negative (allocation path) or positive (free path) */
	chunk->nr_empty_pop_pages += nr;
	/* the reserved chunk is excluded from the global per-type counters */
	if (chunk != pcpu_reserved_chunk)
		pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr;
}
5608c2ecf20Sopenharmony_ci
5618c2ecf20Sopenharmony_ci/*
5628c2ecf20Sopenharmony_ci * pcpu_region_overlap - determines if two regions overlap
5638c2ecf20Sopenharmony_ci * @a: start of first region, inclusive
5648c2ecf20Sopenharmony_ci * @b: end of first region, exclusive
5658c2ecf20Sopenharmony_ci * @x: start of second region, inclusive
5668c2ecf20Sopenharmony_ci * @y: end of second region, exclusive
5678c2ecf20Sopenharmony_ci *
5688c2ecf20Sopenharmony_ci * This is used to determine if the hint region [a, b) overlaps with the
5698c2ecf20Sopenharmony_ci * allocated region [x, y).
5708c2ecf20Sopenharmony_ci */
5718c2ecf20Sopenharmony_cistatic inline bool pcpu_region_overlap(int a, int b, int x, int y)
5728c2ecf20Sopenharmony_ci{
5738c2ecf20Sopenharmony_ci	return (a < y) && (x < b);
5748c2ecf20Sopenharmony_ci}
5758c2ecf20Sopenharmony_ci
5768c2ecf20Sopenharmony_ci/**
5778c2ecf20Sopenharmony_ci * pcpu_block_update - updates a block given a free area
5788c2ecf20Sopenharmony_ci * @block: block of interest
5798c2ecf20Sopenharmony_ci * @start: start offset in block
5808c2ecf20Sopenharmony_ci * @end: end offset in block
5818c2ecf20Sopenharmony_ci *
5828c2ecf20Sopenharmony_ci * Updates a block given a known free area.  The region [start, end) is
5838c2ecf20Sopenharmony_ci * expected to be the entirety of the free area within a block.  Chooses
5848c2ecf20Sopenharmony_ci * the best starting offset if the contig hints are equal.
5858c2ecf20Sopenharmony_ci */
static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
{
	int contig = end - start;	/* length of the free area */

	block->first_free = min(block->first_free, start);
	/* the free area touches the block's left edge */
	if (start == 0)
		block->left_free = contig;

	/* the free area touches the block's right edge */
	if (end == block->nr_bits)
		block->right_free = contig;

	if (contig > block->contig_hint) {
		/* promote the old contig_hint to be the new scan_hint */
		if (start > block->contig_hint_start) {
			if (block->contig_hint > block->scan_hint) {
				block->scan_hint_start =
					block->contig_hint_start;
				block->scan_hint = block->contig_hint;
			} else if (start < block->scan_hint_start) {
				/*
				 * The old contig_hint == scan_hint.  But, the
				 * new contig is larger so hold the invariant
				 * scan_hint_start < contig_hint_start.
				 */
				block->scan_hint = 0;
			}
		} else {
			/* new region is left of the old hint; drop scan_hint */
			block->scan_hint = 0;
		}
		block->contig_hint_start = start;
		block->contig_hint = contig;
	} else if (contig == block->contig_hint) {
		/* __ffs gives the alignment order of the offset; 0 is best */
		if (block->contig_hint_start &&
		    (!start ||
		     __ffs(start) > __ffs(block->contig_hint_start))) {
			/* start has a better alignment so use it */
			block->contig_hint_start = start;
			if (start < block->scan_hint_start &&
			    block->contig_hint > block->scan_hint)
				block->scan_hint = 0;
		} else if (start > block->scan_hint_start ||
			   block->contig_hint > block->scan_hint) {
			/*
			 * Knowing contig == contig_hint, update the scan_hint
			 * if it is farther than or larger than the current
			 * scan_hint.
			 */
			block->scan_hint_start = start;
			block->scan_hint = contig;
		}
	} else {
		/*
		 * The region is smaller than the contig_hint.  So only update
		 * the scan_hint if it is larger than or equal and farther than
		 * the current scan_hint.
		 */
		if ((start < block->contig_hint_start &&
		     (contig > block->scan_hint ||
		      (contig == block->scan_hint &&
		       start > block->scan_hint_start)))) {
			block->scan_hint_start = start;
			block->scan_hint = contig;
		}
	}
}
6518c2ecf20Sopenharmony_ci
6528c2ecf20Sopenharmony_ci/*
6538c2ecf20Sopenharmony_ci * pcpu_block_update_scan - update a block given a free area from a scan
6548c2ecf20Sopenharmony_ci * @chunk: chunk of interest
6558c2ecf20Sopenharmony_ci * @bit_off: chunk offset
6568c2ecf20Sopenharmony_ci * @bits: size of free area
6578c2ecf20Sopenharmony_ci *
6588c2ecf20Sopenharmony_ci * Finding the final allocation spot first goes through pcpu_find_block_fit()
6598c2ecf20Sopenharmony_ci * to find a block that can hold the allocation and then pcpu_alloc_area()
6608c2ecf20Sopenharmony_ci * where a scan is used.  When allocations require specific alignments,
6618c2ecf20Sopenharmony_ci * we can inadvertently create holes which will not be seen in the alloc
6628c2ecf20Sopenharmony_ci * or free paths.
6638c2ecf20Sopenharmony_ci *
6648c2ecf20Sopenharmony_ci * This takes a given free area hole and updates a block as it may change the
6658c2ecf20Sopenharmony_ci * scan_hint.  We need to scan backwards to ensure we don't miss free bits
6668c2ecf20Sopenharmony_ci * from alignment.
6678c2ecf20Sopenharmony_ci */
static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
				   int bits)
{
	int s_off = pcpu_off_to_block_off(bit_off);	/* offset within block */
	int e_off = s_off + bits;
	int s_index, l_bit;
	struct pcpu_block_md *block;

	/* only handle a free area fully contained in a single block */
	if (e_off > PCPU_BITMAP_BLOCK_BITS)
		return;

	s_index = pcpu_off_to_block_index(bit_off);
	block = chunk->md_blocks + s_index;

	/* scan backwards in case of alignment skipping free bits */
	l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
	/* if no set bit was found below s_off, the area extends to offset 0 */
	s_off = (s_off == l_bit) ? 0 : l_bit + 1;

	pcpu_block_update(block, s_off, e_off);
}
6888c2ecf20Sopenharmony_ci
6898c2ecf20Sopenharmony_ci/**
6908c2ecf20Sopenharmony_ci * pcpu_chunk_refresh_hint - updates metadata about a chunk
6918c2ecf20Sopenharmony_ci * @chunk: chunk of interest
6928c2ecf20Sopenharmony_ci * @full_scan: if we should scan from the beginning
6938c2ecf20Sopenharmony_ci *
6948c2ecf20Sopenharmony_ci * Iterates over the metadata blocks to find the largest contig area.
6958c2ecf20Sopenharmony_ci * A full scan can be avoided on the allocation path as this is triggered
6968c2ecf20Sopenharmony_ci * if we broke the contig_hint.  In doing so, the scan_hint will be before
6978c2ecf20Sopenharmony_ci * the contig_hint or after if the scan_hint == contig_hint.  This cannot
6988c2ecf20Sopenharmony_ci * be prevented on freeing as we want to find the largest area possibly
6998c2ecf20Sopenharmony_ci * spanning blocks.
7008c2ecf20Sopenharmony_ci */
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int bit_off, bits;

	/* promote scan_hint to contig_hint */
	if (!full_scan && chunk_md->scan_hint) {
		/* resume scanning just past the promoted region */
		bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
		chunk_md->contig_hint_start = chunk_md->scan_hint_start;
		chunk_md->contig_hint = chunk_md->scan_hint;
		chunk_md->scan_hint = 0;
	} else {
		/* full scan: rebuild the hint starting at the first free bit */
		bit_off = chunk_md->first_free;
		chunk_md->contig_hint = 0;
	}

	bits = 0;
	/* fold every remaining free region into the chunk-level hints */
	pcpu_for_each_md_free_region(chunk, bit_off, bits)
		pcpu_block_update(chunk_md, bit_off, bit_off + bits);
}
7218c2ecf20Sopenharmony_ci
7228c2ecf20Sopenharmony_ci/**
 * pcpu_block_refresh_hint - scan a block and refresh its metadata hints
7248c2ecf20Sopenharmony_ci * @chunk: chunk of interest
7258c2ecf20Sopenharmony_ci * @index: index of the metadata block
7268c2ecf20Sopenharmony_ci *
7278c2ecf20Sopenharmony_ci * Scans over the block beginning at first_free and updates the block
7288c2ecf20Sopenharmony_ci * metadata accordingly.
7298c2ecf20Sopenharmony_ci */
static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
	struct pcpu_block_md *block = chunk->md_blocks + index;
	unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
	unsigned int rs, re, start;	/* region start, region end */

	/* promote scan_hint to contig_hint */
	if (block->scan_hint) {
		/* resume scanning just past the promoted region */
		start = block->scan_hint_start + block->scan_hint;
		block->contig_hint_start = block->scan_hint_start;
		block->contig_hint = block->scan_hint;
		block->scan_hint = 0;
	} else {
		start = block->first_free;
		block->contig_hint = 0;
	}

	/* right_free is rebuilt by the scan below */
	block->right_free = 0;

	/* iterate over free areas and update the contig hints */
	bitmap_for_each_clear_region(alloc_map, rs, re, start,
				     PCPU_BITMAP_BLOCK_BITS)
		pcpu_block_update(block, rs, re);
}
7548c2ecf20Sopenharmony_ci
7558c2ecf20Sopenharmony_ci/**
7568c2ecf20Sopenharmony_ci * pcpu_block_update_hint_alloc - update hint on allocation path
7578c2ecf20Sopenharmony_ci * @chunk: chunk of interest
7588c2ecf20Sopenharmony_ci * @bit_off: chunk offset
7598c2ecf20Sopenharmony_ci * @bits: size of request
7608c2ecf20Sopenharmony_ci *
7618c2ecf20Sopenharmony_ci * Updates metadata for the allocation path.  The metadata only has to be
7628c2ecf20Sopenharmony_ci * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
7638c2ecf20Sopenharmony_ci * scans are required if the block's contig hint is broken.
7648c2ecf20Sopenharmony_ci */
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
					 int bits)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int nr_empty_pages = 0;
	struct pcpu_block_md *s_block, *e_block, *block;
	int s_index, e_index;	/* block indexes of the freed allocation */
	int s_off, e_off;	/* block offsets of the freed allocation */

	/*
	 * Calculate per block offsets.
	 * The calculation uses an inclusive range, but the resulting offsets
	 * are [start, end).  e_index always points to the last block in the
	 * range.
	 */
	s_index = pcpu_off_to_block_index(bit_off);
	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
	s_off = pcpu_off_to_block_off(bit_off);
	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

	s_block = chunk->md_blocks + s_index;
	e_block = chunk->md_blocks + e_index;

	/*
	 * Update s_block.
	 * block->first_free must be updated if the allocation takes its place.
	 * If the allocation breaks the contig_hint, a scan is required to
	 * restore this hint.
	 */
	/* block was fully free (a block covers a page); it no longer is */
	if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
		nr_empty_pages++;

	if (s_off == s_block->first_free)
		s_block->first_free = find_next_zero_bit(
					pcpu_index_alloc_map(chunk, s_index),
					PCPU_BITMAP_BLOCK_BITS,
					s_off + bits);

	/* an allocation overlapping the scan_hint invalidates it */
	if (pcpu_region_overlap(s_block->scan_hint_start,
				s_block->scan_hint_start + s_block->scan_hint,
				s_off,
				s_off + bits))
		s_block->scan_hint = 0;

	if (pcpu_region_overlap(s_block->contig_hint_start,
				s_block->contig_hint_start +
				s_block->contig_hint,
				s_off,
				s_off + bits)) {
		/* block contig hint is broken - scan to fix it */
		if (!s_off)
			s_block->left_free = 0;
		pcpu_block_refresh_hint(chunk, s_index);
	} else {
		/* update left and right contig manually */
		s_block->left_free = min(s_block->left_free, s_off);
		if (s_index == e_index)
			s_block->right_free = min_t(int, s_block->right_free,
					PCPU_BITMAP_BLOCK_BITS - e_off);
		else
			s_block->right_free = 0;
	}

	/*
	 * Update e_block.
	 */
	if (s_index != e_index) {
		/* block was fully free; it no longer is */
		if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
			nr_empty_pages++;

		/*
		 * When the allocation is across blocks, the end is along
		 * the left part of the e_block.
		 */
		e_block->first_free = find_next_zero_bit(
				pcpu_index_alloc_map(chunk, e_index),
				PCPU_BITMAP_BLOCK_BITS, e_off);

		if (e_off == PCPU_BITMAP_BLOCK_BITS) {
			/*
			 * The allocation ends exactly at the block boundary;
			 * bump e_block so the reset loop below covers this
			 * block as well.
			 */
			e_block++;
		} else {
			if (e_off > e_block->scan_hint_start)
				e_block->scan_hint = 0;

			e_block->left_free = 0;
			if (e_off > e_block->contig_hint_start) {
				/* contig hint is broken - scan to fix it */
				pcpu_block_refresh_hint(chunk, e_index);
			} else {
				e_block->right_free =
					min_t(int, e_block->right_free,
					      PCPU_BITMAP_BLOCK_BITS - e_off);
			}
		}

		/* update in-between md_blocks - fully consumed, so zero all */
		nr_empty_pages += (e_index - s_index - 1);
		for (block = s_block + 1; block < e_block; block++) {
			block->scan_hint = 0;
			block->contig_hint = 0;
			block->left_free = 0;
			block->right_free = 0;
		}
	}

	/* allocation consumed empty pages; decrement the counters */
	if (nr_empty_pages)
		pcpu_update_empty_pages(chunk, -nr_empty_pages);

	if (pcpu_region_overlap(chunk_md->scan_hint_start,
				chunk_md->scan_hint_start +
				chunk_md->scan_hint,
				bit_off,
				bit_off + bits))
		chunk_md->scan_hint = 0;

	/*
	 * The only time a full chunk scan is required is if the chunk
	 * contig hint is broken.  Otherwise, it means a smaller space
	 * was used and therefore the chunk contig hint is still correct.
	 */
	if (pcpu_region_overlap(chunk_md->contig_hint_start,
				chunk_md->contig_hint_start +
				chunk_md->contig_hint,
				bit_off,
				bit_off + bits))
		pcpu_chunk_refresh_hint(chunk, false);
}
8938c2ecf20Sopenharmony_ci
8948c2ecf20Sopenharmony_ci/**
8958c2ecf20Sopenharmony_ci * pcpu_block_update_hint_free - updates the block hints on the free path
8968c2ecf20Sopenharmony_ci * @chunk: chunk of interest
8978c2ecf20Sopenharmony_ci * @bit_off: chunk offset
8988c2ecf20Sopenharmony_ci * @bits: size of request
8998c2ecf20Sopenharmony_ci *
 * Updates metadata for the free path.  This avoids a blind block
9018c2ecf20Sopenharmony_ci * refresh by making use of the block contig hints.  If this fails, it scans
9028c2ecf20Sopenharmony_ci * forward and backward to determine the extent of the free area.  This is
9038c2ecf20Sopenharmony_ci * capped at the boundary of blocks.
9048c2ecf20Sopenharmony_ci *
9058c2ecf20Sopenharmony_ci * A chunk update is triggered if a page becomes free, a block becomes free,
9068c2ecf20Sopenharmony_ci * or the free spans across blocks.  This tradeoff is to minimize iterating
9078c2ecf20Sopenharmony_ci * over the block metadata to update chunk_md->contig_hint.
9088c2ecf20Sopenharmony_ci * chunk_md->contig_hint may be off by up to a page, but it will never be more
9098c2ecf20Sopenharmony_ci * than the available space.  If the contig hint is contained in one block, it
9108c2ecf20Sopenharmony_ci * will be accurate.
9118c2ecf20Sopenharmony_ci */
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
					int bits)
{
	int nr_empty_pages = 0;
	struct pcpu_block_md *s_block, *e_block, *block;
	int s_index, e_index;	/* block indexes of the freed allocation */
	int s_off, e_off;	/* block offsets of the freed allocation */
	int start, end;		/* start and end of the whole free area */

	/*
	 * Calculate per block offsets.
	 * The calculation uses an inclusive range, but the resulting offsets
	 * are [start, end).  e_index always points to the last block in the
	 * range.
	 */
	s_index = pcpu_off_to_block_index(bit_off);
	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
	s_off = pcpu_off_to_block_off(bit_off);
	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

	s_block = chunk->md_blocks + s_index;
	e_block = chunk->md_blocks + e_index;

	/*
	 * Check if the freed area aligns with the block->contig_hint.
	 * If it does, then the scan to find the beginning/end of the
	 * larger free area can be avoided.
	 *
	 * start and end refer to beginning and end of the free area
	 * within each their respective blocks.  This is not necessarily
	 * the entire free area as it may span blocks past the beginning
	 * or end of the block.
	 */
	start = s_off;
	if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
		/* the freed area abuts contig_hint on the left; merge them */
		start = s_block->contig_hint_start;
	} else {
		/*
		 * Scan backwards to find the extent of the free area.
		 * find_last_bit returns the starting bit, so if the start bit
		 * is returned, that means there was no last bit and the
		 * remainder of the chunk is free.
		 */
		int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
					  start);
		start = (start == l_bit) ? 0 : l_bit + 1;
	}

	end = e_off;
	/* the freed area abuts contig_hint on the right; merge them */
	if (e_off == e_block->contig_hint_start)
		end = e_block->contig_hint_start + e_block->contig_hint;
	else
		end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
				    PCPU_BITMAP_BLOCK_BITS, end);

	/* update s_block */
	e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
	/* s_block is now entirely free (a block covers a page) */
	if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
		nr_empty_pages++;
	pcpu_block_update(s_block, start, e_off);

	/* the free spans blocks; e_block and the middle blocks need updating */
	if (s_index != e_index) {
		/* update e_block */
		if (end == PCPU_BITMAP_BLOCK_BITS)
			nr_empty_pages++;
		pcpu_block_update(e_block, 0, end);

		/* reset md_blocks in the middle - now completely free */
		nr_empty_pages += (e_index - s_index - 1);
		for (block = s_block + 1; block < e_block; block++) {
			block->first_free = 0;
			block->scan_hint = 0;
			block->contig_hint_start = 0;
			block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
			block->left_free = PCPU_BITMAP_BLOCK_BITS;
			block->right_free = PCPU_BITMAP_BLOCK_BITS;
		}
	}

	if (nr_empty_pages)
		pcpu_update_empty_pages(chunk, nr_empty_pages);

	/*
	 * Refresh chunk metadata when the free makes a block free or spans
	 * across blocks.  The contig_hint may be off by up to a page, but if
	 * the contig_hint is contained in a block, it will be accurate with
	 * the else condition below.
	 */
	if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
		pcpu_chunk_refresh_hint(chunk, true);
	else
		pcpu_block_update(&chunk->chunk_md,
				  pcpu_block_off_to_off(s_index, start),
				  end);
}
10088c2ecf20Sopenharmony_ci
10098c2ecf20Sopenharmony_ci/**
10108c2ecf20Sopenharmony_ci * pcpu_is_populated - determines if the region is populated
10118c2ecf20Sopenharmony_ci * @chunk: chunk of interest
10128c2ecf20Sopenharmony_ci * @bit_off: chunk offset
10138c2ecf20Sopenharmony_ci * @bits: size of area
10148c2ecf20Sopenharmony_ci * @next_off: return value for the next offset to start searching
10158c2ecf20Sopenharmony_ci *
10168c2ecf20Sopenharmony_ci * For atomic allocations, check if the backing pages are populated.
10178c2ecf20Sopenharmony_ci *
10188c2ecf20Sopenharmony_ci * RETURNS:
10198c2ecf20Sopenharmony_ci * Bool if the backing pages are populated.
10208c2ecf20Sopenharmony_ci * next_index is to skip over unpopulated blocks in pcpu_find_block_fit.
10218c2ecf20Sopenharmony_ci */
10228c2ecf20Sopenharmony_cistatic bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
10238c2ecf20Sopenharmony_ci			      int *next_off)
10248c2ecf20Sopenharmony_ci{
10258c2ecf20Sopenharmony_ci	unsigned int page_start, page_end, rs, re;
10268c2ecf20Sopenharmony_ci
10278c2ecf20Sopenharmony_ci	page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
10288c2ecf20Sopenharmony_ci	page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
10298c2ecf20Sopenharmony_ci
10308c2ecf20Sopenharmony_ci	rs = page_start;
10318c2ecf20Sopenharmony_ci	bitmap_next_clear_region(chunk->populated, &rs, &re, page_end);
10328c2ecf20Sopenharmony_ci	if (rs >= page_end)
10338c2ecf20Sopenharmony_ci		return true;
10348c2ecf20Sopenharmony_ci
10358c2ecf20Sopenharmony_ci	*next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
10368c2ecf20Sopenharmony_ci	return false;
10378c2ecf20Sopenharmony_ci}
10388c2ecf20Sopenharmony_ci
10398c2ecf20Sopenharmony_ci/**
10408c2ecf20Sopenharmony_ci * pcpu_find_block_fit - finds the block index to start searching
10418c2ecf20Sopenharmony_ci * @chunk: chunk of interest
10428c2ecf20Sopenharmony_ci * @alloc_bits: size of request in allocation units
10438c2ecf20Sopenharmony_ci * @align: alignment of area (max PAGE_SIZE bytes)
10448c2ecf20Sopenharmony_ci * @pop_only: use populated regions only
10458c2ecf20Sopenharmony_ci *
10468c2ecf20Sopenharmony_ci * Given a chunk and an allocation spec, find the offset to begin searching
10478c2ecf20Sopenharmony_ci * for a free region.  This iterates over the bitmap metadata blocks to
10488c2ecf20Sopenharmony_ci * find an offset that will be guaranteed to fit the requirements.  It is
10498c2ecf20Sopenharmony_ci * not quite first fit as if the allocation does not fit in the contig hint
10508c2ecf20Sopenharmony_ci * of a block or chunk, it is skipped.  This errs on the side of caution
10518c2ecf20Sopenharmony_ci * to prevent excess iteration.  Poor alignment can cause the allocator to
10528c2ecf20Sopenharmony_ci * skip over blocks and chunks that have valid free areas.
10538c2ecf20Sopenharmony_ci *
10548c2ecf20Sopenharmony_ci * RETURNS:
10558c2ecf20Sopenharmony_ci * The offset in the bitmap to begin searching.
10568c2ecf20Sopenharmony_ci * -1 if no offset is found.
10578c2ecf20Sopenharmony_ci */
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
			       size_t align, bool pop_only)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int bit_off, bits, next_off;

	/*
	 * Check to see if the allocation can fit in the chunk's contig hint.
	 * This is an optimization to prevent scanning by assuming if it
	 * cannot fit in the global hint, there is memory pressure and creating
	 * a new chunk would happen soon.
	 */
	/* bit_off is first the alignment padding needed at the hint start */
	bit_off = ALIGN(chunk_md->contig_hint_start, align) -
		  chunk_md->contig_hint_start;
	if (bit_off + alloc_bits > chunk_md->contig_hint)
		return -1;

	bit_off = pcpu_next_hint(chunk_md, alloc_bits);
	bits = 0;
	pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
		/* for atomic allocations, only populated regions qualify */
		if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
						   &next_off))
			break;

		/* skip past the unpopulated region and keep searching */
		bit_off = next_off;
		bits = 0;
	}

	/* ran off the end of the chunk without finding a fit */
	if (bit_off == pcpu_chunk_map_bits(chunk))
		return -1;

	return bit_off;
}
10918c2ecf20Sopenharmony_ci
10928c2ecf20Sopenharmony_ci/*
10938c2ecf20Sopenharmony_ci * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
10948c2ecf20Sopenharmony_ci * @map: the address to base the search on
10958c2ecf20Sopenharmony_ci * @size: the bitmap size in bits
10968c2ecf20Sopenharmony_ci * @start: the bitnumber to start searching at
10978c2ecf20Sopenharmony_ci * @nr: the number of zeroed bits we're looking for
10988c2ecf20Sopenharmony_ci * @align_mask: alignment mask for zero area
10998c2ecf20Sopenharmony_ci * @largest_off: offset of the largest area skipped
11008c2ecf20Sopenharmony_ci * @largest_bits: size of the largest area skipped
11018c2ecf20Sopenharmony_ci *
11028c2ecf20Sopenharmony_ci * The @align_mask should be one less than a power of 2.
11038c2ecf20Sopenharmony_ci *
11048c2ecf20Sopenharmony_ci * This is a modified version of bitmap_find_next_zero_area_off() to remember
11058c2ecf20Sopenharmony_ci * the largest area that was skipped.  This is imperfect, but in general is
11068c2ecf20Sopenharmony_ci * good enough.  The largest remembered region is the largest failed region
11078c2ecf20Sopenharmony_ci * seen.  This does not include anything we possibly skipped due to alignment.
11088c2ecf20Sopenharmony_ci * pcpu_block_update_scan() does scan backwards to try and recover what was
11098c2ecf20Sopenharmony_ci * lost to alignment.  While this can cause scanning to miss earlier possible
11108c2ecf20Sopenharmony_ci * free areas, smaller allocations will eventually fill those holes.
11118c2ecf20Sopenharmony_ci */
11128c2ecf20Sopenharmony_cistatic unsigned long pcpu_find_zero_area(unsigned long *map,
11138c2ecf20Sopenharmony_ci					 unsigned long size,
11148c2ecf20Sopenharmony_ci					 unsigned long start,
11158c2ecf20Sopenharmony_ci					 unsigned long nr,
11168c2ecf20Sopenharmony_ci					 unsigned long align_mask,
11178c2ecf20Sopenharmony_ci					 unsigned long *largest_off,
11188c2ecf20Sopenharmony_ci					 unsigned long *largest_bits)
11198c2ecf20Sopenharmony_ci{
11208c2ecf20Sopenharmony_ci	unsigned long index, end, i, area_off, area_bits;
11218c2ecf20Sopenharmony_ciagain:
11228c2ecf20Sopenharmony_ci	index = find_next_zero_bit(map, size, start);
11238c2ecf20Sopenharmony_ci
11248c2ecf20Sopenharmony_ci	/* Align allocation */
11258c2ecf20Sopenharmony_ci	index = __ALIGN_MASK(index, align_mask);
11268c2ecf20Sopenharmony_ci	area_off = index;
11278c2ecf20Sopenharmony_ci
11288c2ecf20Sopenharmony_ci	end = index + nr;
11298c2ecf20Sopenharmony_ci	if (end > size)
11308c2ecf20Sopenharmony_ci		return end;
11318c2ecf20Sopenharmony_ci	i = find_next_bit(map, end, index);
11328c2ecf20Sopenharmony_ci	if (i < end) {
11338c2ecf20Sopenharmony_ci		area_bits = i - area_off;
11348c2ecf20Sopenharmony_ci		/* remember largest unused area with best alignment */
11358c2ecf20Sopenharmony_ci		if (area_bits > *largest_bits ||
11368c2ecf20Sopenharmony_ci		    (area_bits == *largest_bits && *largest_off &&
11378c2ecf20Sopenharmony_ci		     (!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
11388c2ecf20Sopenharmony_ci			*largest_off = area_off;
11398c2ecf20Sopenharmony_ci			*largest_bits = area_bits;
11408c2ecf20Sopenharmony_ci		}
11418c2ecf20Sopenharmony_ci
11428c2ecf20Sopenharmony_ci		start = i + 1;
11438c2ecf20Sopenharmony_ci		goto again;
11448c2ecf20Sopenharmony_ci	}
11458c2ecf20Sopenharmony_ci	return index;
11468c2ecf20Sopenharmony_ci}
11478c2ecf20Sopenharmony_ci
11488c2ecf20Sopenharmony_ci/**
11498c2ecf20Sopenharmony_ci * pcpu_alloc_area - allocates an area from a pcpu_chunk
11508c2ecf20Sopenharmony_ci * @chunk: chunk of interest
11518c2ecf20Sopenharmony_ci * @alloc_bits: size of request in allocation units
11528c2ecf20Sopenharmony_ci * @align: alignment of area (max PAGE_SIZE)
11538c2ecf20Sopenharmony_ci * @start: bit_off to start searching
11548c2ecf20Sopenharmony_ci *
11558c2ecf20Sopenharmony_ci * This function takes in a @start offset to begin searching to fit an
11568c2ecf20Sopenharmony_ci * allocation of @alloc_bits with alignment @align.  It needs to scan
11578c2ecf20Sopenharmony_ci * the allocation map because if it fits within the block's contig hint,
11588c2ecf20Sopenharmony_ci * @start will be block->first_free. This is an attempt to fill the
11598c2ecf20Sopenharmony_ci * allocation prior to breaking the contig hint.  The allocation and
11608c2ecf20Sopenharmony_ci * boundary maps are updated accordingly if it confirms a valid
11618c2ecf20Sopenharmony_ci * free area.
11628c2ecf20Sopenharmony_ci *
11638c2ecf20Sopenharmony_ci * RETURNS:
11648c2ecf20Sopenharmony_ci * Allocated addr offset in @chunk on success.
11658c2ecf20Sopenharmony_ci * -1 if no matching area is found.
11668c2ecf20Sopenharmony_ci */
11678c2ecf20Sopenharmony_cistatic int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
11688c2ecf20Sopenharmony_ci			   size_t align, int start)
11698c2ecf20Sopenharmony_ci{
11708c2ecf20Sopenharmony_ci	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
11718c2ecf20Sopenharmony_ci	size_t align_mask = (align) ? (align - 1) : 0;
11728c2ecf20Sopenharmony_ci	unsigned long area_off = 0, area_bits = 0;
11738c2ecf20Sopenharmony_ci	int bit_off, end, oslot;
11748c2ecf20Sopenharmony_ci
11758c2ecf20Sopenharmony_ci	lockdep_assert_held(&pcpu_lock);
11768c2ecf20Sopenharmony_ci
11778c2ecf20Sopenharmony_ci	oslot = pcpu_chunk_slot(chunk);
11788c2ecf20Sopenharmony_ci
11798c2ecf20Sopenharmony_ci	/*
11808c2ecf20Sopenharmony_ci	 * Search to find a fit.
11818c2ecf20Sopenharmony_ci	 */
11828c2ecf20Sopenharmony_ci	end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
11838c2ecf20Sopenharmony_ci		    pcpu_chunk_map_bits(chunk));
11848c2ecf20Sopenharmony_ci	bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
11858c2ecf20Sopenharmony_ci				      align_mask, &area_off, &area_bits);
11868c2ecf20Sopenharmony_ci	if (bit_off >= end)
11878c2ecf20Sopenharmony_ci		return -1;
11888c2ecf20Sopenharmony_ci
11898c2ecf20Sopenharmony_ci	if (area_bits)
11908c2ecf20Sopenharmony_ci		pcpu_block_update_scan(chunk, area_off, area_bits);
11918c2ecf20Sopenharmony_ci
11928c2ecf20Sopenharmony_ci	/* update alloc map */
11938c2ecf20Sopenharmony_ci	bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
11948c2ecf20Sopenharmony_ci
11958c2ecf20Sopenharmony_ci	/* update boundary map */
11968c2ecf20Sopenharmony_ci	set_bit(bit_off, chunk->bound_map);
11978c2ecf20Sopenharmony_ci	bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
11988c2ecf20Sopenharmony_ci	set_bit(bit_off + alloc_bits, chunk->bound_map);
11998c2ecf20Sopenharmony_ci
12008c2ecf20Sopenharmony_ci	chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
12018c2ecf20Sopenharmony_ci
12028c2ecf20Sopenharmony_ci	/* update first free bit */
12038c2ecf20Sopenharmony_ci	if (bit_off == chunk_md->first_free)
12048c2ecf20Sopenharmony_ci		chunk_md->first_free = find_next_zero_bit(
12058c2ecf20Sopenharmony_ci					chunk->alloc_map,
12068c2ecf20Sopenharmony_ci					pcpu_chunk_map_bits(chunk),
12078c2ecf20Sopenharmony_ci					bit_off + alloc_bits);
12088c2ecf20Sopenharmony_ci
12098c2ecf20Sopenharmony_ci	pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
12108c2ecf20Sopenharmony_ci
12118c2ecf20Sopenharmony_ci	pcpu_chunk_relocate(chunk, oslot);
12128c2ecf20Sopenharmony_ci
12138c2ecf20Sopenharmony_ci	return bit_off * PCPU_MIN_ALLOC_SIZE;
12148c2ecf20Sopenharmony_ci}
12158c2ecf20Sopenharmony_ci
12168c2ecf20Sopenharmony_ci/**
12178c2ecf20Sopenharmony_ci * pcpu_free_area - frees the corresponding offset
12188c2ecf20Sopenharmony_ci * @chunk: chunk of interest
12198c2ecf20Sopenharmony_ci * @off: addr offset into chunk
12208c2ecf20Sopenharmony_ci *
12218c2ecf20Sopenharmony_ci * This function determines the size of an allocation to free using
12228c2ecf20Sopenharmony_ci * the boundary bitmap and clears the allocation map.
12238c2ecf20Sopenharmony_ci *
12248c2ecf20Sopenharmony_ci * RETURNS:
12258c2ecf20Sopenharmony_ci * Number of freed bytes.
12268c2ecf20Sopenharmony_ci */
12278c2ecf20Sopenharmony_cistatic int pcpu_free_area(struct pcpu_chunk *chunk, int off)
12288c2ecf20Sopenharmony_ci{
12298c2ecf20Sopenharmony_ci	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
12308c2ecf20Sopenharmony_ci	int bit_off, bits, end, oslot, freed;
12318c2ecf20Sopenharmony_ci
12328c2ecf20Sopenharmony_ci	lockdep_assert_held(&pcpu_lock);
12338c2ecf20Sopenharmony_ci	pcpu_stats_area_dealloc(chunk);
12348c2ecf20Sopenharmony_ci
12358c2ecf20Sopenharmony_ci	oslot = pcpu_chunk_slot(chunk);
12368c2ecf20Sopenharmony_ci
12378c2ecf20Sopenharmony_ci	bit_off = off / PCPU_MIN_ALLOC_SIZE;
12388c2ecf20Sopenharmony_ci
12398c2ecf20Sopenharmony_ci	/* find end index */
12408c2ecf20Sopenharmony_ci	end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
12418c2ecf20Sopenharmony_ci			    bit_off + 1);
12428c2ecf20Sopenharmony_ci	bits = end - bit_off;
12438c2ecf20Sopenharmony_ci	bitmap_clear(chunk->alloc_map, bit_off, bits);
12448c2ecf20Sopenharmony_ci
12458c2ecf20Sopenharmony_ci	freed = bits * PCPU_MIN_ALLOC_SIZE;
12468c2ecf20Sopenharmony_ci
12478c2ecf20Sopenharmony_ci	/* update metadata */
12488c2ecf20Sopenharmony_ci	chunk->free_bytes += freed;
12498c2ecf20Sopenharmony_ci
12508c2ecf20Sopenharmony_ci	/* update first free bit */
12518c2ecf20Sopenharmony_ci	chunk_md->first_free = min(chunk_md->first_free, bit_off);
12528c2ecf20Sopenharmony_ci
12538c2ecf20Sopenharmony_ci	pcpu_block_update_hint_free(chunk, bit_off, bits);
12548c2ecf20Sopenharmony_ci
12558c2ecf20Sopenharmony_ci	pcpu_chunk_relocate(chunk, oslot);
12568c2ecf20Sopenharmony_ci
12578c2ecf20Sopenharmony_ci	return freed;
12588c2ecf20Sopenharmony_ci}
12598c2ecf20Sopenharmony_ci
12608c2ecf20Sopenharmony_cistatic void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
12618c2ecf20Sopenharmony_ci{
12628c2ecf20Sopenharmony_ci	block->scan_hint = 0;
12638c2ecf20Sopenharmony_ci	block->contig_hint = nr_bits;
12648c2ecf20Sopenharmony_ci	block->left_free = nr_bits;
12658c2ecf20Sopenharmony_ci	block->right_free = nr_bits;
12668c2ecf20Sopenharmony_ci	block->first_free = 0;
12678c2ecf20Sopenharmony_ci	block->nr_bits = nr_bits;
12688c2ecf20Sopenharmony_ci}
12698c2ecf20Sopenharmony_ci
12708c2ecf20Sopenharmony_cistatic void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
12718c2ecf20Sopenharmony_ci{
12728c2ecf20Sopenharmony_ci	struct pcpu_block_md *md_block;
12738c2ecf20Sopenharmony_ci
12748c2ecf20Sopenharmony_ci	/* init the chunk's block */
12758c2ecf20Sopenharmony_ci	pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));
12768c2ecf20Sopenharmony_ci
12778c2ecf20Sopenharmony_ci	for (md_block = chunk->md_blocks;
12788c2ecf20Sopenharmony_ci	     md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
12798c2ecf20Sopenharmony_ci	     md_block++)
12808c2ecf20Sopenharmony_ci		pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
12818c2ecf20Sopenharmony_ci}
12828c2ecf20Sopenharmony_ci
12838c2ecf20Sopenharmony_ci/**
12848c2ecf20Sopenharmony_ci * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
12858c2ecf20Sopenharmony_ci * @tmp_addr: the start of the region served
12868c2ecf20Sopenharmony_ci * @map_size: size of the region served
12878c2ecf20Sopenharmony_ci *
12888c2ecf20Sopenharmony_ci * This is responsible for creating the chunks that serve the first chunk.  The
12898c2ecf20Sopenharmony_ci * base_addr is page aligned down of @tmp_addr while the region end is page
12908c2ecf20Sopenharmony_ci * aligned up.  Offsets are kept track of to determine the region served. All
12918c2ecf20Sopenharmony_ci * this is done to appease the bitmap allocator in avoiding partial blocks.
12928c2ecf20Sopenharmony_ci *
12938c2ecf20Sopenharmony_ci * RETURNS:
12948c2ecf20Sopenharmony_ci * Chunk serving the region at @tmp_addr of @map_size.
12958c2ecf20Sopenharmony_ci */
12968c2ecf20Sopenharmony_cistatic struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
12978c2ecf20Sopenharmony_ci							 int map_size)
12988c2ecf20Sopenharmony_ci{
12998c2ecf20Sopenharmony_ci	struct pcpu_chunk *chunk;
13008c2ecf20Sopenharmony_ci	unsigned long aligned_addr, lcm_align;
13018c2ecf20Sopenharmony_ci	int start_offset, offset_bits, region_size, region_bits;
13028c2ecf20Sopenharmony_ci	size_t alloc_size;
13038c2ecf20Sopenharmony_ci
13048c2ecf20Sopenharmony_ci	/* region calculations */
13058c2ecf20Sopenharmony_ci	aligned_addr = tmp_addr & PAGE_MASK;
13068c2ecf20Sopenharmony_ci
13078c2ecf20Sopenharmony_ci	start_offset = tmp_addr - aligned_addr;
13088c2ecf20Sopenharmony_ci
13098c2ecf20Sopenharmony_ci	/*
13108c2ecf20Sopenharmony_ci	 * Align the end of the region with the LCM of PAGE_SIZE and
13118c2ecf20Sopenharmony_ci	 * PCPU_BITMAP_BLOCK_SIZE.  One of these constants is a multiple of
13128c2ecf20Sopenharmony_ci	 * the other.
13138c2ecf20Sopenharmony_ci	 */
13148c2ecf20Sopenharmony_ci	lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
13158c2ecf20Sopenharmony_ci	region_size = ALIGN(start_offset + map_size, lcm_align);
13168c2ecf20Sopenharmony_ci
13178c2ecf20Sopenharmony_ci	/* allocate chunk */
13188c2ecf20Sopenharmony_ci	alloc_size = struct_size(chunk, populated,
13198c2ecf20Sopenharmony_ci				 BITS_TO_LONGS(region_size >> PAGE_SHIFT));
13208c2ecf20Sopenharmony_ci	chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
13218c2ecf20Sopenharmony_ci	if (!chunk)
13228c2ecf20Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
13238c2ecf20Sopenharmony_ci		      alloc_size);
13248c2ecf20Sopenharmony_ci
13258c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&chunk->list);
13268c2ecf20Sopenharmony_ci
13278c2ecf20Sopenharmony_ci	chunk->base_addr = (void *)aligned_addr;
13288c2ecf20Sopenharmony_ci	chunk->start_offset = start_offset;
13298c2ecf20Sopenharmony_ci	chunk->end_offset = region_size - chunk->start_offset - map_size;
13308c2ecf20Sopenharmony_ci
13318c2ecf20Sopenharmony_ci	chunk->nr_pages = region_size >> PAGE_SHIFT;
13328c2ecf20Sopenharmony_ci	region_bits = pcpu_chunk_map_bits(chunk);
13338c2ecf20Sopenharmony_ci
13348c2ecf20Sopenharmony_ci	alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
13358c2ecf20Sopenharmony_ci	chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
13368c2ecf20Sopenharmony_ci	if (!chunk->alloc_map)
13378c2ecf20Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
13388c2ecf20Sopenharmony_ci		      alloc_size);
13398c2ecf20Sopenharmony_ci
13408c2ecf20Sopenharmony_ci	alloc_size =
13418c2ecf20Sopenharmony_ci		BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
13428c2ecf20Sopenharmony_ci	chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
13438c2ecf20Sopenharmony_ci	if (!chunk->bound_map)
13448c2ecf20Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
13458c2ecf20Sopenharmony_ci		      alloc_size);
13468c2ecf20Sopenharmony_ci
13478c2ecf20Sopenharmony_ci	alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
13488c2ecf20Sopenharmony_ci	chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
13498c2ecf20Sopenharmony_ci	if (!chunk->md_blocks)
13508c2ecf20Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
13518c2ecf20Sopenharmony_ci		      alloc_size);
13528c2ecf20Sopenharmony_ci
13538c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
13548c2ecf20Sopenharmony_ci	/* first chunk isn't memcg-aware */
13558c2ecf20Sopenharmony_ci	chunk->obj_cgroups = NULL;
13568c2ecf20Sopenharmony_ci#endif
13578c2ecf20Sopenharmony_ci	pcpu_init_md_blocks(chunk);
13588c2ecf20Sopenharmony_ci
13598c2ecf20Sopenharmony_ci	/* manage populated page bitmap */
13608c2ecf20Sopenharmony_ci	chunk->immutable = true;
13618c2ecf20Sopenharmony_ci	bitmap_fill(chunk->populated, chunk->nr_pages);
13628c2ecf20Sopenharmony_ci	chunk->nr_populated = chunk->nr_pages;
13638c2ecf20Sopenharmony_ci	chunk->nr_empty_pop_pages = chunk->nr_pages;
13648c2ecf20Sopenharmony_ci
13658c2ecf20Sopenharmony_ci	chunk->free_bytes = map_size;
13668c2ecf20Sopenharmony_ci
13678c2ecf20Sopenharmony_ci	if (chunk->start_offset) {
13688c2ecf20Sopenharmony_ci		/* hide the beginning of the bitmap */
13698c2ecf20Sopenharmony_ci		offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
13708c2ecf20Sopenharmony_ci		bitmap_set(chunk->alloc_map, 0, offset_bits);
13718c2ecf20Sopenharmony_ci		set_bit(0, chunk->bound_map);
13728c2ecf20Sopenharmony_ci		set_bit(offset_bits, chunk->bound_map);
13738c2ecf20Sopenharmony_ci
13748c2ecf20Sopenharmony_ci		chunk->chunk_md.first_free = offset_bits;
13758c2ecf20Sopenharmony_ci
13768c2ecf20Sopenharmony_ci		pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
13778c2ecf20Sopenharmony_ci	}
13788c2ecf20Sopenharmony_ci
13798c2ecf20Sopenharmony_ci	if (chunk->end_offset) {
13808c2ecf20Sopenharmony_ci		/* hide the end of the bitmap */
13818c2ecf20Sopenharmony_ci		offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
13828c2ecf20Sopenharmony_ci		bitmap_set(chunk->alloc_map,
13838c2ecf20Sopenharmony_ci			   pcpu_chunk_map_bits(chunk) - offset_bits,
13848c2ecf20Sopenharmony_ci			   offset_bits);
13858c2ecf20Sopenharmony_ci		set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
13868c2ecf20Sopenharmony_ci			chunk->bound_map);
13878c2ecf20Sopenharmony_ci		set_bit(region_bits, chunk->bound_map);
13888c2ecf20Sopenharmony_ci
13898c2ecf20Sopenharmony_ci		pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
13908c2ecf20Sopenharmony_ci					     - offset_bits, offset_bits);
13918c2ecf20Sopenharmony_ci	}
13928c2ecf20Sopenharmony_ci
13938c2ecf20Sopenharmony_ci	return chunk;
13948c2ecf20Sopenharmony_ci}
13958c2ecf20Sopenharmony_ci
13968c2ecf20Sopenharmony_cistatic struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
13978c2ecf20Sopenharmony_ci{
13988c2ecf20Sopenharmony_ci	struct pcpu_chunk *chunk;
13998c2ecf20Sopenharmony_ci	int region_bits;
14008c2ecf20Sopenharmony_ci
14018c2ecf20Sopenharmony_ci	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
14028c2ecf20Sopenharmony_ci	if (!chunk)
14038c2ecf20Sopenharmony_ci		return NULL;
14048c2ecf20Sopenharmony_ci
14058c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&chunk->list);
14068c2ecf20Sopenharmony_ci	chunk->nr_pages = pcpu_unit_pages;
14078c2ecf20Sopenharmony_ci	region_bits = pcpu_chunk_map_bits(chunk);
14088c2ecf20Sopenharmony_ci
14098c2ecf20Sopenharmony_ci	chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
14108c2ecf20Sopenharmony_ci					   sizeof(chunk->alloc_map[0]), gfp);
14118c2ecf20Sopenharmony_ci	if (!chunk->alloc_map)
14128c2ecf20Sopenharmony_ci		goto alloc_map_fail;
14138c2ecf20Sopenharmony_ci
14148c2ecf20Sopenharmony_ci	chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
14158c2ecf20Sopenharmony_ci					   sizeof(chunk->bound_map[0]), gfp);
14168c2ecf20Sopenharmony_ci	if (!chunk->bound_map)
14178c2ecf20Sopenharmony_ci		goto bound_map_fail;
14188c2ecf20Sopenharmony_ci
14198c2ecf20Sopenharmony_ci	chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
14208c2ecf20Sopenharmony_ci					   sizeof(chunk->md_blocks[0]), gfp);
14218c2ecf20Sopenharmony_ci	if (!chunk->md_blocks)
14228c2ecf20Sopenharmony_ci		goto md_blocks_fail;
14238c2ecf20Sopenharmony_ci
14248c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
14258c2ecf20Sopenharmony_ci	if (pcpu_is_memcg_chunk(type)) {
14268c2ecf20Sopenharmony_ci		chunk->obj_cgroups =
14278c2ecf20Sopenharmony_ci			pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
14288c2ecf20Sopenharmony_ci					sizeof(struct obj_cgroup *), gfp);
14298c2ecf20Sopenharmony_ci		if (!chunk->obj_cgroups)
14308c2ecf20Sopenharmony_ci			goto objcg_fail;
14318c2ecf20Sopenharmony_ci	}
14328c2ecf20Sopenharmony_ci#endif
14338c2ecf20Sopenharmony_ci
14348c2ecf20Sopenharmony_ci	pcpu_init_md_blocks(chunk);
14358c2ecf20Sopenharmony_ci
14368c2ecf20Sopenharmony_ci	/* init metadata */
14378c2ecf20Sopenharmony_ci	chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
14388c2ecf20Sopenharmony_ci
14398c2ecf20Sopenharmony_ci	return chunk;
14408c2ecf20Sopenharmony_ci
14418c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
14428c2ecf20Sopenharmony_ciobjcg_fail:
14438c2ecf20Sopenharmony_ci	pcpu_mem_free(chunk->md_blocks);
14448c2ecf20Sopenharmony_ci#endif
14458c2ecf20Sopenharmony_cimd_blocks_fail:
14468c2ecf20Sopenharmony_ci	pcpu_mem_free(chunk->bound_map);
14478c2ecf20Sopenharmony_cibound_map_fail:
14488c2ecf20Sopenharmony_ci	pcpu_mem_free(chunk->alloc_map);
14498c2ecf20Sopenharmony_cialloc_map_fail:
14508c2ecf20Sopenharmony_ci	pcpu_mem_free(chunk);
14518c2ecf20Sopenharmony_ci
14528c2ecf20Sopenharmony_ci	return NULL;
14538c2ecf20Sopenharmony_ci}
14548c2ecf20Sopenharmony_ci
14558c2ecf20Sopenharmony_cistatic void pcpu_free_chunk(struct pcpu_chunk *chunk)
14568c2ecf20Sopenharmony_ci{
14578c2ecf20Sopenharmony_ci	if (!chunk)
14588c2ecf20Sopenharmony_ci		return;
14598c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
14608c2ecf20Sopenharmony_ci	pcpu_mem_free(chunk->obj_cgroups);
14618c2ecf20Sopenharmony_ci#endif
14628c2ecf20Sopenharmony_ci	pcpu_mem_free(chunk->md_blocks);
14638c2ecf20Sopenharmony_ci	pcpu_mem_free(chunk->bound_map);
14648c2ecf20Sopenharmony_ci	pcpu_mem_free(chunk->alloc_map);
14658c2ecf20Sopenharmony_ci	pcpu_mem_free(chunk);
14668c2ecf20Sopenharmony_ci}
14678c2ecf20Sopenharmony_ci
14688c2ecf20Sopenharmony_ci/**
14698c2ecf20Sopenharmony_ci * pcpu_chunk_populated - post-population bookkeeping
14708c2ecf20Sopenharmony_ci * @chunk: pcpu_chunk which got populated
14718c2ecf20Sopenharmony_ci * @page_start: the start page
14728c2ecf20Sopenharmony_ci * @page_end: the end page
14738c2ecf20Sopenharmony_ci *
14748c2ecf20Sopenharmony_ci * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
14758c2ecf20Sopenharmony_ci * the bookkeeping information accordingly.  Must be called after each
14768c2ecf20Sopenharmony_ci * successful population.
14778c2ecf20Sopenharmony_ci *
14788c2ecf20Sopenharmony_ci * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it
14798c2ecf20Sopenharmony_ci * is to serve an allocation in that area.
14808c2ecf20Sopenharmony_ci */
14818c2ecf20Sopenharmony_cistatic void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
14828c2ecf20Sopenharmony_ci				 int page_end)
14838c2ecf20Sopenharmony_ci{
14848c2ecf20Sopenharmony_ci	int nr = page_end - page_start;
14858c2ecf20Sopenharmony_ci
14868c2ecf20Sopenharmony_ci	lockdep_assert_held(&pcpu_lock);
14878c2ecf20Sopenharmony_ci
14888c2ecf20Sopenharmony_ci	bitmap_set(chunk->populated, page_start, nr);
14898c2ecf20Sopenharmony_ci	chunk->nr_populated += nr;
14908c2ecf20Sopenharmony_ci	pcpu_nr_populated += nr;
14918c2ecf20Sopenharmony_ci
14928c2ecf20Sopenharmony_ci	pcpu_update_empty_pages(chunk, nr);
14938c2ecf20Sopenharmony_ci}
14948c2ecf20Sopenharmony_ci
14958c2ecf20Sopenharmony_ci/**
14968c2ecf20Sopenharmony_ci * pcpu_chunk_depopulated - post-depopulation bookkeeping
14978c2ecf20Sopenharmony_ci * @chunk: pcpu_chunk which got depopulated
14988c2ecf20Sopenharmony_ci * @page_start: the start page
14998c2ecf20Sopenharmony_ci * @page_end: the end page
15008c2ecf20Sopenharmony_ci *
15018c2ecf20Sopenharmony_ci * Pages in [@page_start,@page_end) have been depopulated from @chunk.
15028c2ecf20Sopenharmony_ci * Update the bookkeeping information accordingly.  Must be called after
15038c2ecf20Sopenharmony_ci * each successful depopulation.
15048c2ecf20Sopenharmony_ci */
15058c2ecf20Sopenharmony_cistatic void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
15068c2ecf20Sopenharmony_ci				   int page_start, int page_end)
15078c2ecf20Sopenharmony_ci{
15088c2ecf20Sopenharmony_ci	int nr = page_end - page_start;
15098c2ecf20Sopenharmony_ci
15108c2ecf20Sopenharmony_ci	lockdep_assert_held(&pcpu_lock);
15118c2ecf20Sopenharmony_ci
15128c2ecf20Sopenharmony_ci	bitmap_clear(chunk->populated, page_start, nr);
15138c2ecf20Sopenharmony_ci	chunk->nr_populated -= nr;
15148c2ecf20Sopenharmony_ci	pcpu_nr_populated -= nr;
15158c2ecf20Sopenharmony_ci
15168c2ecf20Sopenharmony_ci	pcpu_update_empty_pages(chunk, -nr);
15178c2ecf20Sopenharmony_ci}
15188c2ecf20Sopenharmony_ci
15198c2ecf20Sopenharmony_ci/*
15208c2ecf20Sopenharmony_ci * Chunk management implementation.
15218c2ecf20Sopenharmony_ci *
15228c2ecf20Sopenharmony_ci * To allow different implementations, chunk alloc/free and
15238c2ecf20Sopenharmony_ci * [de]population are implemented in a separate file which is pulled
15248c2ecf20Sopenharmony_ci * into this file and compiled together.  The following functions
15258c2ecf20Sopenharmony_ci * should be implemented.
15268c2ecf20Sopenharmony_ci *
15278c2ecf20Sopenharmony_ci * pcpu_populate_chunk		- populate the specified range of a chunk
15288c2ecf20Sopenharmony_ci * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
15298c2ecf20Sopenharmony_ci * pcpu_create_chunk		- create a new chunk
15308c2ecf20Sopenharmony_ci * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
15318c2ecf20Sopenharmony_ci * pcpu_addr_to_page		- translate address to physical address
15328c2ecf20Sopenharmony_ci * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
15338c2ecf20Sopenharmony_ci */
15348c2ecf20Sopenharmony_cistatic int pcpu_populate_chunk(struct pcpu_chunk *chunk,
15358c2ecf20Sopenharmony_ci			       int page_start, int page_end, gfp_t gfp);
15368c2ecf20Sopenharmony_cistatic void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
15378c2ecf20Sopenharmony_ci				  int page_start, int page_end);
15388c2ecf20Sopenharmony_cistatic struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
15398c2ecf20Sopenharmony_ci					    gfp_t gfp);
15408c2ecf20Sopenharmony_cistatic void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
15418c2ecf20Sopenharmony_cistatic struct page *pcpu_addr_to_page(void *addr);
15428c2ecf20Sopenharmony_cistatic int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
15438c2ecf20Sopenharmony_ci
15448c2ecf20Sopenharmony_ci#ifdef CONFIG_NEED_PER_CPU_KM
15458c2ecf20Sopenharmony_ci#include "percpu-km.c"
15468c2ecf20Sopenharmony_ci#else
15478c2ecf20Sopenharmony_ci#include "percpu-vm.c"
15488c2ecf20Sopenharmony_ci#endif
15498c2ecf20Sopenharmony_ci
15508c2ecf20Sopenharmony_ci/**
15518c2ecf20Sopenharmony_ci * pcpu_chunk_addr_search - determine chunk containing specified address
15528c2ecf20Sopenharmony_ci * @addr: address for which the chunk needs to be determined.
15538c2ecf20Sopenharmony_ci *
15548c2ecf20Sopenharmony_ci * This is an internal function that handles all but static allocations.
15558c2ecf20Sopenharmony_ci * Static percpu address values should never be passed into the allocator.
15568c2ecf20Sopenharmony_ci *
15578c2ecf20Sopenharmony_ci * RETURNS:
15588c2ecf20Sopenharmony_ci * The address of the found chunk.
15598c2ecf20Sopenharmony_ci */
15608c2ecf20Sopenharmony_cistatic struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
15618c2ecf20Sopenharmony_ci{
15628c2ecf20Sopenharmony_ci	/* is it in the dynamic region (first chunk)? */
15638c2ecf20Sopenharmony_ci	if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
15648c2ecf20Sopenharmony_ci		return pcpu_first_chunk;
15658c2ecf20Sopenharmony_ci
15668c2ecf20Sopenharmony_ci	/* is it in the reserved region? */
15678c2ecf20Sopenharmony_ci	if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
15688c2ecf20Sopenharmony_ci		return pcpu_reserved_chunk;
15698c2ecf20Sopenharmony_ci
15708c2ecf20Sopenharmony_ci	/*
15718c2ecf20Sopenharmony_ci	 * The address is relative to unit0 which might be unused and
15728c2ecf20Sopenharmony_ci	 * thus unmapped.  Offset the address to the unit space of the
15738c2ecf20Sopenharmony_ci	 * current processor before looking it up in the vmalloc
15748c2ecf20Sopenharmony_ci	 * space.  Note that any possible cpu id can be used here, so
15758c2ecf20Sopenharmony_ci	 * there's no need to worry about preemption or cpu hotplug.
15768c2ecf20Sopenharmony_ci	 */
15778c2ecf20Sopenharmony_ci	addr += pcpu_unit_offsets[raw_smp_processor_id()];
15788c2ecf20Sopenharmony_ci	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
15798c2ecf20Sopenharmony_ci}
15808c2ecf20Sopenharmony_ci
15818c2ecf20Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM
15828c2ecf20Sopenharmony_cistatic enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
15838c2ecf20Sopenharmony_ci						     struct obj_cgroup **objcgp)
15848c2ecf20Sopenharmony_ci{
15858c2ecf20Sopenharmony_ci	struct obj_cgroup *objcg;
15868c2ecf20Sopenharmony_ci
15878c2ecf20Sopenharmony_ci	if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
15888c2ecf20Sopenharmony_ci		return PCPU_CHUNK_ROOT;
15898c2ecf20Sopenharmony_ci
15908c2ecf20Sopenharmony_ci	objcg = get_obj_cgroup_from_current();
15918c2ecf20Sopenharmony_ci	if (!objcg)
15928c2ecf20Sopenharmony_ci		return PCPU_CHUNK_ROOT;
15938c2ecf20Sopenharmony_ci
15948c2ecf20Sopenharmony_ci	if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
15958c2ecf20Sopenharmony_ci		obj_cgroup_put(objcg);
15968c2ecf20Sopenharmony_ci		return PCPU_FAIL_ALLOC;
15978c2ecf20Sopenharmony_ci	}
15988c2ecf20Sopenharmony_ci
15998c2ecf20Sopenharmony_ci	*objcgp = objcg;
16008c2ecf20Sopenharmony_ci	return PCPU_CHUNK_MEMCG;
16018c2ecf20Sopenharmony_ci}
16028c2ecf20Sopenharmony_ci
16038c2ecf20Sopenharmony_cistatic void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
16048c2ecf20Sopenharmony_ci				       struct pcpu_chunk *chunk, int off,
16058c2ecf20Sopenharmony_ci				       size_t size)
16068c2ecf20Sopenharmony_ci{
16078c2ecf20Sopenharmony_ci	if (!objcg)
16088c2ecf20Sopenharmony_ci		return;
16098c2ecf20Sopenharmony_ci
16108c2ecf20Sopenharmony_ci	if (chunk) {
16118c2ecf20Sopenharmony_ci		chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
16128c2ecf20Sopenharmony_ci
16138c2ecf20Sopenharmony_ci		rcu_read_lock();
16148c2ecf20Sopenharmony_ci		mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
16158c2ecf20Sopenharmony_ci				size * num_possible_cpus());
16168c2ecf20Sopenharmony_ci		rcu_read_unlock();
16178c2ecf20Sopenharmony_ci	} else {
16188c2ecf20Sopenharmony_ci		obj_cgroup_uncharge(objcg, size * num_possible_cpus());
16198c2ecf20Sopenharmony_ci		obj_cgroup_put(objcg);
16208c2ecf20Sopenharmony_ci	}
16218c2ecf20Sopenharmony_ci}
16228c2ecf20Sopenharmony_ci
16238c2ecf20Sopenharmony_cistatic void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
16248c2ecf20Sopenharmony_ci{
16258c2ecf20Sopenharmony_ci	struct obj_cgroup *objcg;
16268c2ecf20Sopenharmony_ci
16278c2ecf20Sopenharmony_ci	if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)))
16288c2ecf20Sopenharmony_ci		return;
16298c2ecf20Sopenharmony_ci
16308c2ecf20Sopenharmony_ci	objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
16318c2ecf20Sopenharmony_ci	chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
16328c2ecf20Sopenharmony_ci
16338c2ecf20Sopenharmony_ci	obj_cgroup_uncharge(objcg, size * num_possible_cpus());
16348c2ecf20Sopenharmony_ci
16358c2ecf20Sopenharmony_ci	rcu_read_lock();
16368c2ecf20Sopenharmony_ci	mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
16378c2ecf20Sopenharmony_ci			-(size * num_possible_cpus()));
16388c2ecf20Sopenharmony_ci	rcu_read_unlock();
16398c2ecf20Sopenharmony_ci
16408c2ecf20Sopenharmony_ci	obj_cgroup_put(objcg);
16418c2ecf20Sopenharmony_ci}
16428c2ecf20Sopenharmony_ci
16438c2ecf20Sopenharmony_ci#else /* CONFIG_MEMCG_KMEM */
16448c2ecf20Sopenharmony_cistatic enum pcpu_chunk_type
16458c2ecf20Sopenharmony_cipcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
16468c2ecf20Sopenharmony_ci{
16478c2ecf20Sopenharmony_ci	return PCPU_CHUNK_ROOT;
16488c2ecf20Sopenharmony_ci}
16498c2ecf20Sopenharmony_ci
16508c2ecf20Sopenharmony_cistatic void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
16518c2ecf20Sopenharmony_ci				       struct pcpu_chunk *chunk, int off,
16528c2ecf20Sopenharmony_ci				       size_t size)
16538c2ecf20Sopenharmony_ci{
16548c2ecf20Sopenharmony_ci}
16558c2ecf20Sopenharmony_ci
16568c2ecf20Sopenharmony_cistatic void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
16578c2ecf20Sopenharmony_ci{
16588c2ecf20Sopenharmony_ci}
16598c2ecf20Sopenharmony_ci#endif /* CONFIG_MEMCG_KMEM */
16608c2ecf20Sopenharmony_ci
16618c2ecf20Sopenharmony_ci/**
16628c2ecf20Sopenharmony_ci * pcpu_alloc - the percpu allocator
16638c2ecf20Sopenharmony_ci * @size: size of area to allocate in bytes
16648c2ecf20Sopenharmony_ci * @align: alignment of area (max PAGE_SIZE)
16658c2ecf20Sopenharmony_ci * @reserved: allocate from the reserved chunk if available
16668c2ecf20Sopenharmony_ci * @gfp: allocation flags
16678c2ecf20Sopenharmony_ci *
16688c2ecf20Sopenharmony_ci * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
16698c2ecf20Sopenharmony_ci * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
16708c2ecf20Sopenharmony_ci * then no warning will be triggered on invalid or failed allocation
16718c2ecf20Sopenharmony_ci * requests.
16728c2ecf20Sopenharmony_ci *
16738c2ecf20Sopenharmony_ci * RETURNS:
16748c2ecf20Sopenharmony_ci * Percpu pointer to the allocated area on success, NULL on failure.
16758c2ecf20Sopenharmony_ci */
16768c2ecf20Sopenharmony_cistatic void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
16778c2ecf20Sopenharmony_ci				 gfp_t gfp)
16788c2ecf20Sopenharmony_ci{
16798c2ecf20Sopenharmony_ci	gfp_t pcpu_gfp;
16808c2ecf20Sopenharmony_ci	bool is_atomic;
16818c2ecf20Sopenharmony_ci	bool do_warn;
16828c2ecf20Sopenharmony_ci	enum pcpu_chunk_type type;
16838c2ecf20Sopenharmony_ci	struct list_head *pcpu_slot;
16848c2ecf20Sopenharmony_ci	struct obj_cgroup *objcg = NULL;
16858c2ecf20Sopenharmony_ci	static int warn_limit = 10;
16868c2ecf20Sopenharmony_ci	struct pcpu_chunk *chunk, *next;
16878c2ecf20Sopenharmony_ci	const char *err;
16888c2ecf20Sopenharmony_ci	int slot, off, cpu, ret;
16898c2ecf20Sopenharmony_ci	unsigned long flags;
16908c2ecf20Sopenharmony_ci	void __percpu *ptr;
16918c2ecf20Sopenharmony_ci	size_t bits, bit_align;
16928c2ecf20Sopenharmony_ci
16938c2ecf20Sopenharmony_ci	gfp = current_gfp_context(gfp);
16948c2ecf20Sopenharmony_ci	/* whitelisted flags that can be passed to the backing allocators */
16958c2ecf20Sopenharmony_ci	pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
16968c2ecf20Sopenharmony_ci	is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
16978c2ecf20Sopenharmony_ci	do_warn = !(gfp & __GFP_NOWARN);
16988c2ecf20Sopenharmony_ci
16998c2ecf20Sopenharmony_ci	/*
17008c2ecf20Sopenharmony_ci	 * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
17018c2ecf20Sopenharmony_ci	 * therefore alignment must be a minimum of that many bytes.
17028c2ecf20Sopenharmony_ci	 * An allocation may have internal fragmentation from rounding up
17038c2ecf20Sopenharmony_ci	 * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
17048c2ecf20Sopenharmony_ci	 */
17058c2ecf20Sopenharmony_ci	if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
17068c2ecf20Sopenharmony_ci		align = PCPU_MIN_ALLOC_SIZE;
17078c2ecf20Sopenharmony_ci
17088c2ecf20Sopenharmony_ci	size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
17098c2ecf20Sopenharmony_ci	bits = size >> PCPU_MIN_ALLOC_SHIFT;
17108c2ecf20Sopenharmony_ci	bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
17118c2ecf20Sopenharmony_ci
17128c2ecf20Sopenharmony_ci	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
17138c2ecf20Sopenharmony_ci		     !is_power_of_2(align))) {
17148c2ecf20Sopenharmony_ci		WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
17158c2ecf20Sopenharmony_ci		     size, align);
17168c2ecf20Sopenharmony_ci		return NULL;
17178c2ecf20Sopenharmony_ci	}
17188c2ecf20Sopenharmony_ci
	/* picks the chunk type (root vs memcg) and charges the objcg if any */
17198c2ecf20Sopenharmony_ci	type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg);
17208c2ecf20Sopenharmony_ci	if (unlikely(type == PCPU_FAIL_ALLOC))
17218c2ecf20Sopenharmony_ci		return NULL;
17228c2ecf20Sopenharmony_ci	pcpu_slot = pcpu_chunk_list(type);
17238c2ecf20Sopenharmony_ci
17248c2ecf20Sopenharmony_ci	if (!is_atomic) {
17258c2ecf20Sopenharmony_ci		/*
17268c2ecf20Sopenharmony_ci		 * pcpu_balance_workfn() allocates memory under this mutex,
17278c2ecf20Sopenharmony_ci		 * and it may wait for memory reclaim. Allow current task
17288c2ecf20Sopenharmony_ci		 * to become OOM victim, in case of memory pressure.
17298c2ecf20Sopenharmony_ci		 */
17308c2ecf20Sopenharmony_ci		if (gfp & __GFP_NOFAIL) {
17318c2ecf20Sopenharmony_ci			mutex_lock(&pcpu_alloc_mutex);
17328c2ecf20Sopenharmony_ci		} else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
17338c2ecf20Sopenharmony_ci			pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
17348c2ecf20Sopenharmony_ci			return NULL;
17358c2ecf20Sopenharmony_ci		}
17368c2ecf20Sopenharmony_ci	}
17378c2ecf20Sopenharmony_ci
17388c2ecf20Sopenharmony_ci	spin_lock_irqsave(&pcpu_lock, flags);
17398c2ecf20Sopenharmony_ci
17408c2ecf20Sopenharmony_ci	/* serve reserved allocations from the reserved chunk if available */
17418c2ecf20Sopenharmony_ci	if (reserved && pcpu_reserved_chunk) {
17428c2ecf20Sopenharmony_ci		chunk = pcpu_reserved_chunk;
17438c2ecf20Sopenharmony_ci
17448c2ecf20Sopenharmony_ci		off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
17458c2ecf20Sopenharmony_ci		if (off < 0) {
17468c2ecf20Sopenharmony_ci			err = "alloc from reserved chunk failed";
17478c2ecf20Sopenharmony_ci			goto fail_unlock;
17488c2ecf20Sopenharmony_ci		}
17498c2ecf20Sopenharmony_ci
17508c2ecf20Sopenharmony_ci		off = pcpu_alloc_area(chunk, bits, bit_align, off);
17518c2ecf20Sopenharmony_ci		if (off >= 0)
17528c2ecf20Sopenharmony_ci			goto area_found;
17538c2ecf20Sopenharmony_ci
17548c2ecf20Sopenharmony_ci		err = "alloc from reserved chunk failed";
17558c2ecf20Sopenharmony_ci		goto fail_unlock;
17568c2ecf20Sopenharmony_ci	}
17578c2ecf20Sopenharmony_ci
17588c2ecf20Sopenharmony_cirestart:
17598c2ecf20Sopenharmony_ci	/* search through normal chunks */
17608c2ecf20Sopenharmony_ci	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
17618c2ecf20Sopenharmony_ci		list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
17628c2ecf20Sopenharmony_ci			off = pcpu_find_block_fit(chunk, bits, bit_align,
17638c2ecf20Sopenharmony_ci						  is_atomic);
17648c2ecf20Sopenharmony_ci			if (off < 0) {
17658c2ecf20Sopenharmony_ci				if (slot < PCPU_SLOT_FAIL_THRESHOLD)
17668c2ecf20Sopenharmony_ci					pcpu_chunk_move(chunk, 0);
17678c2ecf20Sopenharmony_ci				continue;
17688c2ecf20Sopenharmony_ci			}
17698c2ecf20Sopenharmony_ci
17708c2ecf20Sopenharmony_ci			off = pcpu_alloc_area(chunk, bits, bit_align, off);
17718c2ecf20Sopenharmony_ci			if (off >= 0)
17728c2ecf20Sopenharmony_ci				goto area_found;
17738c2ecf20Sopenharmony_ci
17748c2ecf20Sopenharmony_ci		}
17758c2ecf20Sopenharmony_ci	}
17768c2ecf20Sopenharmony_ci
17778c2ecf20Sopenharmony_ci	spin_unlock_irqrestore(&pcpu_lock, flags);
17788c2ecf20Sopenharmony_ci
17798c2ecf20Sopenharmony_ci	/*
17808c2ecf20Sopenharmony_ci	 * No space left.  Create a new chunk.  We don't want multiple
17818c2ecf20Sopenharmony_ci	 * tasks to create chunks simultaneously.  Serialize and create iff
17828c2ecf20Sopenharmony_ci	 * there's still no empty chunk after grabbing the mutex.
17838c2ecf20Sopenharmony_ci	 */
17848c2ecf20Sopenharmony_ci	if (is_atomic) {
17858c2ecf20Sopenharmony_ci		err = "atomic alloc failed, no space left";
17868c2ecf20Sopenharmony_ci		goto fail;
17878c2ecf20Sopenharmony_ci	}
17888c2ecf20Sopenharmony_ci
17898c2ecf20Sopenharmony_ci	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
17908c2ecf20Sopenharmony_ci		chunk = pcpu_create_chunk(type, pcpu_gfp);
17918c2ecf20Sopenharmony_ci		if (!chunk) {
17928c2ecf20Sopenharmony_ci			err = "failed to allocate new chunk";
17938c2ecf20Sopenharmony_ci			goto fail;
17948c2ecf20Sopenharmony_ci		}
17958c2ecf20Sopenharmony_ci
17968c2ecf20Sopenharmony_ci		spin_lock_irqsave(&pcpu_lock, flags);
17978c2ecf20Sopenharmony_ci		pcpu_chunk_relocate(chunk, -1);
17988c2ecf20Sopenharmony_ci	} else {
17998c2ecf20Sopenharmony_ci		spin_lock_irqsave(&pcpu_lock, flags);
18008c2ecf20Sopenharmony_ci	}
18018c2ecf20Sopenharmony_ci
18028c2ecf20Sopenharmony_ci	goto restart;
18038c2ecf20Sopenharmony_ci
18048c2ecf20Sopenharmony_ciarea_found:
	/* @off is now a committed byte offset inside @chunk */
18058c2ecf20Sopenharmony_ci	pcpu_stats_area_alloc(chunk, size);
18068c2ecf20Sopenharmony_ci	spin_unlock_irqrestore(&pcpu_lock, flags);
18078c2ecf20Sopenharmony_ci
18088c2ecf20Sopenharmony_ci	/* populate if not all pages are already there */
18098c2ecf20Sopenharmony_ci	if (!is_atomic) {
18108c2ecf20Sopenharmony_ci		unsigned int page_start, page_end, rs, re;
18118c2ecf20Sopenharmony_ci
18128c2ecf20Sopenharmony_ci		page_start = PFN_DOWN(off);
18138c2ecf20Sopenharmony_ci		page_end = PFN_UP(off + size);
18148c2ecf20Sopenharmony_ci
18158c2ecf20Sopenharmony_ci		bitmap_for_each_clear_region(chunk->populated, rs, re,
18168c2ecf20Sopenharmony_ci					     page_start, page_end) {
18178c2ecf20Sopenharmony_ci			WARN_ON(chunk->immutable);
18188c2ecf20Sopenharmony_ci
18198c2ecf20Sopenharmony_ci			ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
18208c2ecf20Sopenharmony_ci
18218c2ecf20Sopenharmony_ci			spin_lock_irqsave(&pcpu_lock, flags);
18228c2ecf20Sopenharmony_ci			if (ret) {
18238c2ecf20Sopenharmony_ci				pcpu_free_area(chunk, off);
18248c2ecf20Sopenharmony_ci				err = "failed to populate";
18258c2ecf20Sopenharmony_ci				goto fail_unlock;
18268c2ecf20Sopenharmony_ci			}
18278c2ecf20Sopenharmony_ci			pcpu_chunk_populated(chunk, rs, re);
18288c2ecf20Sopenharmony_ci			spin_unlock_irqrestore(&pcpu_lock, flags);
18298c2ecf20Sopenharmony_ci		}
18308c2ecf20Sopenharmony_ci
18318c2ecf20Sopenharmony_ci		mutex_unlock(&pcpu_alloc_mutex);
18328c2ecf20Sopenharmony_ci	}
18338c2ecf20Sopenharmony_ci
18348c2ecf20Sopenharmony_ci	if (pcpu_nr_empty_pop_pages[type] < PCPU_EMPTY_POP_PAGES_LOW)
18358c2ecf20Sopenharmony_ci		pcpu_schedule_balance_work();
18368c2ecf20Sopenharmony_ci
18378c2ecf20Sopenharmony_ci	/* clear the areas and return address relative to base address */
18388c2ecf20Sopenharmony_ci	for_each_possible_cpu(cpu)
18398c2ecf20Sopenharmony_ci		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
18408c2ecf20Sopenharmony_ci
18418c2ecf20Sopenharmony_ci	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
18428c2ecf20Sopenharmony_ci	kmemleak_alloc_percpu(ptr, size, gfp);
18438c2ecf20Sopenharmony_ci
18448c2ecf20Sopenharmony_ci	trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
18458c2ecf20Sopenharmony_ci			chunk->base_addr, off, ptr);
18468c2ecf20Sopenharmony_ci
18478c2ecf20Sopenharmony_ci	pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
18488c2ecf20Sopenharmony_ci
18498c2ecf20Sopenharmony_ci	return ptr;
18508c2ecf20Sopenharmony_ci
18518c2ecf20Sopenharmony_cifail_unlock:
18528c2ecf20Sopenharmony_ci	spin_unlock_irqrestore(&pcpu_lock, flags);
18538c2ecf20Sopenharmony_cifail:
18548c2ecf20Sopenharmony_ci	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
18558c2ecf20Sopenharmony_ci
18568c2ecf20Sopenharmony_ci	if (!is_atomic && do_warn && warn_limit) {
18578c2ecf20Sopenharmony_ci		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
18588c2ecf20Sopenharmony_ci			size, align, is_atomic, err);
18598c2ecf20Sopenharmony_ci		dump_stack();
18608c2ecf20Sopenharmony_ci		if (!--warn_limit)
18618c2ecf20Sopenharmony_ci			pr_info("limit reached, disable warning\n");
18628c2ecf20Sopenharmony_ci	}
18638c2ecf20Sopenharmony_ci	if (is_atomic) {
18648c2ecf20Sopenharmony_ci		/* see the flag handling in pcpu_balance_workfn() */
18658c2ecf20Sopenharmony_ci		pcpu_atomic_alloc_failed = true;
18668c2ecf20Sopenharmony_ci		pcpu_schedule_balance_work();
18678c2ecf20Sopenharmony_ci	} else {
18688c2ecf20Sopenharmony_ci		mutex_unlock(&pcpu_alloc_mutex);
18698c2ecf20Sopenharmony_ci	}
18708c2ecf20Sopenharmony_ci
	/* uncharge the objcg; a NULL chunk tells the hook the alloc failed */
18718c2ecf20Sopenharmony_ci	pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
18728c2ecf20Sopenharmony_ci
18738c2ecf20Sopenharmony_ci	return NULL;
18748c2ecf20Sopenharmony_ci}
18758c2ecf20Sopenharmony_ci
18768c2ecf20Sopenharmony_ci/**
18778c2ecf20Sopenharmony_ci * __alloc_percpu_gfp - allocate dynamic percpu area
18788c2ecf20Sopenharmony_ci * @size: size of area to allocate in bytes
18798c2ecf20Sopenharmony_ci * @align: alignment of area (max PAGE_SIZE)
18808c2ecf20Sopenharmony_ci * @gfp: allocation flags
18818c2ecf20Sopenharmony_ci *
18828c2ecf20Sopenharmony_ci * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
18838c2ecf20Sopenharmony_ci * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
18848c2ecf20Sopenharmony_ci * be called from any context but is a lot more likely to fail. If @gfp
18858c2ecf20Sopenharmony_ci * has __GFP_NOWARN then no warning will be triggered on invalid or failed
18868c2ecf20Sopenharmony_ci * allocation requests.
18878c2ecf20Sopenharmony_ci *
18888c2ecf20Sopenharmony_ci * RETURNS:
18898c2ecf20Sopenharmony_ci * Percpu pointer to the allocated area on success, NULL on failure.
18908c2ecf20Sopenharmony_ci */
18918c2ecf20Sopenharmony_civoid __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
18928c2ecf20Sopenharmony_ci{
	/* reserved=false: serve from the dynamic chunks only */
18938c2ecf20Sopenharmony_ci	return pcpu_alloc(size, align, false, gfp);
18948c2ecf20Sopenharmony_ci}
18958c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
18968c2ecf20Sopenharmony_ci
18978c2ecf20Sopenharmony_ci/**
18988c2ecf20Sopenharmony_ci * __alloc_percpu - allocate dynamic percpu area
18998c2ecf20Sopenharmony_ci * @size: size of area to allocate in bytes
19008c2ecf20Sopenharmony_ci * @align: alignment of area (max PAGE_SIZE)
19018c2ecf20Sopenharmony_ci *
19028c2ecf20Sopenharmony_ci * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
19038c2ecf20Sopenharmony_ci */
19048c2ecf20Sopenharmony_civoid __percpu *__alloc_percpu(size_t size, size_t align)
19058c2ecf20Sopenharmony_ci{
	/* GFP_KERNEL: sleeping allocation from the dynamic chunks */
19068c2ecf20Sopenharmony_ci	return pcpu_alloc(size, align, false, GFP_KERNEL);
19078c2ecf20Sopenharmony_ci}
19088c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(__alloc_percpu);
19098c2ecf20Sopenharmony_ci
19108c2ecf20Sopenharmony_ci/**
19118c2ecf20Sopenharmony_ci * __alloc_reserved_percpu - allocate reserved percpu area
19128c2ecf20Sopenharmony_ci * @size: size of area to allocate in bytes
19138c2ecf20Sopenharmony_ci * @align: alignment of area (max PAGE_SIZE)
19148c2ecf20Sopenharmony_ci *
19158c2ecf20Sopenharmony_ci * Allocate zero-filled percpu area of @size bytes aligned at @align
19168c2ecf20Sopenharmony_ci * from reserved percpu area if arch has set it up; otherwise,
19178c2ecf20Sopenharmony_ci * allocation is served from the same dynamic area.  Might sleep.
19188c2ecf20Sopenharmony_ci * Might trigger writeouts.
19198c2ecf20Sopenharmony_ci *
19208c2ecf20Sopenharmony_ci * CONTEXT:
19218c2ecf20Sopenharmony_ci * Does GFP_KERNEL allocation.
19228c2ecf20Sopenharmony_ci *
19238c2ecf20Sopenharmony_ci * RETURNS:
19248c2ecf20Sopenharmony_ci * Percpu pointer to the allocated area on success, NULL on failure.
19258c2ecf20Sopenharmony_ci */
19268c2ecf20Sopenharmony_civoid __percpu *__alloc_reserved_percpu(size_t size, size_t align)
19278c2ecf20Sopenharmony_ci{
	/* reserved=true: pcpu_alloc() tries pcpu_reserved_chunk if set up */
19288c2ecf20Sopenharmony_ci	return pcpu_alloc(size, align, true, GFP_KERNEL);
19298c2ecf20Sopenharmony_ci}
19308c2ecf20Sopenharmony_ci
19318c2ecf20Sopenharmony_ci/**
19328c2ecf20Sopenharmony_ci * __pcpu_balance_workfn - manage the amount of free chunks and populated pages
19338c2ecf20Sopenharmony_ci * @type: chunk type
19348c2ecf20Sopenharmony_ci *
19358c2ecf20Sopenharmony_ci * Reclaim all fully free chunks except for the first one.  This is also
19368c2ecf20Sopenharmony_ci * responsible for maintaining the pool of empty populated pages.  However,
19378c2ecf20Sopenharmony_ci * it is possible that this is called when physical memory is scarce causing
19388c2ecf20Sopenharmony_ci * OOM killer to be triggered.  We should avoid doing so until an actual
19398c2ecf20Sopenharmony_ci * allocation causes the failure as it is possible that requests can be
19408c2ecf20Sopenharmony_ci * serviced from already backed regions.
19418c2ecf20Sopenharmony_ci */
19428c2ecf20Sopenharmony_cistatic void __pcpu_balance_workfn(enum pcpu_chunk_type type)
19438c2ecf20Sopenharmony_ci{
19448c2ecf20Sopenharmony_ci	/* gfp flags passed to underlying allocators */
19458c2ecf20Sopenharmony_ci	const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
19468c2ecf20Sopenharmony_ci	LIST_HEAD(to_free);
19478c2ecf20Sopenharmony_ci	struct list_head *pcpu_slot = pcpu_chunk_list(type);
19488c2ecf20Sopenharmony_ci	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
19498c2ecf20Sopenharmony_ci	struct pcpu_chunk *chunk, *next;
19508c2ecf20Sopenharmony_ci	int slot, nr_to_pop, ret;
19518c2ecf20Sopenharmony_ci
19528c2ecf20Sopenharmony_ci	/*
19538c2ecf20Sopenharmony_ci	 * There's no reason to keep around multiple unused chunks and VM
19548c2ecf20Sopenharmony_ci	 * areas can be scarce.  Destroy all free chunks except for one.
19558c2ecf20Sopenharmony_ci	 */
19568c2ecf20Sopenharmony_ci	mutex_lock(&pcpu_alloc_mutex);
19578c2ecf20Sopenharmony_ci	spin_lock_irq(&pcpu_lock);
19588c2ecf20Sopenharmony_ci
19598c2ecf20Sopenharmony_ci	list_for_each_entry_safe(chunk, next, free_head, list) {
19608c2ecf20Sopenharmony_ci		WARN_ON(chunk->immutable);
19618c2ecf20Sopenharmony_ci
19628c2ecf20Sopenharmony_ci		/* spare the first one */
19638c2ecf20Sopenharmony_ci		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
19648c2ecf20Sopenharmony_ci			continue;
19658c2ecf20Sopenharmony_ci
19668c2ecf20Sopenharmony_ci		list_move(&chunk->list, &to_free);
19678c2ecf20Sopenharmony_ci	}
19688c2ecf20Sopenharmony_ci
19698c2ecf20Sopenharmony_ci	spin_unlock_irq(&pcpu_lock);
19708c2ecf20Sopenharmony_ci
	/* depopulate and destroy the collected chunks outside pcpu_lock */
19718c2ecf20Sopenharmony_ci	list_for_each_entry_safe(chunk, next, &to_free, list) {
19728c2ecf20Sopenharmony_ci		unsigned int rs, re;
19738c2ecf20Sopenharmony_ci
19748c2ecf20Sopenharmony_ci		bitmap_for_each_set_region(chunk->populated, rs, re, 0,
19758c2ecf20Sopenharmony_ci					   chunk->nr_pages) {
19768c2ecf20Sopenharmony_ci			pcpu_depopulate_chunk(chunk, rs, re);
19778c2ecf20Sopenharmony_ci			spin_lock_irq(&pcpu_lock);
19788c2ecf20Sopenharmony_ci			pcpu_chunk_depopulated(chunk, rs, re);
19798c2ecf20Sopenharmony_ci			spin_unlock_irq(&pcpu_lock);
19808c2ecf20Sopenharmony_ci		}
19818c2ecf20Sopenharmony_ci		pcpu_destroy_chunk(chunk);
	/* teardown can be lengthy; let other tasks run between chunks */
19828c2ecf20Sopenharmony_ci		cond_resched();
19838c2ecf20Sopenharmony_ci	}
19848c2ecf20Sopenharmony_ci
19858c2ecf20Sopenharmony_ci	/*
19868c2ecf20Sopenharmony_ci	 * Ensure there are certain number of free populated pages for
19878c2ecf20Sopenharmony_ci	 * atomic allocs.  Fill up from the most packed so that atomic
19888c2ecf20Sopenharmony_ci	 * allocs don't increase fragmentation.  If atomic allocation
19898c2ecf20Sopenharmony_ci	 * failed previously, always populate the maximum amount.  This
19908c2ecf20Sopenharmony_ci	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
19918c2ecf20Sopenharmony_ci	 * failing indefinitely; however, large atomic allocs are not
19928c2ecf20Sopenharmony_ci	 * something we support properly and can be highly unreliable and
19938c2ecf20Sopenharmony_ci	 * inefficient.
19948c2ecf20Sopenharmony_ci	 */
19958c2ecf20Sopenharmony_ciretry_pop:
19968c2ecf20Sopenharmony_ci	if (pcpu_atomic_alloc_failed) {
19978c2ecf20Sopenharmony_ci		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
19988c2ecf20Sopenharmony_ci		/* best effort anyway, don't worry about synchronization */
19998c2ecf20Sopenharmony_ci		pcpu_atomic_alloc_failed = false;
20008c2ecf20Sopenharmony_ci	} else {
20018c2ecf20Sopenharmony_ci		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
20028c2ecf20Sopenharmony_ci				  pcpu_nr_empty_pop_pages[type],
20038c2ecf20Sopenharmony_ci				  0, PCPU_EMPTY_POP_PAGES_HIGH);
20048c2ecf20Sopenharmony_ci	}
20058c2ecf20Sopenharmony_ci
20068c2ecf20Sopenharmony_ci	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
20078c2ecf20Sopenharmony_ci		unsigned int nr_unpop = 0, rs, re;
20088c2ecf20Sopenharmony_ci
20098c2ecf20Sopenharmony_ci		if (!nr_to_pop)
20108c2ecf20Sopenharmony_ci			break;
20118c2ecf20Sopenharmony_ci
	/* find a chunk in this slot that still has unpopulated pages */
20128c2ecf20Sopenharmony_ci		spin_lock_irq(&pcpu_lock);
20138c2ecf20Sopenharmony_ci		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
20148c2ecf20Sopenharmony_ci			nr_unpop = chunk->nr_pages - chunk->nr_populated;
20158c2ecf20Sopenharmony_ci			if (nr_unpop)
20168c2ecf20Sopenharmony_ci				break;
20178c2ecf20Sopenharmony_ci		}
20188c2ecf20Sopenharmony_ci		spin_unlock_irq(&pcpu_lock);
20198c2ecf20Sopenharmony_ci
20208c2ecf20Sopenharmony_ci		if (!nr_unpop)
20218c2ecf20Sopenharmony_ci			continue;
20228c2ecf20Sopenharmony_ci
20238c2ecf20Sopenharmony_ci		/* @chunk can't go away while pcpu_alloc_mutex is held */
20248c2ecf20Sopenharmony_ci		bitmap_for_each_clear_region(chunk->populated, rs, re, 0,
20258c2ecf20Sopenharmony_ci					     chunk->nr_pages) {
20268c2ecf20Sopenharmony_ci			int nr = min_t(int, re - rs, nr_to_pop);
20278c2ecf20Sopenharmony_ci
20288c2ecf20Sopenharmony_ci			ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
20298c2ecf20Sopenharmony_ci			if (!ret) {
20308c2ecf20Sopenharmony_ci				nr_to_pop -= nr;
20318c2ecf20Sopenharmony_ci				spin_lock_irq(&pcpu_lock);
20328c2ecf20Sopenharmony_ci				pcpu_chunk_populated(chunk, rs, rs + nr);
20338c2ecf20Sopenharmony_ci				spin_unlock_irq(&pcpu_lock);
20348c2ecf20Sopenharmony_ci			} else {
	/* backing allocator failed; give up populating this round */
20358c2ecf20Sopenharmony_ci				nr_to_pop = 0;
20368c2ecf20Sopenharmony_ci			}
20378c2ecf20Sopenharmony_ci
20388c2ecf20Sopenharmony_ci			if (!nr_to_pop)
20398c2ecf20Sopenharmony_ci				break;
20408c2ecf20Sopenharmony_ci		}
20418c2ecf20Sopenharmony_ci	}
20428c2ecf20Sopenharmony_ci
20438c2ecf20Sopenharmony_ci	if (nr_to_pop) {
20448c2ecf20Sopenharmony_ci		/* ran out of chunks to populate, create a new one and retry */
20458c2ecf20Sopenharmony_ci		chunk = pcpu_create_chunk(type, gfp);
20468c2ecf20Sopenharmony_ci		if (chunk) {
20478c2ecf20Sopenharmony_ci			spin_lock_irq(&pcpu_lock);
20488c2ecf20Sopenharmony_ci			pcpu_chunk_relocate(chunk, -1);
20498c2ecf20Sopenharmony_ci			spin_unlock_irq(&pcpu_lock);
20508c2ecf20Sopenharmony_ci			goto retry_pop;
20518c2ecf20Sopenharmony_ci		}
20528c2ecf20Sopenharmony_ci	}
20538c2ecf20Sopenharmony_ci
20548c2ecf20Sopenharmony_ci	mutex_unlock(&pcpu_alloc_mutex);
20558c2ecf20Sopenharmony_ci}
20568c2ecf20Sopenharmony_ci
20578c2ecf20Sopenharmony_ci/**
20588c2ecf20Sopenharmony_ci * pcpu_balance_workfn - manage the amount of free chunks and populated pages
20598c2ecf20Sopenharmony_ci * @work: unused
20608c2ecf20Sopenharmony_ci *
20618c2ecf20Sopenharmony_ci * Call __pcpu_balance_workfn() for each chunk type.
20628c2ecf20Sopenharmony_ci */
20638c2ecf20Sopenharmony_cistatic void pcpu_balance_workfn(struct work_struct *work)
20648c2ecf20Sopenharmony_ci{
20658c2ecf20Sopenharmony_ci	enum pcpu_chunk_type type;
20668c2ecf20Sopenharmony_ci
	/* balance every chunk list (root, plus memcg when compiled in) */
20678c2ecf20Sopenharmony_ci	for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
20688c2ecf20Sopenharmony_ci		__pcpu_balance_workfn(type);
20698c2ecf20Sopenharmony_ci}
20708c2ecf20Sopenharmony_ci
20718c2ecf20Sopenharmony_ci/**
20728c2ecf20Sopenharmony_ci * free_percpu - free percpu area
20738c2ecf20Sopenharmony_ci * @ptr: pointer to area to free
20748c2ecf20Sopenharmony_ci *
20758c2ecf20Sopenharmony_ci * Free percpu area @ptr.
20768c2ecf20Sopenharmony_ci *
20778c2ecf20Sopenharmony_ci * CONTEXT:
20788c2ecf20Sopenharmony_ci * Can be called from atomic context.
20798c2ecf20Sopenharmony_ci */
20808c2ecf20Sopenharmony_civoid free_percpu(void __percpu *ptr)
20818c2ecf20Sopenharmony_ci{
20828c2ecf20Sopenharmony_ci	void *addr;
20838c2ecf20Sopenharmony_ci	struct pcpu_chunk *chunk;
20848c2ecf20Sopenharmony_ci	unsigned long flags;
20858c2ecf20Sopenharmony_ci	int size, off;
20868c2ecf20Sopenharmony_ci	bool need_balance = false;
20878c2ecf20Sopenharmony_ci	struct list_head *pcpu_slot;
20888c2ecf20Sopenharmony_ci
	/* freeing NULL is a no-op, mirroring kfree() semantics */
20898c2ecf20Sopenharmony_ci	if (!ptr)
20908c2ecf20Sopenharmony_ci		return;
20918c2ecf20Sopenharmony_ci
20928c2ecf20Sopenharmony_ci	kmemleak_free_percpu(ptr);
20938c2ecf20Sopenharmony_ci
20948c2ecf20Sopenharmony_ci	addr = __pcpu_ptr_to_addr(ptr);
20958c2ecf20Sopenharmony_ci
20968c2ecf20Sopenharmony_ci	spin_lock_irqsave(&pcpu_lock, flags);
20978c2ecf20Sopenharmony_ci
20988c2ecf20Sopenharmony_ci	chunk = pcpu_chunk_addr_search(addr);
20998c2ecf20Sopenharmony_ci	off = addr - chunk->base_addr;
21008c2ecf20Sopenharmony_ci
21018c2ecf20Sopenharmony_ci	size = pcpu_free_area(chunk, off);
21028c2ecf20Sopenharmony_ci
21038c2ecf20Sopenharmony_ci	pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
21048c2ecf20Sopenharmony_ci
21058c2ecf20Sopenharmony_ci	pcpu_memcg_free_hook(chunk, off, size);
21068c2ecf20Sopenharmony_ci
21078c2ecf20Sopenharmony_ci	/* if there is more than one fully free chunk, wake the grim reaper */
21088c2ecf20Sopenharmony_ci	if (chunk->free_bytes == pcpu_unit_size) {
21098c2ecf20Sopenharmony_ci		struct pcpu_chunk *pos;
21108c2ecf20Sopenharmony_ci
21118c2ecf20Sopenharmony_ci		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
21128c2ecf20Sopenharmony_ci			if (pos != chunk) {
21138c2ecf20Sopenharmony_ci				need_balance = true;
21148c2ecf20Sopenharmony_ci				break;
21158c2ecf20Sopenharmony_ci			}
21168c2ecf20Sopenharmony_ci	}
21178c2ecf20Sopenharmony_ci
21188c2ecf20Sopenharmony_ci	trace_percpu_free_percpu(chunk->base_addr, off, ptr);
21198c2ecf20Sopenharmony_ci
21208c2ecf20Sopenharmony_ci	spin_unlock_irqrestore(&pcpu_lock, flags);
21218c2ecf20Sopenharmony_ci
	/* schedule reclaim outside pcpu_lock */
21228c2ecf20Sopenharmony_ci	if (need_balance)
21238c2ecf20Sopenharmony_ci		pcpu_schedule_balance_work();
21248c2ecf20Sopenharmony_ci}
21258c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(free_percpu);
21268c2ecf20Sopenharmony_ci
/*
 * __is_kernel_percpu_address - test address against in-kernel static percpu
 * area, optionally returning a translated address
 * @addr: address to test
 * @can_addr: if non-NULL and @addr matches, receives the offset of @addr
 *            within its unit rebased onto the boot CPU's unit
 *
 * RETURNS:
 * %true if @addr falls in some CPU's static percpu region, %false otherwise
 * (always %false on UP, where static percpu vars are indistinguishable).
 */
21278c2ecf20Sopenharmony_cibool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
21288c2ecf20Sopenharmony_ci{
21298c2ecf20Sopenharmony_ci#ifdef CONFIG_SMP
21308c2ecf20Sopenharmony_ci	const size_t static_size = __per_cpu_end - __per_cpu_start;
21318c2ecf20Sopenharmony_ci	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
21328c2ecf20Sopenharmony_ci	unsigned int cpu;
21338c2ecf20Sopenharmony_ci
	/* check @addr against each possible CPU's static region */
21348c2ecf20Sopenharmony_ci	for_each_possible_cpu(cpu) {
21358c2ecf20Sopenharmony_ci		void *start = per_cpu_ptr(base, cpu);
21368c2ecf20Sopenharmony_ci		void *va = (void *)addr;
21378c2ecf20Sopenharmony_ci
21388c2ecf20Sopenharmony_ci		if (va >= start && va < start + static_size) {
21398c2ecf20Sopenharmony_ci			if (can_addr) {
21408c2ecf20Sopenharmony_ci				*can_addr = (unsigned long) (va - start);
21418c2ecf20Sopenharmony_ci				*can_addr += (unsigned long)
21428c2ecf20Sopenharmony_ci					per_cpu_ptr(base, get_boot_cpu_id());
21438c2ecf20Sopenharmony_ci			}
21448c2ecf20Sopenharmony_ci			return true;
21458c2ecf20Sopenharmony_ci		}
21468c2ecf20Sopenharmony_ci	}
21478c2ecf20Sopenharmony_ci#endif
21488c2ecf20Sopenharmony_ci	/* on UP, can't distinguish from other static vars, always false */
21498c2ecf20Sopenharmony_ci	return false;
21508c2ecf20Sopenharmony_ci}
21518c2ecf20Sopenharmony_ci
21528c2ecf20Sopenharmony_ci/**
21538c2ecf20Sopenharmony_ci * is_kernel_percpu_address - test whether address is from static percpu area
21548c2ecf20Sopenharmony_ci * @addr: address to test
21558c2ecf20Sopenharmony_ci *
21568c2ecf20Sopenharmony_ci * Test whether @addr belongs to in-kernel static percpu area.  Module
21578c2ecf20Sopenharmony_ci * static percpu areas are not considered.  For those, use
21588c2ecf20Sopenharmony_ci * is_module_percpu_address().
21598c2ecf20Sopenharmony_ci *
21608c2ecf20Sopenharmony_ci * RETURNS:
21618c2ecf20Sopenharmony_ci * %true if @addr is from in-kernel static percpu area, %false otherwise.
21628c2ecf20Sopenharmony_ci */
21638c2ecf20Sopenharmony_cibool is_kernel_percpu_address(unsigned long addr)
21648c2ecf20Sopenharmony_ci{
	/* membership test only; NULL means no translated address wanted */
21658c2ecf20Sopenharmony_ci	return __is_kernel_percpu_address(addr, NULL);
21668c2ecf20Sopenharmony_ci}
21678c2ecf20Sopenharmony_ci
21688c2ecf20Sopenharmony_ci/**
21698c2ecf20Sopenharmony_ci * per_cpu_ptr_to_phys - convert translated percpu address to physical address
21708c2ecf20Sopenharmony_ci * @addr: the address to be converted to physical address
21718c2ecf20Sopenharmony_ci *
21728c2ecf20Sopenharmony_ci * Given @addr which is dereferenceable address obtained via one of
21738c2ecf20Sopenharmony_ci * percpu access macros, this function translates it into its physical
21748c2ecf20Sopenharmony_ci * address.  The caller is responsible for ensuring @addr stays valid
21758c2ecf20Sopenharmony_ci * until this function finishes.
21768c2ecf20Sopenharmony_ci *
21778c2ecf20Sopenharmony_ci * percpu allocator has special setup for the first chunk, which currently
21788c2ecf20Sopenharmony_ci * supports either embedding in linear address space or vmalloc mapping,
21798c2ecf20Sopenharmony_ci * and, from the second one, the backing allocator (currently either vm or
21808c2ecf20Sopenharmony_ci * km) provides translation.
21818c2ecf20Sopenharmony_ci *
21828c2ecf20Sopenharmony_ci * The addr can be translated simply without checking if it falls into the
21838c2ecf20Sopenharmony_ci * first chunk. But the current code reflects better how percpu allocator
21848c2ecf20Sopenharmony_ci * actually works, and the verification can discover both bugs in percpu
21858c2ecf20Sopenharmony_ci * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
21868c2ecf20Sopenharmony_ci * code.
21878c2ecf20Sopenharmony_ci *
21888c2ecf20Sopenharmony_ci * RETURNS:
21898c2ecf20Sopenharmony_ci * The physical address for @addr.
21908c2ecf20Sopenharmony_ci */
21918c2ecf20Sopenharmony_ciphys_addr_t per_cpu_ptr_to_phys(void *addr)
21928c2ecf20Sopenharmony_ci{
21938c2ecf20Sopenharmony_ci	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
21948c2ecf20Sopenharmony_ci	bool in_first_chunk = false;
21958c2ecf20Sopenharmony_ci	unsigned long first_low, first_high;
21968c2ecf20Sopenharmony_ci	unsigned int cpu;
21978c2ecf20Sopenharmony_ci
21988c2ecf20Sopenharmony_ci	/*
21998c2ecf20Sopenharmony_ci	 * The following test on unit_low/high isn't strictly
22008c2ecf20Sopenharmony_ci	 * necessary but will speed up lookups of addresses which
22018c2ecf20Sopenharmony_ci	 * aren't in the first chunk.
22028c2ecf20Sopenharmony_ci	 *
22038c2ecf20Sopenharmony_ci	 * The address check is against full chunk sizes.  pcpu_base_addr
22048c2ecf20Sopenharmony_ci	 * points to the beginning of the first chunk including the
22058c2ecf20Sopenharmony_ci	 * static region.  Assumes good intent as the first chunk may
22068c2ecf20Sopenharmony_ci	 * not be full (ie. < pcpu_unit_pages in size).
22078c2ecf20Sopenharmony_ci	 */
22088c2ecf20Sopenharmony_ci	first_low = (unsigned long)pcpu_base_addr +
22098c2ecf20Sopenharmony_ci		    pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
22108c2ecf20Sopenharmony_ci	first_high = (unsigned long)pcpu_base_addr +
22118c2ecf20Sopenharmony_ci		     pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
22128c2ecf20Sopenharmony_ci	if ((unsigned long)addr >= first_low &&
22138c2ecf20Sopenharmony_ci	    (unsigned long)addr < first_high) {
	/* coarse range matched; confirm against each unit's exact span */
22148c2ecf20Sopenharmony_ci		for_each_possible_cpu(cpu) {
22158c2ecf20Sopenharmony_ci			void *start = per_cpu_ptr(base, cpu);
22168c2ecf20Sopenharmony_ci
22178c2ecf20Sopenharmony_ci			if (addr >= start && addr < start + pcpu_unit_size) {
22188c2ecf20Sopenharmony_ci				in_first_chunk = true;
22198c2ecf20Sopenharmony_ci				break;
22208c2ecf20Sopenharmony_ci			}
22218c2ecf20Sopenharmony_ci		}
22228c2ecf20Sopenharmony_ci	}
22238c2ecf20Sopenharmony_ci
22248c2ecf20Sopenharmony_ci	if (in_first_chunk) {
	/* embedded first chunk translates via __pa(); vmalloc needs a walk */
22258c2ecf20Sopenharmony_ci		if (!is_vmalloc_addr(addr))
22268c2ecf20Sopenharmony_ci			return __pa(addr);
22278c2ecf20Sopenharmony_ci		else
22288c2ecf20Sopenharmony_ci			return page_to_phys(vmalloc_to_page(addr)) +
22298c2ecf20Sopenharmony_ci			       offset_in_page(addr);
22308c2ecf20Sopenharmony_ci	} else
22318c2ecf20Sopenharmony_ci		return page_to_phys(pcpu_addr_to_page(addr)) +
22328c2ecf20Sopenharmony_ci		       offset_in_page(addr);
22338c2ecf20Sopenharmony_ci}
22348c2ecf20Sopenharmony_ci
22358c2ecf20Sopenharmony_ci/**
22368c2ecf20Sopenharmony_ci * pcpu_alloc_alloc_info - allocate percpu allocation info
22378c2ecf20Sopenharmony_ci * @nr_groups: the number of groups
22388c2ecf20Sopenharmony_ci * @nr_units: the number of units
22398c2ecf20Sopenharmony_ci *
22408c2ecf20Sopenharmony_ci * Allocate ai which is large enough for @nr_groups groups containing
22418c2ecf20Sopenharmony_ci * @nr_units units.  The returned ai's groups[0].cpu_map points to the
22428c2ecf20Sopenharmony_ci * cpu_map array which is long enough for @nr_units and filled with
22438c2ecf20Sopenharmony_ci * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
22448c2ecf20Sopenharmony_ci * pointer of other groups.
22458c2ecf20Sopenharmony_ci *
22468c2ecf20Sopenharmony_ci * RETURNS:
22478c2ecf20Sopenharmony_ci * Pointer to the allocated pcpu_alloc_info on success, NULL on
22488c2ecf20Sopenharmony_ci * failure.
22498c2ecf20Sopenharmony_ci */
22508c2ecf20Sopenharmony_cistruct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
22518c2ecf20Sopenharmony_ci						      int nr_units)
22528c2ecf20Sopenharmony_ci{
22538c2ecf20Sopenharmony_ci	struct pcpu_alloc_info *ai;
22548c2ecf20Sopenharmony_ci	size_t base_size, ai_size;
22558c2ecf20Sopenharmony_ci	void *ptr;
22568c2ecf20Sopenharmony_ci	int unit;
22578c2ecf20Sopenharmony_ci
	/* cpu_map array is laid out right after the groups, suitably aligned */
22588c2ecf20Sopenharmony_ci	base_size = ALIGN(struct_size(ai, groups, nr_groups),
22598c2ecf20Sopenharmony_ci			  __alignof__(ai->groups[0].cpu_map[0]));
22608c2ecf20Sopenharmony_ci	ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
22618c2ecf20Sopenharmony_ci
22628c2ecf20Sopenharmony_ci	ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
22638c2ecf20Sopenharmony_ci	if (!ptr)
22648c2ecf20Sopenharmony_ci		return NULL;
22658c2ecf20Sopenharmony_ci	ai = ptr;
22668c2ecf20Sopenharmony_ci	ptr += base_size;
22678c2ecf20Sopenharmony_ci
22688c2ecf20Sopenharmony_ci	ai->groups[0].cpu_map = ptr;
22698c2ecf20Sopenharmony_ci
	/* NR_CPUS marks every slot as unmapped until the caller fills it */
22708c2ecf20Sopenharmony_ci	for (unit = 0; unit < nr_units; unit++)
22718c2ecf20Sopenharmony_ci		ai->groups[0].cpu_map[unit] = NR_CPUS;
22728c2ecf20Sopenharmony_ci
22738c2ecf20Sopenharmony_ci	ai->nr_groups = nr_groups;
22748c2ecf20Sopenharmony_ci	ai->__ai_size = PFN_ALIGN(ai_size);
22758c2ecf20Sopenharmony_ci
22768c2ecf20Sopenharmony_ci	return ai;
22778c2ecf20Sopenharmony_ci}
22788c2ecf20Sopenharmony_ci
22798c2ecf20Sopenharmony_ci/**
22808c2ecf20Sopenharmony_ci * pcpu_free_alloc_info - free percpu allocation info
22818c2ecf20Sopenharmony_ci * @ai: pcpu_alloc_info to free
22828c2ecf20Sopenharmony_ci *
22838c2ecf20Sopenharmony_ci * Free @ai which was allocated by pcpu_alloc_alloc_info().
22848c2ecf20Sopenharmony_ci */
22858c2ecf20Sopenharmony_civoid __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
22868c2ecf20Sopenharmony_ci{
22878c2ecf20Sopenharmony_ci	memblock_free_early(__pa(ai), ai->__ai_size);
22888c2ecf20Sopenharmony_ci}
22898c2ecf20Sopenharmony_ci
22908c2ecf20Sopenharmony_ci/**
22918c2ecf20Sopenharmony_ci * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
22928c2ecf20Sopenharmony_ci * @lvl: loglevel
22938c2ecf20Sopenharmony_ci * @ai: allocation info to dump
22948c2ecf20Sopenharmony_ci *
22958c2ecf20Sopenharmony_ci * Print out information about @ai using loglevel @lvl.
22968c2ecf20Sopenharmony_ci */
22978c2ecf20Sopenharmony_cistatic void pcpu_dump_alloc_info(const char *lvl,
22988c2ecf20Sopenharmony_ci				 const struct pcpu_alloc_info *ai)
22998c2ecf20Sopenharmony_ci{
23008c2ecf20Sopenharmony_ci	int group_width = 1, cpu_width = 1, width;
23018c2ecf20Sopenharmony_ci	char empty_str[] = "--------";
23028c2ecf20Sopenharmony_ci	int alloc = 0, alloc_end = 0;
23038c2ecf20Sopenharmony_ci	int group, v;
23048c2ecf20Sopenharmony_ci	int upa, apl;	/* units per alloc, allocs per line */
23058c2ecf20Sopenharmony_ci
23068c2ecf20Sopenharmony_ci	v = ai->nr_groups;
23078c2ecf20Sopenharmony_ci	while (v /= 10)
23088c2ecf20Sopenharmony_ci		group_width++;
23098c2ecf20Sopenharmony_ci
23108c2ecf20Sopenharmony_ci	v = num_possible_cpus();
23118c2ecf20Sopenharmony_ci	while (v /= 10)
23128c2ecf20Sopenharmony_ci		cpu_width++;
23138c2ecf20Sopenharmony_ci	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
23148c2ecf20Sopenharmony_ci
23158c2ecf20Sopenharmony_ci	upa = ai->alloc_size / ai->unit_size;
23168c2ecf20Sopenharmony_ci	width = upa * (cpu_width + 1) + group_width + 3;
23178c2ecf20Sopenharmony_ci	apl = rounddown_pow_of_two(max(60 / width, 1));
23188c2ecf20Sopenharmony_ci
23198c2ecf20Sopenharmony_ci	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
23208c2ecf20Sopenharmony_ci	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
23218c2ecf20Sopenharmony_ci	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
23228c2ecf20Sopenharmony_ci
23238c2ecf20Sopenharmony_ci	for (group = 0; group < ai->nr_groups; group++) {
23248c2ecf20Sopenharmony_ci		const struct pcpu_group_info *gi = &ai->groups[group];
23258c2ecf20Sopenharmony_ci		int unit = 0, unit_end = 0;
23268c2ecf20Sopenharmony_ci
23278c2ecf20Sopenharmony_ci		BUG_ON(gi->nr_units % upa);
23288c2ecf20Sopenharmony_ci		for (alloc_end += gi->nr_units / upa;
23298c2ecf20Sopenharmony_ci		     alloc < alloc_end; alloc++) {
23308c2ecf20Sopenharmony_ci			if (!(alloc % apl)) {
23318c2ecf20Sopenharmony_ci				pr_cont("\n");
23328c2ecf20Sopenharmony_ci				printk("%spcpu-alloc: ", lvl);
23338c2ecf20Sopenharmony_ci			}
23348c2ecf20Sopenharmony_ci			pr_cont("[%0*d] ", group_width, group);
23358c2ecf20Sopenharmony_ci
23368c2ecf20Sopenharmony_ci			for (unit_end += upa; unit < unit_end; unit++)
23378c2ecf20Sopenharmony_ci				if (gi->cpu_map[unit] != NR_CPUS)
23388c2ecf20Sopenharmony_ci					pr_cont("%0*d ",
23398c2ecf20Sopenharmony_ci						cpu_width, gi->cpu_map[unit]);
23408c2ecf20Sopenharmony_ci				else
23418c2ecf20Sopenharmony_ci					pr_cont("%s ", empty_str);
23428c2ecf20Sopenharmony_ci		}
23438c2ecf20Sopenharmony_ci	}
23448c2ecf20Sopenharmony_ci	pr_cont("\n");
23458c2ecf20Sopenharmony_ci}
23468c2ecf20Sopenharmony_ci
23478c2ecf20Sopenharmony_ci/**
23488c2ecf20Sopenharmony_ci * pcpu_setup_first_chunk - initialize the first percpu chunk
23498c2ecf20Sopenharmony_ci * @ai: pcpu_alloc_info describing how to percpu area is shaped
23508c2ecf20Sopenharmony_ci * @base_addr: mapped address
23518c2ecf20Sopenharmony_ci *
23528c2ecf20Sopenharmony_ci * Initialize the first percpu chunk which contains the kernel static
23538c2ecf20Sopenharmony_ci * percpu area.  This function is to be called from arch percpu area
23548c2ecf20Sopenharmony_ci * setup path.
23558c2ecf20Sopenharmony_ci *
23568c2ecf20Sopenharmony_ci * @ai contains all information necessary to initialize the first
23578c2ecf20Sopenharmony_ci * chunk and prime the dynamic percpu allocator.
23588c2ecf20Sopenharmony_ci *
23598c2ecf20Sopenharmony_ci * @ai->static_size is the size of static percpu area.
23608c2ecf20Sopenharmony_ci *
23618c2ecf20Sopenharmony_ci * @ai->reserved_size, if non-zero, specifies the amount of bytes to
23628c2ecf20Sopenharmony_ci * reserve after the static area in the first chunk.  This reserves
23638c2ecf20Sopenharmony_ci * the first chunk such that it's available only through reserved
23648c2ecf20Sopenharmony_ci * percpu allocation.  This is primarily used to serve module percpu
23658c2ecf20Sopenharmony_ci * static areas on architectures where the addressing model has
23668c2ecf20Sopenharmony_ci * limited offset range for symbol relocations to guarantee module
23678c2ecf20Sopenharmony_ci * percpu symbols fall inside the relocatable range.
23688c2ecf20Sopenharmony_ci *
23698c2ecf20Sopenharmony_ci * @ai->dyn_size determines the number of bytes available for dynamic
23708c2ecf20Sopenharmony_ci * allocation in the first chunk.  The area between @ai->static_size +
23718c2ecf20Sopenharmony_ci * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
23728c2ecf20Sopenharmony_ci *
23738c2ecf20Sopenharmony_ci * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
23748c2ecf20Sopenharmony_ci * and equal to or larger than @ai->static_size + @ai->reserved_size +
23758c2ecf20Sopenharmony_ci * @ai->dyn_size.
23768c2ecf20Sopenharmony_ci *
23778c2ecf20Sopenharmony_ci * @ai->atom_size is the allocation atom size and used as alignment
23788c2ecf20Sopenharmony_ci * for vm areas.
23798c2ecf20Sopenharmony_ci *
23808c2ecf20Sopenharmony_ci * @ai->alloc_size is the allocation size and always multiple of
23818c2ecf20Sopenharmony_ci * @ai->atom_size.  This is larger than @ai->atom_size if
23828c2ecf20Sopenharmony_ci * @ai->unit_size is larger than @ai->atom_size.
23838c2ecf20Sopenharmony_ci *
23848c2ecf20Sopenharmony_ci * @ai->nr_groups and @ai->groups describe virtual memory layout of
23858c2ecf20Sopenharmony_ci * percpu areas.  Units which should be colocated are put into the
23868c2ecf20Sopenharmony_ci * same group.  Dynamic VM areas will be allocated according to these
23878c2ecf20Sopenharmony_ci * groupings.  If @ai->nr_groups is zero, a single group containing
23888c2ecf20Sopenharmony_ci * all units is assumed.
23898c2ecf20Sopenharmony_ci *
23908c2ecf20Sopenharmony_ci * The caller should have mapped the first chunk at @base_addr and
23918c2ecf20Sopenharmony_ci * copied static data to each unit.
23928c2ecf20Sopenharmony_ci *
23938c2ecf20Sopenharmony_ci * The first chunk will always contain a static and a dynamic region.
23948c2ecf20Sopenharmony_ci * However, the static region is not managed by any chunk.  If the first
23958c2ecf20Sopenharmony_ci * chunk also contains a reserved region, it is served by two chunks -
23968c2ecf20Sopenharmony_ci * one for the reserved region and one for the dynamic region.  They
23978c2ecf20Sopenharmony_ci * share the same vm, but use offset regions in the area allocation map.
23988c2ecf20Sopenharmony_ci * The chunk serving the dynamic region is circulated in the chunk slots
23998c2ecf20Sopenharmony_ci * and available for dynamic allocation like any other chunk.
24008c2ecf20Sopenharmony_ci */
24018c2ecf20Sopenharmony_civoid __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
24028c2ecf20Sopenharmony_ci				   void *base_addr)
24038c2ecf20Sopenharmony_ci{
24048c2ecf20Sopenharmony_ci	size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
24058c2ecf20Sopenharmony_ci	size_t static_size, dyn_size;
24068c2ecf20Sopenharmony_ci	struct pcpu_chunk *chunk;
24078c2ecf20Sopenharmony_ci	unsigned long *group_offsets;
24088c2ecf20Sopenharmony_ci	size_t *group_sizes;
24098c2ecf20Sopenharmony_ci	unsigned long *unit_off;
24108c2ecf20Sopenharmony_ci	unsigned int cpu;
24118c2ecf20Sopenharmony_ci	int *unit_map;
24128c2ecf20Sopenharmony_ci	int group, unit, i;
24138c2ecf20Sopenharmony_ci	int map_size;
24148c2ecf20Sopenharmony_ci	unsigned long tmp_addr;
24158c2ecf20Sopenharmony_ci	size_t alloc_size;
24168c2ecf20Sopenharmony_ci	enum pcpu_chunk_type type;
24178c2ecf20Sopenharmony_ci
24188c2ecf20Sopenharmony_ci#define PCPU_SETUP_BUG_ON(cond)	do {					\
24198c2ecf20Sopenharmony_ci	if (unlikely(cond)) {						\
24208c2ecf20Sopenharmony_ci		pr_emerg("failed to initialize, %s\n", #cond);		\
24218c2ecf20Sopenharmony_ci		pr_emerg("cpu_possible_mask=%*pb\n",			\
24228c2ecf20Sopenharmony_ci			 cpumask_pr_args(cpu_possible_mask));		\
24238c2ecf20Sopenharmony_ci		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
24248c2ecf20Sopenharmony_ci		BUG();							\
24258c2ecf20Sopenharmony_ci	}								\
24268c2ecf20Sopenharmony_ci} while (0)
24278c2ecf20Sopenharmony_ci
24288c2ecf20Sopenharmony_ci	/* sanity checks */
24298c2ecf20Sopenharmony_ci	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
24308c2ecf20Sopenharmony_ci#ifdef CONFIG_SMP
24318c2ecf20Sopenharmony_ci	PCPU_SETUP_BUG_ON(!ai->static_size);
24328c2ecf20Sopenharmony_ci	PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
24338c2ecf20Sopenharmony_ci#endif
24348c2ecf20Sopenharmony_ci	PCPU_SETUP_BUG_ON(!base_addr);
24358c2ecf20Sopenharmony_ci	PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
24368c2ecf20Sopenharmony_ci	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
24378c2ecf20Sopenharmony_ci	PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
24388c2ecf20Sopenharmony_ci	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
24398c2ecf20Sopenharmony_ci	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
24408c2ecf20Sopenharmony_ci	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
24418c2ecf20Sopenharmony_ci	PCPU_SETUP_BUG_ON(!ai->dyn_size);
24428c2ecf20Sopenharmony_ci	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
24438c2ecf20Sopenharmony_ci	PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
24448c2ecf20Sopenharmony_ci			    IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
24458c2ecf20Sopenharmony_ci	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
24468c2ecf20Sopenharmony_ci
24478c2ecf20Sopenharmony_ci	/* process group information and build config tables accordingly */
24488c2ecf20Sopenharmony_ci	alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
24498c2ecf20Sopenharmony_ci	group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
24508c2ecf20Sopenharmony_ci	if (!group_offsets)
24518c2ecf20Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
24528c2ecf20Sopenharmony_ci		      alloc_size);
24538c2ecf20Sopenharmony_ci
24548c2ecf20Sopenharmony_ci	alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
24558c2ecf20Sopenharmony_ci	group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
24568c2ecf20Sopenharmony_ci	if (!group_sizes)
24578c2ecf20Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
24588c2ecf20Sopenharmony_ci		      alloc_size);
24598c2ecf20Sopenharmony_ci
24608c2ecf20Sopenharmony_ci	alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
24618c2ecf20Sopenharmony_ci	unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
24628c2ecf20Sopenharmony_ci	if (!unit_map)
24638c2ecf20Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
24648c2ecf20Sopenharmony_ci		      alloc_size);
24658c2ecf20Sopenharmony_ci
24668c2ecf20Sopenharmony_ci	alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
24678c2ecf20Sopenharmony_ci	unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
24688c2ecf20Sopenharmony_ci	if (!unit_off)
24698c2ecf20Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
24708c2ecf20Sopenharmony_ci		      alloc_size);
24718c2ecf20Sopenharmony_ci
24728c2ecf20Sopenharmony_ci	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
24738c2ecf20Sopenharmony_ci		unit_map[cpu] = UINT_MAX;
24748c2ecf20Sopenharmony_ci
24758c2ecf20Sopenharmony_ci	pcpu_low_unit_cpu = NR_CPUS;
24768c2ecf20Sopenharmony_ci	pcpu_high_unit_cpu = NR_CPUS;
24778c2ecf20Sopenharmony_ci
24788c2ecf20Sopenharmony_ci	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
24798c2ecf20Sopenharmony_ci		const struct pcpu_group_info *gi = &ai->groups[group];
24808c2ecf20Sopenharmony_ci
24818c2ecf20Sopenharmony_ci		group_offsets[group] = gi->base_offset;
24828c2ecf20Sopenharmony_ci		group_sizes[group] = gi->nr_units * ai->unit_size;
24838c2ecf20Sopenharmony_ci
24848c2ecf20Sopenharmony_ci		for (i = 0; i < gi->nr_units; i++) {
24858c2ecf20Sopenharmony_ci			cpu = gi->cpu_map[i];
24868c2ecf20Sopenharmony_ci			if (cpu == NR_CPUS)
24878c2ecf20Sopenharmony_ci				continue;
24888c2ecf20Sopenharmony_ci
24898c2ecf20Sopenharmony_ci			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
24908c2ecf20Sopenharmony_ci			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
24918c2ecf20Sopenharmony_ci			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
24928c2ecf20Sopenharmony_ci
24938c2ecf20Sopenharmony_ci			unit_map[cpu] = unit + i;
24948c2ecf20Sopenharmony_ci			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
24958c2ecf20Sopenharmony_ci
24968c2ecf20Sopenharmony_ci			/* determine low/high unit_cpu */
24978c2ecf20Sopenharmony_ci			if (pcpu_low_unit_cpu == NR_CPUS ||
24988c2ecf20Sopenharmony_ci			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
24998c2ecf20Sopenharmony_ci				pcpu_low_unit_cpu = cpu;
25008c2ecf20Sopenharmony_ci			if (pcpu_high_unit_cpu == NR_CPUS ||
25018c2ecf20Sopenharmony_ci			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
25028c2ecf20Sopenharmony_ci				pcpu_high_unit_cpu = cpu;
25038c2ecf20Sopenharmony_ci		}
25048c2ecf20Sopenharmony_ci	}
25058c2ecf20Sopenharmony_ci	pcpu_nr_units = unit;
25068c2ecf20Sopenharmony_ci
25078c2ecf20Sopenharmony_ci	for_each_possible_cpu(cpu)
25088c2ecf20Sopenharmony_ci		PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
25098c2ecf20Sopenharmony_ci
25108c2ecf20Sopenharmony_ci	/* we're done parsing the input, undefine BUG macro and dump config */
25118c2ecf20Sopenharmony_ci#undef PCPU_SETUP_BUG_ON
25128c2ecf20Sopenharmony_ci	pcpu_dump_alloc_info(KERN_DEBUG, ai);
25138c2ecf20Sopenharmony_ci
25148c2ecf20Sopenharmony_ci	pcpu_nr_groups = ai->nr_groups;
25158c2ecf20Sopenharmony_ci	pcpu_group_offsets = group_offsets;
25168c2ecf20Sopenharmony_ci	pcpu_group_sizes = group_sizes;
25178c2ecf20Sopenharmony_ci	pcpu_unit_map = unit_map;
25188c2ecf20Sopenharmony_ci	pcpu_unit_offsets = unit_off;
25198c2ecf20Sopenharmony_ci
25208c2ecf20Sopenharmony_ci	/* determine basic parameters */
25218c2ecf20Sopenharmony_ci	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
25228c2ecf20Sopenharmony_ci	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
25238c2ecf20Sopenharmony_ci	pcpu_atom_size = ai->atom_size;
25248c2ecf20Sopenharmony_ci	pcpu_chunk_struct_size = struct_size(chunk, populated,
25258c2ecf20Sopenharmony_ci					     BITS_TO_LONGS(pcpu_unit_pages));
25268c2ecf20Sopenharmony_ci
25278c2ecf20Sopenharmony_ci	pcpu_stats_save_ai(ai);
25288c2ecf20Sopenharmony_ci
25298c2ecf20Sopenharmony_ci	/*
25308c2ecf20Sopenharmony_ci	 * Allocate chunk slots.  The additional last slot is for
25318c2ecf20Sopenharmony_ci	 * empty chunks.
25328c2ecf20Sopenharmony_ci	 */
25338c2ecf20Sopenharmony_ci	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
25348c2ecf20Sopenharmony_ci	pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
25358c2ecf20Sopenharmony_ci					  sizeof(pcpu_chunk_lists[0]) *
25368c2ecf20Sopenharmony_ci					  PCPU_NR_CHUNK_TYPES,
25378c2ecf20Sopenharmony_ci					  SMP_CACHE_BYTES);
25388c2ecf20Sopenharmony_ci	if (!pcpu_chunk_lists)
25398c2ecf20Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
25408c2ecf20Sopenharmony_ci		      pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
25418c2ecf20Sopenharmony_ci		      PCPU_NR_CHUNK_TYPES);
25428c2ecf20Sopenharmony_ci
25438c2ecf20Sopenharmony_ci	for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
25448c2ecf20Sopenharmony_ci		for (i = 0; i < pcpu_nr_slots; i++)
25458c2ecf20Sopenharmony_ci			INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);
25468c2ecf20Sopenharmony_ci
25478c2ecf20Sopenharmony_ci	/*
25488c2ecf20Sopenharmony_ci	 * The end of the static region needs to be aligned with the
25498c2ecf20Sopenharmony_ci	 * minimum allocation size as this offsets the reserved and
25508c2ecf20Sopenharmony_ci	 * dynamic region.  The first chunk ends page aligned by
25518c2ecf20Sopenharmony_ci	 * expanding the dynamic region, therefore the dynamic region
25528c2ecf20Sopenharmony_ci	 * can be shrunk to compensate while still staying above the
25538c2ecf20Sopenharmony_ci	 * configured sizes.
25548c2ecf20Sopenharmony_ci	 */
25558c2ecf20Sopenharmony_ci	static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
25568c2ecf20Sopenharmony_ci	dyn_size = ai->dyn_size - (static_size - ai->static_size);
25578c2ecf20Sopenharmony_ci
25588c2ecf20Sopenharmony_ci	/*
25598c2ecf20Sopenharmony_ci	 * Initialize first chunk.
25608c2ecf20Sopenharmony_ci	 * If the reserved_size is non-zero, this initializes the reserved
25618c2ecf20Sopenharmony_ci	 * chunk.  If the reserved_size is zero, the reserved chunk is NULL
25628c2ecf20Sopenharmony_ci	 * and the dynamic region is initialized here.  The first chunk,
25638c2ecf20Sopenharmony_ci	 * pcpu_first_chunk, will always point to the chunk that serves
25648c2ecf20Sopenharmony_ci	 * the dynamic region.
25658c2ecf20Sopenharmony_ci	 */
25668c2ecf20Sopenharmony_ci	tmp_addr = (unsigned long)base_addr + static_size;
25678c2ecf20Sopenharmony_ci	map_size = ai->reserved_size ?: dyn_size;
25688c2ecf20Sopenharmony_ci	chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
25698c2ecf20Sopenharmony_ci
25708c2ecf20Sopenharmony_ci	/* init dynamic chunk if necessary */
25718c2ecf20Sopenharmony_ci	if (ai->reserved_size) {
25728c2ecf20Sopenharmony_ci		pcpu_reserved_chunk = chunk;
25738c2ecf20Sopenharmony_ci
25748c2ecf20Sopenharmony_ci		tmp_addr = (unsigned long)base_addr + static_size +
25758c2ecf20Sopenharmony_ci			   ai->reserved_size;
25768c2ecf20Sopenharmony_ci		map_size = dyn_size;
25778c2ecf20Sopenharmony_ci		chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
25788c2ecf20Sopenharmony_ci	}
25798c2ecf20Sopenharmony_ci
25808c2ecf20Sopenharmony_ci	/* link the first chunk in */
25818c2ecf20Sopenharmony_ci	pcpu_first_chunk = chunk;
25828c2ecf20Sopenharmony_ci	pcpu_nr_empty_pop_pages[PCPU_CHUNK_ROOT] = pcpu_first_chunk->nr_empty_pop_pages;
25838c2ecf20Sopenharmony_ci	pcpu_chunk_relocate(pcpu_first_chunk, -1);
25848c2ecf20Sopenharmony_ci
25858c2ecf20Sopenharmony_ci	/* include all regions of the first chunk */
25868c2ecf20Sopenharmony_ci	pcpu_nr_populated += PFN_DOWN(size_sum);
25878c2ecf20Sopenharmony_ci
25888c2ecf20Sopenharmony_ci	pcpu_stats_chunk_alloc();
25898c2ecf20Sopenharmony_ci	trace_percpu_create_chunk(base_addr);
25908c2ecf20Sopenharmony_ci
25918c2ecf20Sopenharmony_ci	/* we're done */
25928c2ecf20Sopenharmony_ci	pcpu_base_addr = base_addr;
25938c2ecf20Sopenharmony_ci}
25948c2ecf20Sopenharmony_ci
25958c2ecf20Sopenharmony_ci#ifdef CONFIG_SMP
25968c2ecf20Sopenharmony_ci
25978c2ecf20Sopenharmony_ciconst char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
25988c2ecf20Sopenharmony_ci	[PCPU_FC_AUTO]	= "auto",
25998c2ecf20Sopenharmony_ci	[PCPU_FC_EMBED]	= "embed",
26008c2ecf20Sopenharmony_ci	[PCPU_FC_PAGE]	= "page",
26018c2ecf20Sopenharmony_ci};
26028c2ecf20Sopenharmony_ci
26038c2ecf20Sopenharmony_cienum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
26048c2ecf20Sopenharmony_ci
26058c2ecf20Sopenharmony_cistatic int __init percpu_alloc_setup(char *str)
26068c2ecf20Sopenharmony_ci{
26078c2ecf20Sopenharmony_ci	if (!str)
26088c2ecf20Sopenharmony_ci		return -EINVAL;
26098c2ecf20Sopenharmony_ci
26108c2ecf20Sopenharmony_ci	if (0)
26118c2ecf20Sopenharmony_ci		/* nada */;
26128c2ecf20Sopenharmony_ci#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
26138c2ecf20Sopenharmony_ci	else if (!strcmp(str, "embed"))
26148c2ecf20Sopenharmony_ci		pcpu_chosen_fc = PCPU_FC_EMBED;
26158c2ecf20Sopenharmony_ci#endif
26168c2ecf20Sopenharmony_ci#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
26178c2ecf20Sopenharmony_ci	else if (!strcmp(str, "page"))
26188c2ecf20Sopenharmony_ci		pcpu_chosen_fc = PCPU_FC_PAGE;
26198c2ecf20Sopenharmony_ci#endif
26208c2ecf20Sopenharmony_ci	else
26218c2ecf20Sopenharmony_ci		pr_warn("unknown allocator %s specified\n", str);
26228c2ecf20Sopenharmony_ci
26238c2ecf20Sopenharmony_ci	return 0;
26248c2ecf20Sopenharmony_ci}
26258c2ecf20Sopenharmony_ciearly_param("percpu_alloc", percpu_alloc_setup);
26268c2ecf20Sopenharmony_ci
26278c2ecf20Sopenharmony_ci/*
26288c2ecf20Sopenharmony_ci * pcpu_embed_first_chunk() is used by the generic percpu setup.
26298c2ecf20Sopenharmony_ci * Build it if needed by the arch config or the generic setup is going
26308c2ecf20Sopenharmony_ci * to be used.
26318c2ecf20Sopenharmony_ci */
26328c2ecf20Sopenharmony_ci#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
26338c2ecf20Sopenharmony_ci	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
26348c2ecf20Sopenharmony_ci#define BUILD_EMBED_FIRST_CHUNK
26358c2ecf20Sopenharmony_ci#endif
26368c2ecf20Sopenharmony_ci
26378c2ecf20Sopenharmony_ci/* build pcpu_page_first_chunk() iff needed by the arch config */
26388c2ecf20Sopenharmony_ci#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
26398c2ecf20Sopenharmony_ci#define BUILD_PAGE_FIRST_CHUNK
26408c2ecf20Sopenharmony_ci#endif
26418c2ecf20Sopenharmony_ci
26428c2ecf20Sopenharmony_ci/* pcpu_build_alloc_info() is used by both embed and page first chunk */
26438c2ecf20Sopenharmony_ci#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
26448c2ecf20Sopenharmony_ci/**
26458c2ecf20Sopenharmony_ci * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
26468c2ecf20Sopenharmony_ci * @reserved_size: the size of reserved percpu area in bytes
26478c2ecf20Sopenharmony_ci * @dyn_size: minimum free size for dynamic allocation in bytes
26488c2ecf20Sopenharmony_ci * @atom_size: allocation atom size
26498c2ecf20Sopenharmony_ci * @cpu_distance_fn: callback to determine distance between cpus, optional
26508c2ecf20Sopenharmony_ci *
26518c2ecf20Sopenharmony_ci * This function determines grouping of units, their mappings to cpus
26528c2ecf20Sopenharmony_ci * and other parameters considering needed percpu size, allocation
26538c2ecf20Sopenharmony_ci * atom size and distances between CPUs.
26548c2ecf20Sopenharmony_ci *
26558c2ecf20Sopenharmony_ci * Groups are always multiples of atom size and CPUs which are of
26568c2ecf20Sopenharmony_ci * LOCAL_DISTANCE both ways are grouped together and share space for
26578c2ecf20Sopenharmony_ci * units in the same group.  The returned configuration is guaranteed
26588c2ecf20Sopenharmony_ci * to have CPUs on different nodes on different groups and >=75% usage
26598c2ecf20Sopenharmony_ci * of allocated virtual address space.
26608c2ecf20Sopenharmony_ci *
26618c2ecf20Sopenharmony_ci * RETURNS:
26628c2ecf20Sopenharmony_ci * On success, pointer to the new allocation_info is returned.  On
26638c2ecf20Sopenharmony_ci * failure, ERR_PTR value is returned.
26648c2ecf20Sopenharmony_ci */
26658c2ecf20Sopenharmony_cistatic struct pcpu_alloc_info * __init pcpu_build_alloc_info(
26668c2ecf20Sopenharmony_ci				size_t reserved_size, size_t dyn_size,
26678c2ecf20Sopenharmony_ci				size_t atom_size,
26688c2ecf20Sopenharmony_ci				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
26698c2ecf20Sopenharmony_ci{
26708c2ecf20Sopenharmony_ci	static int group_map[NR_CPUS] __initdata;
26718c2ecf20Sopenharmony_ci	static int group_cnt[NR_CPUS] __initdata;
26728c2ecf20Sopenharmony_ci	const size_t static_size = __per_cpu_end - __per_cpu_start;
26738c2ecf20Sopenharmony_ci	int nr_groups = 1, nr_units = 0;
26748c2ecf20Sopenharmony_ci	size_t size_sum, min_unit_size, alloc_size;
26758c2ecf20Sopenharmony_ci	int upa, max_upa, best_upa;	/* units_per_alloc */
26768c2ecf20Sopenharmony_ci	int last_allocs, group, unit;
26778c2ecf20Sopenharmony_ci	unsigned int cpu, tcpu;
26788c2ecf20Sopenharmony_ci	struct pcpu_alloc_info *ai;
26798c2ecf20Sopenharmony_ci	unsigned int *cpu_map;
26808c2ecf20Sopenharmony_ci
26818c2ecf20Sopenharmony_ci	/* this function may be called multiple times */
26828c2ecf20Sopenharmony_ci	memset(group_map, 0, sizeof(group_map));
26838c2ecf20Sopenharmony_ci	memset(group_cnt, 0, sizeof(group_cnt));
26848c2ecf20Sopenharmony_ci
26858c2ecf20Sopenharmony_ci	/* calculate size_sum and ensure dyn_size is enough for early alloc */
26868c2ecf20Sopenharmony_ci	size_sum = PFN_ALIGN(static_size + reserved_size +
26878c2ecf20Sopenharmony_ci			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
26888c2ecf20Sopenharmony_ci	dyn_size = size_sum - static_size - reserved_size;
26898c2ecf20Sopenharmony_ci
26908c2ecf20Sopenharmony_ci	/*
26918c2ecf20Sopenharmony_ci	 * Determine min_unit_size, alloc_size and max_upa such that
26928c2ecf20Sopenharmony_ci	 * alloc_size is multiple of atom_size and is the smallest
26938c2ecf20Sopenharmony_ci	 * which can accommodate 4k aligned segments which are equal to
26948c2ecf20Sopenharmony_ci	 * or larger than min_unit_size.
26958c2ecf20Sopenharmony_ci	 */
26968c2ecf20Sopenharmony_ci	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
26978c2ecf20Sopenharmony_ci
26988c2ecf20Sopenharmony_ci	/* determine the maximum # of units that can fit in an allocation */
26998c2ecf20Sopenharmony_ci	alloc_size = roundup(min_unit_size, atom_size);
27008c2ecf20Sopenharmony_ci	upa = alloc_size / min_unit_size;
27018c2ecf20Sopenharmony_ci	while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
27028c2ecf20Sopenharmony_ci		upa--;
27038c2ecf20Sopenharmony_ci	max_upa = upa;
27048c2ecf20Sopenharmony_ci
27058c2ecf20Sopenharmony_ci	/* group cpus according to their proximity */
27068c2ecf20Sopenharmony_ci	for_each_possible_cpu(cpu) {
27078c2ecf20Sopenharmony_ci		group = 0;
27088c2ecf20Sopenharmony_ci	next_group:
27098c2ecf20Sopenharmony_ci		for_each_possible_cpu(tcpu) {
27108c2ecf20Sopenharmony_ci			if (cpu == tcpu)
27118c2ecf20Sopenharmony_ci				break;
27128c2ecf20Sopenharmony_ci			if (group_map[tcpu] == group && cpu_distance_fn &&
27138c2ecf20Sopenharmony_ci			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
27148c2ecf20Sopenharmony_ci			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
27158c2ecf20Sopenharmony_ci				group++;
27168c2ecf20Sopenharmony_ci				nr_groups = max(nr_groups, group + 1);
27178c2ecf20Sopenharmony_ci				goto next_group;
27188c2ecf20Sopenharmony_ci			}
27198c2ecf20Sopenharmony_ci		}
27208c2ecf20Sopenharmony_ci		group_map[cpu] = group;
27218c2ecf20Sopenharmony_ci		group_cnt[group]++;
27228c2ecf20Sopenharmony_ci	}
27238c2ecf20Sopenharmony_ci
27248c2ecf20Sopenharmony_ci	/*
27258c2ecf20Sopenharmony_ci	 * Wasted space is caused by a ratio imbalance of upa to group_cnt.
27268c2ecf20Sopenharmony_ci	 * Expand the unit_size until we use >= 75% of the units allocated.
27278c2ecf20Sopenharmony_ci	 * Related to atom_size, which could be much larger than the unit_size.
27288c2ecf20Sopenharmony_ci	 */
27298c2ecf20Sopenharmony_ci	last_allocs = INT_MAX;
27308c2ecf20Sopenharmony_ci	for (upa = max_upa; upa; upa--) {
27318c2ecf20Sopenharmony_ci		int allocs = 0, wasted = 0;
27328c2ecf20Sopenharmony_ci
27338c2ecf20Sopenharmony_ci		if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
27348c2ecf20Sopenharmony_ci			continue;
27358c2ecf20Sopenharmony_ci
27368c2ecf20Sopenharmony_ci		for (group = 0; group < nr_groups; group++) {
27378c2ecf20Sopenharmony_ci			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
27388c2ecf20Sopenharmony_ci			allocs += this_allocs;
27398c2ecf20Sopenharmony_ci			wasted += this_allocs * upa - group_cnt[group];
27408c2ecf20Sopenharmony_ci		}
27418c2ecf20Sopenharmony_ci
27428c2ecf20Sopenharmony_ci		/*
27438c2ecf20Sopenharmony_ci		 * Don't accept if wastage is over 1/3.  The
27448c2ecf20Sopenharmony_ci		 * greater-than comparison ensures upa==1 always
27458c2ecf20Sopenharmony_ci		 * passes the following check.
27468c2ecf20Sopenharmony_ci		 */
27478c2ecf20Sopenharmony_ci		if (wasted > num_possible_cpus() / 3)
27488c2ecf20Sopenharmony_ci			continue;
27498c2ecf20Sopenharmony_ci
27508c2ecf20Sopenharmony_ci		/* and then don't consume more memory */
27518c2ecf20Sopenharmony_ci		if (allocs > last_allocs)
27528c2ecf20Sopenharmony_ci			break;
27538c2ecf20Sopenharmony_ci		last_allocs = allocs;
27548c2ecf20Sopenharmony_ci		best_upa = upa;
27558c2ecf20Sopenharmony_ci	}
27568c2ecf20Sopenharmony_ci	upa = best_upa;
27578c2ecf20Sopenharmony_ci
27588c2ecf20Sopenharmony_ci	/* allocate and fill alloc_info */
27598c2ecf20Sopenharmony_ci	for (group = 0; group < nr_groups; group++)
27608c2ecf20Sopenharmony_ci		nr_units += roundup(group_cnt[group], upa);
27618c2ecf20Sopenharmony_ci
27628c2ecf20Sopenharmony_ci	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
27638c2ecf20Sopenharmony_ci	if (!ai)
27648c2ecf20Sopenharmony_ci		return ERR_PTR(-ENOMEM);
27658c2ecf20Sopenharmony_ci	cpu_map = ai->groups[0].cpu_map;
27668c2ecf20Sopenharmony_ci
27678c2ecf20Sopenharmony_ci	for (group = 0; group < nr_groups; group++) {
27688c2ecf20Sopenharmony_ci		ai->groups[group].cpu_map = cpu_map;
27698c2ecf20Sopenharmony_ci		cpu_map += roundup(group_cnt[group], upa);
27708c2ecf20Sopenharmony_ci	}
27718c2ecf20Sopenharmony_ci
27728c2ecf20Sopenharmony_ci	ai->static_size = static_size;
27738c2ecf20Sopenharmony_ci	ai->reserved_size = reserved_size;
27748c2ecf20Sopenharmony_ci	ai->dyn_size = dyn_size;
27758c2ecf20Sopenharmony_ci	ai->unit_size = alloc_size / upa;
27768c2ecf20Sopenharmony_ci	ai->atom_size = atom_size;
27778c2ecf20Sopenharmony_ci	ai->alloc_size = alloc_size;
27788c2ecf20Sopenharmony_ci
27798c2ecf20Sopenharmony_ci	for (group = 0, unit = 0; group < nr_groups; group++) {
27808c2ecf20Sopenharmony_ci		struct pcpu_group_info *gi = &ai->groups[group];
27818c2ecf20Sopenharmony_ci
27828c2ecf20Sopenharmony_ci		/*
27838c2ecf20Sopenharmony_ci		 * Initialize base_offset as if all groups are located
27848c2ecf20Sopenharmony_ci		 * back-to-back.  The caller should update this to
27858c2ecf20Sopenharmony_ci		 * reflect actual allocation.
27868c2ecf20Sopenharmony_ci		 */
27878c2ecf20Sopenharmony_ci		gi->base_offset = unit * ai->unit_size;
27888c2ecf20Sopenharmony_ci
27898c2ecf20Sopenharmony_ci		for_each_possible_cpu(cpu)
27908c2ecf20Sopenharmony_ci			if (group_map[cpu] == group)
27918c2ecf20Sopenharmony_ci				gi->cpu_map[gi->nr_units++] = cpu;
27928c2ecf20Sopenharmony_ci		gi->nr_units = roundup(gi->nr_units, upa);
27938c2ecf20Sopenharmony_ci		unit += gi->nr_units;
27948c2ecf20Sopenharmony_ci	}
27958c2ecf20Sopenharmony_ci	BUG_ON(unit != nr_units);
27968c2ecf20Sopenharmony_ci
27978c2ecf20Sopenharmony_ci	return ai;
27988c2ecf20Sopenharmony_ci}
27998c2ecf20Sopenharmony_ci#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
28008c2ecf20Sopenharmony_ci
28018c2ecf20Sopenharmony_ci#if defined(BUILD_EMBED_FIRST_CHUNK)
28028c2ecf20Sopenharmony_ci/**
28038c2ecf20Sopenharmony_ci * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
28048c2ecf20Sopenharmony_ci * @reserved_size: the size of reserved percpu area in bytes
28058c2ecf20Sopenharmony_ci * @dyn_size: minimum free size for dynamic allocation in bytes
28068c2ecf20Sopenharmony_ci * @atom_size: allocation atom size
28078c2ecf20Sopenharmony_ci * @cpu_distance_fn: callback to determine distance between cpus, optional
28088c2ecf20Sopenharmony_ci * @alloc_fn: function to allocate percpu page
28098c2ecf20Sopenharmony_ci * @free_fn: function to free percpu page
28108c2ecf20Sopenharmony_ci *
28118c2ecf20Sopenharmony_ci * This is a helper to ease setting up embedded first percpu chunk and
28128c2ecf20Sopenharmony_ci * can be called where pcpu_setup_first_chunk() is expected.
28138c2ecf20Sopenharmony_ci *
28148c2ecf20Sopenharmony_ci * If this function is used to setup the first chunk, it is allocated
28158c2ecf20Sopenharmony_ci * by calling @alloc_fn and used as-is without being mapped into
28168c2ecf20Sopenharmony_ci * vmalloc area.  Allocations are always whole multiples of @atom_size
28178c2ecf20Sopenharmony_ci * aligned to @atom_size.
28188c2ecf20Sopenharmony_ci *
28198c2ecf20Sopenharmony_ci * This enables the first chunk to piggy back on the linear physical
28208c2ecf20Sopenharmony_ci * mapping which often uses larger page size.  Please note that this
28218c2ecf20Sopenharmony_ci * can result in very sparse cpu->unit mapping on NUMA machines thus
28228c2ecf20Sopenharmony_ci * requiring large vmalloc address space.  Don't use this allocator if
28238c2ecf20Sopenharmony_ci * vmalloc space is not orders of magnitude larger than distances
28248c2ecf20Sopenharmony_ci * between node memory addresses (ie. 32bit NUMA machines).
28258c2ecf20Sopenharmony_ci *
28268c2ecf20Sopenharmony_ci * @dyn_size specifies the minimum dynamic area size.
28278c2ecf20Sopenharmony_ci *
28288c2ecf20Sopenharmony_ci * If the needed size is smaller than the minimum or specified unit
28298c2ecf20Sopenharmony_ci * size, the leftover is returned using @free_fn.
28308c2ecf20Sopenharmony_ci *
28318c2ecf20Sopenharmony_ci * RETURNS:
28328c2ecf20Sopenharmony_ci * 0 on success, -errno on failure.
28338c2ecf20Sopenharmony_ci */
28348c2ecf20Sopenharmony_ciint __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
28358c2ecf20Sopenharmony_ci				  size_t atom_size,
28368c2ecf20Sopenharmony_ci				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
28378c2ecf20Sopenharmony_ci				  pcpu_fc_alloc_fn_t alloc_fn,
28388c2ecf20Sopenharmony_ci				  pcpu_fc_free_fn_t free_fn)
28398c2ecf20Sopenharmony_ci{
28408c2ecf20Sopenharmony_ci	void *base = (void *)ULONG_MAX;
28418c2ecf20Sopenharmony_ci	void **areas = NULL;
28428c2ecf20Sopenharmony_ci	struct pcpu_alloc_info *ai;
28438c2ecf20Sopenharmony_ci	size_t size_sum, areas_size;
28448c2ecf20Sopenharmony_ci	unsigned long max_distance;
28458c2ecf20Sopenharmony_ci	int group, i, highest_group, rc = 0;
28468c2ecf20Sopenharmony_ci
28478c2ecf20Sopenharmony_ci	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
28488c2ecf20Sopenharmony_ci				   cpu_distance_fn);
28498c2ecf20Sopenharmony_ci	if (IS_ERR(ai))
28508c2ecf20Sopenharmony_ci		return PTR_ERR(ai);
28518c2ecf20Sopenharmony_ci
28528c2ecf20Sopenharmony_ci	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
28538c2ecf20Sopenharmony_ci	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
28548c2ecf20Sopenharmony_ci
28558c2ecf20Sopenharmony_ci	areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
28568c2ecf20Sopenharmony_ci	if (!areas) {
28578c2ecf20Sopenharmony_ci		rc = -ENOMEM;
28588c2ecf20Sopenharmony_ci		goto out_free;
28598c2ecf20Sopenharmony_ci	}
28608c2ecf20Sopenharmony_ci
28618c2ecf20Sopenharmony_ci	/* allocate, copy and determine base address & max_distance */
28628c2ecf20Sopenharmony_ci	highest_group = 0;
28638c2ecf20Sopenharmony_ci	for (group = 0; group < ai->nr_groups; group++) {
28648c2ecf20Sopenharmony_ci		struct pcpu_group_info *gi = &ai->groups[group];
28658c2ecf20Sopenharmony_ci		unsigned int cpu = NR_CPUS;
28668c2ecf20Sopenharmony_ci		void *ptr;
28678c2ecf20Sopenharmony_ci
28688c2ecf20Sopenharmony_ci		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
28698c2ecf20Sopenharmony_ci			cpu = gi->cpu_map[i];
28708c2ecf20Sopenharmony_ci		BUG_ON(cpu == NR_CPUS);
28718c2ecf20Sopenharmony_ci
28728c2ecf20Sopenharmony_ci		/* allocate space for the whole group */
28738c2ecf20Sopenharmony_ci		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
28748c2ecf20Sopenharmony_ci		if (!ptr) {
28758c2ecf20Sopenharmony_ci			rc = -ENOMEM;
28768c2ecf20Sopenharmony_ci			goto out_free_areas;
28778c2ecf20Sopenharmony_ci		}
28788c2ecf20Sopenharmony_ci		/* kmemleak tracks the percpu allocations separately */
28798c2ecf20Sopenharmony_ci		kmemleak_free(ptr);
28808c2ecf20Sopenharmony_ci		areas[group] = ptr;
28818c2ecf20Sopenharmony_ci
28828c2ecf20Sopenharmony_ci		base = min(ptr, base);
28838c2ecf20Sopenharmony_ci		if (ptr > areas[highest_group])
28848c2ecf20Sopenharmony_ci			highest_group = group;
28858c2ecf20Sopenharmony_ci	}
28868c2ecf20Sopenharmony_ci	max_distance = areas[highest_group] - base;
28878c2ecf20Sopenharmony_ci	max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
28888c2ecf20Sopenharmony_ci
28898c2ecf20Sopenharmony_ci	/* warn if maximum distance is further than 75% of vmalloc space */
28908c2ecf20Sopenharmony_ci	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
28918c2ecf20Sopenharmony_ci		pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
28928c2ecf20Sopenharmony_ci				max_distance, VMALLOC_TOTAL);
28938c2ecf20Sopenharmony_ci#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
28948c2ecf20Sopenharmony_ci		/* and fail if we have fallback */
28958c2ecf20Sopenharmony_ci		rc = -EINVAL;
28968c2ecf20Sopenharmony_ci		goto out_free_areas;
28978c2ecf20Sopenharmony_ci#endif
28988c2ecf20Sopenharmony_ci	}
28998c2ecf20Sopenharmony_ci
29008c2ecf20Sopenharmony_ci	/*
29018c2ecf20Sopenharmony_ci	 * Copy data and free unused parts.  This should happen after all
29028c2ecf20Sopenharmony_ci	 * allocations are complete; otherwise, we may end up with
29038c2ecf20Sopenharmony_ci	 * overlapping groups.
29048c2ecf20Sopenharmony_ci	 */
29058c2ecf20Sopenharmony_ci	for (group = 0; group < ai->nr_groups; group++) {
29068c2ecf20Sopenharmony_ci		struct pcpu_group_info *gi = &ai->groups[group];
29078c2ecf20Sopenharmony_ci		void *ptr = areas[group];
29088c2ecf20Sopenharmony_ci
29098c2ecf20Sopenharmony_ci		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
29108c2ecf20Sopenharmony_ci			if (gi->cpu_map[i] == NR_CPUS) {
29118c2ecf20Sopenharmony_ci				/* unused unit, free whole */
29128c2ecf20Sopenharmony_ci				free_fn(ptr, ai->unit_size);
29138c2ecf20Sopenharmony_ci				continue;
29148c2ecf20Sopenharmony_ci			}
29158c2ecf20Sopenharmony_ci			/* copy and return the unused part */
29168c2ecf20Sopenharmony_ci			memcpy(ptr, __per_cpu_load, ai->static_size);
29178c2ecf20Sopenharmony_ci			free_fn(ptr + size_sum, ai->unit_size - size_sum);
29188c2ecf20Sopenharmony_ci		}
29198c2ecf20Sopenharmony_ci	}
29208c2ecf20Sopenharmony_ci
29218c2ecf20Sopenharmony_ci	/* base address is now known, determine group base offsets */
29228c2ecf20Sopenharmony_ci	for (group = 0; group < ai->nr_groups; group++) {
29238c2ecf20Sopenharmony_ci		ai->groups[group].base_offset = areas[group] - base;
29248c2ecf20Sopenharmony_ci	}
29258c2ecf20Sopenharmony_ci
29268c2ecf20Sopenharmony_ci	pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
29278c2ecf20Sopenharmony_ci		PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
29288c2ecf20Sopenharmony_ci		ai->dyn_size, ai->unit_size);
29298c2ecf20Sopenharmony_ci
29308c2ecf20Sopenharmony_ci	pcpu_setup_first_chunk(ai, base);
29318c2ecf20Sopenharmony_ci	goto out_free;
29328c2ecf20Sopenharmony_ci
29338c2ecf20Sopenharmony_ciout_free_areas:
29348c2ecf20Sopenharmony_ci	for (group = 0; group < ai->nr_groups; group++)
29358c2ecf20Sopenharmony_ci		if (areas[group])
29368c2ecf20Sopenharmony_ci			free_fn(areas[group],
29378c2ecf20Sopenharmony_ci				ai->groups[group].nr_units * ai->unit_size);
29388c2ecf20Sopenharmony_ciout_free:
29398c2ecf20Sopenharmony_ci	pcpu_free_alloc_info(ai);
29408c2ecf20Sopenharmony_ci	if (areas)
29418c2ecf20Sopenharmony_ci		memblock_free_early(__pa(areas), areas_size);
29428c2ecf20Sopenharmony_ci	return rc;
29438c2ecf20Sopenharmony_ci}
29448c2ecf20Sopenharmony_ci#endif /* BUILD_EMBED_FIRST_CHUNK */
29458c2ecf20Sopenharmony_ci
29468c2ecf20Sopenharmony_ci#ifdef BUILD_PAGE_FIRST_CHUNK
29478c2ecf20Sopenharmony_ci/**
29488c2ecf20Sopenharmony_ci * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
29498c2ecf20Sopenharmony_ci * @reserved_size: the size of reserved percpu area in bytes
29508c2ecf20Sopenharmony_ci * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
29518c2ecf20Sopenharmony_ci * @free_fn: function to free percpu page, always called with PAGE_SIZE
29528c2ecf20Sopenharmony_ci * @populate_pte_fn: function to populate pte
29538c2ecf20Sopenharmony_ci *
29548c2ecf20Sopenharmony_ci * This is a helper to ease setting up page-remapped first percpu
29558c2ecf20Sopenharmony_ci * chunk and can be called where pcpu_setup_first_chunk() is expected.
29568c2ecf20Sopenharmony_ci *
29578c2ecf20Sopenharmony_ci * This is the basic allocator.  Static percpu area is allocated
29588c2ecf20Sopenharmony_ci * page-by-page into vmalloc area.
29598c2ecf20Sopenharmony_ci *
29608c2ecf20Sopenharmony_ci * RETURNS:
29618c2ecf20Sopenharmony_ci * 0 on success, -errno on failure.
29628c2ecf20Sopenharmony_ci */
29638c2ecf20Sopenharmony_ciint __init pcpu_page_first_chunk(size_t reserved_size,
29648c2ecf20Sopenharmony_ci				 pcpu_fc_alloc_fn_t alloc_fn,
29658c2ecf20Sopenharmony_ci				 pcpu_fc_free_fn_t free_fn,
29668c2ecf20Sopenharmony_ci				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
29678c2ecf20Sopenharmony_ci{
29688c2ecf20Sopenharmony_ci	static struct vm_struct vm;
29698c2ecf20Sopenharmony_ci	struct pcpu_alloc_info *ai;
29708c2ecf20Sopenharmony_ci	char psize_str[16];
29718c2ecf20Sopenharmony_ci	int unit_pages;
29728c2ecf20Sopenharmony_ci	size_t pages_size;
29738c2ecf20Sopenharmony_ci	struct page **pages;
29748c2ecf20Sopenharmony_ci	int unit, i, j, rc = 0;
29758c2ecf20Sopenharmony_ci	int upa;
29768c2ecf20Sopenharmony_ci	int nr_g0_units;
29778c2ecf20Sopenharmony_ci
29788c2ecf20Sopenharmony_ci	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
29798c2ecf20Sopenharmony_ci
29808c2ecf20Sopenharmony_ci	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
29818c2ecf20Sopenharmony_ci	if (IS_ERR(ai))
29828c2ecf20Sopenharmony_ci		return PTR_ERR(ai);
29838c2ecf20Sopenharmony_ci	BUG_ON(ai->nr_groups != 1);
29848c2ecf20Sopenharmony_ci	upa = ai->alloc_size/ai->unit_size;
29858c2ecf20Sopenharmony_ci	nr_g0_units = roundup(num_possible_cpus(), upa);
29868c2ecf20Sopenharmony_ci	if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
29878c2ecf20Sopenharmony_ci		pcpu_free_alloc_info(ai);
29888c2ecf20Sopenharmony_ci		return -EINVAL;
29898c2ecf20Sopenharmony_ci	}
29908c2ecf20Sopenharmony_ci
29918c2ecf20Sopenharmony_ci	unit_pages = ai->unit_size >> PAGE_SHIFT;
29928c2ecf20Sopenharmony_ci
29938c2ecf20Sopenharmony_ci	/* unaligned allocations can't be freed, round up to page size */
29948c2ecf20Sopenharmony_ci	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
29958c2ecf20Sopenharmony_ci			       sizeof(pages[0]));
29968c2ecf20Sopenharmony_ci	pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
29978c2ecf20Sopenharmony_ci	if (!pages)
29988c2ecf20Sopenharmony_ci		panic("%s: Failed to allocate %zu bytes\n", __func__,
29998c2ecf20Sopenharmony_ci		      pages_size);
30008c2ecf20Sopenharmony_ci
30018c2ecf20Sopenharmony_ci	/* allocate pages */
30028c2ecf20Sopenharmony_ci	j = 0;
30038c2ecf20Sopenharmony_ci	for (unit = 0; unit < num_possible_cpus(); unit++) {
30048c2ecf20Sopenharmony_ci		unsigned int cpu = ai->groups[0].cpu_map[unit];
30058c2ecf20Sopenharmony_ci		for (i = 0; i < unit_pages; i++) {
30068c2ecf20Sopenharmony_ci			void *ptr;
30078c2ecf20Sopenharmony_ci
30088c2ecf20Sopenharmony_ci			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
30098c2ecf20Sopenharmony_ci			if (!ptr) {
30108c2ecf20Sopenharmony_ci				pr_warn("failed to allocate %s page for cpu%u\n",
30118c2ecf20Sopenharmony_ci						psize_str, cpu);
30128c2ecf20Sopenharmony_ci				goto enomem;
30138c2ecf20Sopenharmony_ci			}
30148c2ecf20Sopenharmony_ci			/* kmemleak tracks the percpu allocations separately */
30158c2ecf20Sopenharmony_ci			kmemleak_free(ptr);
30168c2ecf20Sopenharmony_ci			pages[j++] = virt_to_page(ptr);
30178c2ecf20Sopenharmony_ci		}
30188c2ecf20Sopenharmony_ci	}
30198c2ecf20Sopenharmony_ci
30208c2ecf20Sopenharmony_ci	/* allocate vm area, map the pages and copy static data */
30218c2ecf20Sopenharmony_ci	vm.flags = VM_ALLOC;
30228c2ecf20Sopenharmony_ci	vm.size = num_possible_cpus() * ai->unit_size;
30238c2ecf20Sopenharmony_ci	vm_area_register_early(&vm, PAGE_SIZE);
30248c2ecf20Sopenharmony_ci
30258c2ecf20Sopenharmony_ci	for (unit = 0; unit < num_possible_cpus(); unit++) {
30268c2ecf20Sopenharmony_ci		unsigned long unit_addr =
30278c2ecf20Sopenharmony_ci			(unsigned long)vm.addr + unit * ai->unit_size;
30288c2ecf20Sopenharmony_ci
30298c2ecf20Sopenharmony_ci		for (i = 0; i < unit_pages; i++)
30308c2ecf20Sopenharmony_ci			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
30318c2ecf20Sopenharmony_ci
30328c2ecf20Sopenharmony_ci		/* pte already populated, the following shouldn't fail */
30338c2ecf20Sopenharmony_ci		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
30348c2ecf20Sopenharmony_ci				      unit_pages);
30358c2ecf20Sopenharmony_ci		if (rc < 0)
30368c2ecf20Sopenharmony_ci			panic("failed to map percpu area, err=%d\n", rc);
30378c2ecf20Sopenharmony_ci
30388c2ecf20Sopenharmony_ci		/*
30398c2ecf20Sopenharmony_ci		 * FIXME: Archs with virtual cache should flush local
30408c2ecf20Sopenharmony_ci		 * cache for the linear mapping here - something
30418c2ecf20Sopenharmony_ci		 * equivalent to flush_cache_vmap() on the local cpu.
30428c2ecf20Sopenharmony_ci		 * flush_cache_vmap() can't be used as most supporting
30438c2ecf20Sopenharmony_ci		 * data structures are not set up yet.
30448c2ecf20Sopenharmony_ci		 */
30458c2ecf20Sopenharmony_ci
30468c2ecf20Sopenharmony_ci		/* copy static data */
30478c2ecf20Sopenharmony_ci		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
30488c2ecf20Sopenharmony_ci	}
30498c2ecf20Sopenharmony_ci
30508c2ecf20Sopenharmony_ci	/* we're ready, commit */
30518c2ecf20Sopenharmony_ci	pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
30528c2ecf20Sopenharmony_ci		unit_pages, psize_str, ai->static_size,
30538c2ecf20Sopenharmony_ci		ai->reserved_size, ai->dyn_size);
30548c2ecf20Sopenharmony_ci
30558c2ecf20Sopenharmony_ci	pcpu_setup_first_chunk(ai, vm.addr);
30568c2ecf20Sopenharmony_ci	goto out_free_ar;
30578c2ecf20Sopenharmony_ci
30588c2ecf20Sopenharmony_cienomem:
30598c2ecf20Sopenharmony_ci	while (--j >= 0)
30608c2ecf20Sopenharmony_ci		free_fn(page_address(pages[j]), PAGE_SIZE);
30618c2ecf20Sopenharmony_ci	rc = -ENOMEM;
30628c2ecf20Sopenharmony_ciout_free_ar:
30638c2ecf20Sopenharmony_ci	memblock_free_early(__pa(pages), pages_size);
30648c2ecf20Sopenharmony_ci	pcpu_free_alloc_info(ai);
30658c2ecf20Sopenharmony_ci	return rc;
30668c2ecf20Sopenharmony_ci}
30678c2ecf20Sopenharmony_ci#endif /* BUILD_PAGE_FIRST_CHUNK */
30688c2ecf20Sopenharmony_ci
30698c2ecf20Sopenharmony_ci#ifndef	CONFIG_HAVE_SETUP_PER_CPU_AREA
30708c2ecf20Sopenharmony_ci/*
30718c2ecf20Sopenharmony_ci * Generic SMP percpu area setup.
30728c2ecf20Sopenharmony_ci *
30738c2ecf20Sopenharmony_ci * The embedding helper is used because its behavior closely resembles
30748c2ecf20Sopenharmony_ci * the original non-dynamic generic percpu area setup.  This is
30758c2ecf20Sopenharmony_ci * important because many archs have addressing restrictions and might
30768c2ecf20Sopenharmony_ci * fail if the percpu area is located far away from the previous
30778c2ecf20Sopenharmony_ci * location.  As an added bonus, in non-NUMA cases, embedding is
30788c2ecf20Sopenharmony_ci * generally a good idea TLB-wise because percpu area can piggy back
30798c2ecf20Sopenharmony_ci * on the physical linear memory mapping which uses large page
30808c2ecf20Sopenharmony_ci * mappings on applicable archs.
30818c2ecf20Sopenharmony_ci */
30828c2ecf20Sopenharmony_ciunsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
30838c2ecf20Sopenharmony_ciEXPORT_SYMBOL(__per_cpu_offset);
30848c2ecf20Sopenharmony_ci
30858c2ecf20Sopenharmony_cistatic void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
30868c2ecf20Sopenharmony_ci				       size_t align)
30878c2ecf20Sopenharmony_ci{
30888c2ecf20Sopenharmony_ci	return  memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
30898c2ecf20Sopenharmony_ci}
30908c2ecf20Sopenharmony_ci
30918c2ecf20Sopenharmony_cistatic void __init pcpu_dfl_fc_free(void *ptr, size_t size)
30928c2ecf20Sopenharmony_ci{
30938c2ecf20Sopenharmony_ci	memblock_free_early(__pa(ptr), size);
30948c2ecf20Sopenharmony_ci}
30958c2ecf20Sopenharmony_ci
30968c2ecf20Sopenharmony_civoid __init setup_per_cpu_areas(void)
30978c2ecf20Sopenharmony_ci{
30988c2ecf20Sopenharmony_ci	unsigned long delta;
30998c2ecf20Sopenharmony_ci	unsigned int cpu;
31008c2ecf20Sopenharmony_ci	int rc;
31018c2ecf20Sopenharmony_ci
31028c2ecf20Sopenharmony_ci	/*
31038c2ecf20Sopenharmony_ci	 * Always reserve area for module percpu variables.  That's
31048c2ecf20Sopenharmony_ci	 * what the legacy allocator did.
31058c2ecf20Sopenharmony_ci	 */
31068c2ecf20Sopenharmony_ci	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
31078c2ecf20Sopenharmony_ci				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
31088c2ecf20Sopenharmony_ci				    pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
31098c2ecf20Sopenharmony_ci	if (rc < 0)
31108c2ecf20Sopenharmony_ci		panic("Failed to initialize percpu areas.");
31118c2ecf20Sopenharmony_ci
31128c2ecf20Sopenharmony_ci	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
31138c2ecf20Sopenharmony_ci	for_each_possible_cpu(cpu)
31148c2ecf20Sopenharmony_ci		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
31158c2ecf20Sopenharmony_ci}
31168c2ecf20Sopenharmony_ci#endif	/* CONFIG_HAVE_SETUP_PER_CPU_AREA */
31178c2ecf20Sopenharmony_ci
31188c2ecf20Sopenharmony_ci#else	/* CONFIG_SMP */
31198c2ecf20Sopenharmony_ci
31208c2ecf20Sopenharmony_ci/*
31218c2ecf20Sopenharmony_ci * UP percpu area setup.
31228c2ecf20Sopenharmony_ci *
31238c2ecf20Sopenharmony_ci * UP always uses km-based percpu allocator with identity mapping.
31248c2ecf20Sopenharmony_ci * Static percpu variables are indistinguishable from the usual static
31258c2ecf20Sopenharmony_ci * variables and don't require any special preparation.
31268c2ecf20Sopenharmony_ci */
31278c2ecf20Sopenharmony_civoid __init setup_per_cpu_areas(void)
31288c2ecf20Sopenharmony_ci{
31298c2ecf20Sopenharmony_ci	const size_t unit_size =
31308c2ecf20Sopenharmony_ci		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
31318c2ecf20Sopenharmony_ci					 PERCPU_DYNAMIC_RESERVE));
31328c2ecf20Sopenharmony_ci	struct pcpu_alloc_info *ai;
31338c2ecf20Sopenharmony_ci	void *fc;
31348c2ecf20Sopenharmony_ci
31358c2ecf20Sopenharmony_ci	ai = pcpu_alloc_alloc_info(1, 1);
31368c2ecf20Sopenharmony_ci	fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
31378c2ecf20Sopenharmony_ci	if (!ai || !fc)
31388c2ecf20Sopenharmony_ci		panic("Failed to allocate memory for percpu areas.");
31398c2ecf20Sopenharmony_ci	/* kmemleak tracks the percpu allocations separately */
31408c2ecf20Sopenharmony_ci	kmemleak_free(fc);
31418c2ecf20Sopenharmony_ci
31428c2ecf20Sopenharmony_ci	ai->dyn_size = unit_size;
31438c2ecf20Sopenharmony_ci	ai->unit_size = unit_size;
31448c2ecf20Sopenharmony_ci	ai->atom_size = unit_size;
31458c2ecf20Sopenharmony_ci	ai->alloc_size = unit_size;
31468c2ecf20Sopenharmony_ci	ai->groups[0].nr_units = 1;
31478c2ecf20Sopenharmony_ci	ai->groups[0].cpu_map[0] = 0;
31488c2ecf20Sopenharmony_ci
31498c2ecf20Sopenharmony_ci	pcpu_setup_first_chunk(ai, fc);
31508c2ecf20Sopenharmony_ci	pcpu_free_alloc_info(ai);
31518c2ecf20Sopenharmony_ci}
31528c2ecf20Sopenharmony_ci
31538c2ecf20Sopenharmony_ci#endif	/* CONFIG_SMP */
31548c2ecf20Sopenharmony_ci
31558c2ecf20Sopenharmony_ci/*
31568c2ecf20Sopenharmony_ci * pcpu_nr_pages - calculate total number of populated backing pages
31578c2ecf20Sopenharmony_ci *
31588c2ecf20Sopenharmony_ci * This reflects the number of pages populated to back chunks.  Metadata is
31598c2ecf20Sopenharmony_ci * excluded in the number exposed in meminfo as the number of backing pages
31608c2ecf20Sopenharmony_ci * scales with the number of cpus and can quickly outweigh the memory used for
31618c2ecf20Sopenharmony_ci * metadata.  It also keeps this calculation nice and simple.
31628c2ecf20Sopenharmony_ci *
31638c2ecf20Sopenharmony_ci * RETURNS:
31648c2ecf20Sopenharmony_ci * Total number of populated backing pages in use by the allocator.
31658c2ecf20Sopenharmony_ci */
31668c2ecf20Sopenharmony_ciunsigned long pcpu_nr_pages(void)
31678c2ecf20Sopenharmony_ci{
31688c2ecf20Sopenharmony_ci	return pcpu_nr_populated * pcpu_nr_units;
31698c2ecf20Sopenharmony_ci}
31708c2ecf20Sopenharmony_ci
31718c2ecf20Sopenharmony_ci/*
31728c2ecf20Sopenharmony_ci * Percpu allocator is initialized early during boot when neither slab or
31738c2ecf20Sopenharmony_ci * workqueue is available.  Plug async management until everything is up
31748c2ecf20Sopenharmony_ci * and running.
31758c2ecf20Sopenharmony_ci */
31768c2ecf20Sopenharmony_cistatic int __init percpu_enable_async(void)
31778c2ecf20Sopenharmony_ci{
31788c2ecf20Sopenharmony_ci	pcpu_async_enabled = true;
31798c2ecf20Sopenharmony_ci	return 0;
31808c2ecf20Sopenharmony_ci}
31818c2ecf20Sopenharmony_cisubsys_initcall(percpu_enable_async);
3182