162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Manage cache of swap slots to be used for and returned from
462306a36Sopenharmony_ci * swap.
562306a36Sopenharmony_ci *
662306a36Sopenharmony_ci * Copyright(c) 2016 Intel Corporation.
762306a36Sopenharmony_ci *
862306a36Sopenharmony_ci * Author: Tim Chen <tim.c.chen@linux.intel.com>
962306a36Sopenharmony_ci *
1062306a36Sopenharmony_ci * We allocate the swap slots from the global pool and put
1162306a36Sopenharmony_ci * it into local per cpu caches.  This has the advantage
 * of not needing to acquire the swap_info lock every time
1362306a36Sopenharmony_ci * we need a new slot.
1462306a36Sopenharmony_ci *
1562306a36Sopenharmony_ci * There is also opportunity to simply return the slot
1662306a36Sopenharmony_ci * to local caches without needing to acquire swap_info
1762306a36Sopenharmony_ci * lock.  We do not reuse the returned slots directly but
1862306a36Sopenharmony_ci * move them back to the global pool in a batch.  This
1962306a36Sopenharmony_ci * allows the slots to coalesce and reduce fragmentation.
2062306a36Sopenharmony_ci *
2162306a36Sopenharmony_ci * The swap entry allocated is marked with SWAP_HAS_CACHE
2262306a36Sopenharmony_ci * flag in map_count that prevents it from being allocated
2362306a36Sopenharmony_ci * again from the global pool.
2462306a36Sopenharmony_ci *
2562306a36Sopenharmony_ci * The swap slots cache is protected by a mutex instead of
2662306a36Sopenharmony_ci * a spin lock as when we search for slots with scan_swap_map,
2762306a36Sopenharmony_ci * we can possibly sleep.
2862306a36Sopenharmony_ci */
2962306a36Sopenharmony_ci
3062306a36Sopenharmony_ci#include <linux/swap_slots.h>
3162306a36Sopenharmony_ci#include <linux/cpu.h>
3262306a36Sopenharmony_ci#include <linux/cpumask.h>
3362306a36Sopenharmony_ci#include <linux/slab.h>
3462306a36Sopenharmony_ci#include <linux/vmalloc.h>
3562306a36Sopenharmony_ci#include <linux/mutex.h>
3662306a36Sopenharmony_ci#include <linux/mm.h>
3762306a36Sopenharmony_ci
/* Per-cpu slot caches: one allocation-side and one return-side array each. */
static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
/* True while caches may be used; toggled based on global free-slot level. */
static bool	swap_slot_cache_active;
/* Master switch; cleared during swapoff via disable_swap_slots_cache_lock(). */
bool	swap_slot_cache_enabled;
/* Set once the cpu hotplug callbacks have been registered successfully. */
static bool	swap_slot_cache_initialized;
/* Protects cache (de)activation and per-cpu cache setup/drain. */
static DEFINE_MUTEX(swap_slots_cache_mutex);
/* Serialize swap slots cache enable/disable operations */
static DEFINE_MUTEX(swap_slots_cache_enable_mutex);

static void __drain_swap_slots_cache(unsigned int type);

/* Use caches only when both the policy switch and the activity flag are on. */
#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled)
#define SLOTS_CACHE 0x1		/* drain the allocation-side cache (slots) */
#define SLOTS_CACHE_RET 0x2	/* drain the return-side cache (slots_ret) */
5162306a36Sopenharmony_ci
/*
 * Mark the caches inactive and flush all cached slots (both the
 * allocation and return sides) back to the global pool.
 */
static void deactivate_swap_slots_cache(void)
{
	mutex_lock(&swap_slots_cache_mutex);
	swap_slot_cache_active = false;
	__drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
	mutex_unlock(&swap_slots_cache_mutex);
}
5962306a36Sopenharmony_ci
/* Re-mark the caches active; nothing is drained or allocated here. */
static void reactivate_swap_slots_cache(void)
{
	mutex_lock(&swap_slots_cache_mutex);
	swap_slot_cache_active = true;
	mutex_unlock(&swap_slots_cache_mutex);
}
6662306a36Sopenharmony_ci
/* Must not be called with cpu hot plug lock */
/*
 * Disable the slot caches and drain them on every online cpu.
 * NOTE: returns with swap_slots_cache_enable_mutex still held; the
 * matching unlock happens in reenable_swap_slots_cache_unlock().
 */
void disable_swap_slots_cache_lock(void)
{
	mutex_lock(&swap_slots_cache_enable_mutex);
	swap_slot_cache_enabled = false;
	if (swap_slot_cache_initialized) {
		/* serialize with cpu hotplug operations */
		cpus_read_lock();
		__drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
		cpus_read_unlock();
	}
}
7962306a36Sopenharmony_ci
/* Re-enable the caches only if there is still usable swap configured. */
static void __reenable_swap_slots_cache(void)
{
	swap_slot_cache_enabled = has_usable_swap();
}
8462306a36Sopenharmony_ci
/*
 * Counterpart of disable_swap_slots_cache_lock(): re-enable the caches
 * (if swap is still usable) and drop the enable mutex taken there.
 */
void reenable_swap_slots_cache_unlock(void)
{
	__reenable_swap_slots_cache();
	mutex_unlock(&swap_slots_cache_enable_mutex);
}
9062306a36Sopenharmony_ci
9162306a36Sopenharmony_cistatic bool check_cache_active(void)
9262306a36Sopenharmony_ci{
9362306a36Sopenharmony_ci	long pages;
9462306a36Sopenharmony_ci
9562306a36Sopenharmony_ci	if (!swap_slot_cache_enabled)
9662306a36Sopenharmony_ci		return false;
9762306a36Sopenharmony_ci
9862306a36Sopenharmony_ci	pages = get_nr_swap_pages();
9962306a36Sopenharmony_ci	if (!swap_slot_cache_active) {
10062306a36Sopenharmony_ci		if (pages > num_online_cpus() *
10162306a36Sopenharmony_ci		    THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE)
10262306a36Sopenharmony_ci			reactivate_swap_slots_cache();
10362306a36Sopenharmony_ci		goto out;
10462306a36Sopenharmony_ci	}
10562306a36Sopenharmony_ci
10662306a36Sopenharmony_ci	/* if global pool of slot caches too low, deactivate cache */
10762306a36Sopenharmony_ci	if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE)
10862306a36Sopenharmony_ci		deactivate_swap_slots_cache();
10962306a36Sopenharmony_ciout:
11062306a36Sopenharmony_ci	return swap_slot_cache_active;
11162306a36Sopenharmony_ci}
11262306a36Sopenharmony_ci
/*
 * CPU hotplug "online" callback: set up @cpu's slot cache arrays.
 * Returns 0 on success (or if the cache already exists), -ENOMEM on
 * allocation failure.
 */
static int alloc_swap_slot_cache(unsigned int cpu)
{
	struct swap_slots_cache *cache;
	swp_entry_t *slots, *slots_ret;

	/*
	 * Do allocation outside swap_slots_cache_mutex
	 * as kvzalloc could trigger reclaim and folio_alloc_swap,
	 * which can lock swap_slots_cache_mutex.
	 */
	slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
			 GFP_KERNEL);
	if (!slots)
		return -ENOMEM;

	slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
			     GFP_KERNEL);
	if (!slots_ret) {
		kvfree(slots);
		return -ENOMEM;
	}

	mutex_lock(&swap_slots_cache_mutex);
	cache = &per_cpu(swp_slots, cpu);
	if (cache->slots || cache->slots_ret) {
		/* cache already allocated */
		mutex_unlock(&swap_slots_cache_mutex);

		kvfree(slots);
		kvfree(slots_ret);

		return 0;
	}

	/* Locks survive cpu offline/online cycles; init them only once. */
	if (!cache->lock_initialized) {
		mutex_init(&cache->alloc_lock);
		spin_lock_init(&cache->free_lock);
		cache->lock_initialized = true;
	}
	cache->nr = 0;
	cache->cur = 0;
	cache->n_ret = 0;
	/*
	 * We initialized alloc_lock and free_lock earlier.  We use
	 * !cache->slots or !cache->slots_ret to know if it is safe to acquire
	 * the corresponding lock and use the cache.  Memory barrier below
	 * ensures the assumption.
	 */
	mb();
	cache->slots = slots;
	cache->slots_ret = slots_ret;
	mutex_unlock(&swap_slots_cache_mutex);
	return 0;
}
16762306a36Sopenharmony_ci
/*
 * Flush @cpu's cached slots back to the global pool.  @type selects
 * which side(s) to drain: SLOTS_CACHE (allocation side, under
 * alloc_lock) and/or SLOTS_CACHE_RET (return side, under free_lock).
 * When @free_slots is true (cpu offline path) the backing arrays are
 * freed as well.
 */
static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
				  bool free_slots)
{
	struct swap_slots_cache *cache;
	swp_entry_t *slots = NULL;

	cache = &per_cpu(swp_slots, cpu);
	if ((type & SLOTS_CACHE) && cache->slots) {
		mutex_lock(&cache->alloc_lock);
		/* only the [cur, cur+nr) range still holds unconsumed slots */
		swapcache_free_entries(cache->slots + cache->cur, cache->nr);
		cache->cur = 0;
		cache->nr = 0;
		if (free_slots && cache->slots) {
			kvfree(cache->slots);
			cache->slots = NULL;
		}
		mutex_unlock(&cache->alloc_lock);
	}
	if ((type & SLOTS_CACHE_RET) && cache->slots_ret) {
		spin_lock_irq(&cache->free_lock);
		swapcache_free_entries(cache->slots_ret, cache->n_ret);
		cache->n_ret = 0;
		if (free_slots && cache->slots_ret) {
			slots = cache->slots_ret;
			cache->slots_ret = NULL;
		}
		spin_unlock_irq(&cache->free_lock);
		/* kvfree may sleep; must be called outside the spinlock */
		kvfree(slots);
	}
}
19862306a36Sopenharmony_ci
/* Drain the selected cache side(s) (@type bits) on every online cpu. */
static void __drain_swap_slots_cache(unsigned int type)
{
	unsigned int cpu;

	/*
	 * This function is called during
	 *	1) swapoff, when we have to make sure no
	 *	   left over slots are in cache when we remove
	 *	   a swap device;
	 *      2) disabling of swap slot cache, when we run low
	 *	   on swap slots when allocating memory and need
	 *	   to return swap slots to global pool.
	 *
	 * We cannot acquire cpu hot plug lock here as
	 * this function can be invoked in the cpu
	 * hot plug path:
	 * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback
	 *   -> memory allocation -> direct reclaim -> folio_alloc_swap
	 *   -> drain_swap_slots_cache
	 *
	 * Hence the loop over current online cpu below could miss cpu that
	 * is being brought online but not yet marked as online.
	 * That is okay as we do not schedule and run anything on a
	 * cpu before it has been marked online. Hence, we will not
	 * fill any swap slots in slots cache of such cpu.
	 * There are no slots on such cpu that need to be drained.
	 */
	for_each_online_cpu(cpu)
		drain_slots_cache_cpu(cpu, type, false);
}
22962306a36Sopenharmony_ci
/*
 * CPU hotplug "offline" callback: drain both cache sides for @cpu and
 * free their backing arrays.  Always returns 0.
 */
static int free_slot_cache(unsigned int cpu)
{
	mutex_lock(&swap_slots_cache_mutex);
	drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true);
	mutex_unlock(&swap_slots_cache_mutex);
	return 0;
}
23762306a36Sopenharmony_ci
/*
 * Enable the swap slots cache.  On first use, register the cpu hotplug
 * callbacks that allocate/free the per-cpu caches; then (re)enable the
 * cache if usable swap exists.  On callback registration failure we
 * warn once and run without the cache.
 */
void enable_swap_slots_cache(void)
{
	mutex_lock(&swap_slots_cache_enable_mutex);
	if (!swap_slot_cache_initialized) {
		int ret;

		ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
					alloc_swap_slot_cache, free_slot_cache);
		if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
				       "without swap slots cache.\n", __func__))
			goto out_unlock;

		swap_slot_cache_initialized = true;
	}

	__reenable_swap_slots_cache();
out_unlock:
	mutex_unlock(&swap_slots_cache_enable_mutex);
}
25762306a36Sopenharmony_ci
/* called with swap slot cache's alloc lock held */
/*
 * Refill the allocation-side cache from the global pool.  Returns the
 * number of slots obtained (0 when the cache is unusable or the pool
 * has none).  The active flag is re-read just before get_swap_pages()
 * since it may be cleared concurrently.
 */
static int refill_swap_slots_cache(struct swap_slots_cache *cache)
{
	if (!use_swap_slot_cache)
		return 0;

	cache->cur = 0;
	if (swap_slot_cache_active)
		cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
					   cache->slots, 1);

	return cache->nr;
}
27162306a36Sopenharmony_ci
/*
 * Return one swap entry.  Fast path batches it into this cpu's
 * slots_ret cache (flushing the whole batch to the global pool when
 * full); otherwise the entry is freed to the global pool directly.
 */
void free_swap_slot(swp_entry_t entry)
{
	struct swap_slots_cache *cache;

	/* preemption is fine: the cache is protected by free_lock below */
	cache = raw_cpu_ptr(&swp_slots);
	if (likely(use_swap_slot_cache && cache->slots_ret)) {
		spin_lock_irq(&cache->free_lock);
		/* Swap slots cache may be deactivated before acquiring lock */
		if (!use_swap_slot_cache || !cache->slots_ret) {
			spin_unlock_irq(&cache->free_lock);
			goto direct_free;
		}
		if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) {
			/*
			 * Return slots to global pool.
			 * The current swap_map value is SWAP_HAS_CACHE.
			 * Set it to 0 to indicate it is available for
			 * allocation in global pool
			 */
			swapcache_free_entries(cache->slots_ret, cache->n_ret);
			cache->n_ret = 0;
		}
		cache->slots_ret[cache->n_ret++] = entry;
		spin_unlock_irq(&cache->free_lock);
	} else {
direct_free:
		swapcache_free_entries(&entry, 1);
	}
}
30162306a36Sopenharmony_ci
/*
 * Allocate a swap entry for @folio.  Order-0 folios are served from the
 * per-cpu slot cache when it is active; large folios and cache misses
 * go straight to the global pool.  Returns an entry with val == 0 on
 * failure (no swap space, or the memcg swap charge failed).
 */
swp_entry_t folio_alloc_swap(struct folio *folio)
{
	swp_entry_t entry;
	struct swap_slots_cache *cache;

	entry.val = 0;

	/* large folios bypass the cache: slot caches hold order-0 entries */
	if (folio_test_large(folio)) {
		if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported())
			get_swap_pages(1, &entry, folio_nr_pages(folio));
		goto out;
	}

	/*
	 * Preemption is allowed here, because we may sleep
	 * in refill_swap_slots_cache().  But it is safe, because
	 * accesses to the per-CPU data structure are protected by the
	 * mutex cache->alloc_lock.
	 *
	 * The alloc path here does not touch cache->slots_ret
	 * so cache->free_lock is not taken.
	 */
	cache = raw_cpu_ptr(&swp_slots);

	if (likely(check_cache_active() && cache->slots)) {
		mutex_lock(&cache->alloc_lock);
		/* re-check under the lock: cache may have been freed */
		if (cache->slots) {
repeat:
			if (cache->nr) {
				/* consume the next cached slot */
				entry = cache->slots[cache->cur];
				cache->slots[cache->cur++].val = 0;
				cache->nr--;
			} else if (refill_swap_slots_cache(cache)) {
				goto repeat;
			}
		}
		mutex_unlock(&cache->alloc_lock);
		if (entry.val)
			goto out;
	}

	/* slow path: allocate a single slot from the global pool */
	get_swap_pages(1, &entry, 1);
out:
	if (mem_cgroup_try_charge_swap(folio, entry)) {
		/* charge failed: give the slot back and report failure */
		put_swap_folio(folio, entry);
		entry.val = 0;
	}
	return entry;
}
351