// SPDX-License-Identifier: GPL-2.0
/*
 * Manage cache of swap slots to be used for and returned from
 * swap.
 *
 * Copyright(c) 2016 Intel Corporation.
 *
 * Author: Tim Chen <tim.c.chen@linux.intel.com>
 *
 * We allocate the swap slots from the global pool and put
 * them into local per cpu caches.  This has the advantage
 * of not needing to acquire the swap_info lock every time
 * we need a new slot.
 *
 * There is also an opportunity to simply return the slot
 * to local caches without needing to acquire the swap_info
 * lock.  We do not reuse the returned slots directly but
 * move them back to the global pool in a batch.  This
 * allows the slots to coalesce and reduces fragmentation.
 *
 * The swap entry allocated is marked with SWAP_HAS_CACHE
 * flag in map_count that prevents it from being allocated
 * again from the global pool.
 *
 * The swap slots cache is protected by a mutex instead of
 * a spin lock as when we search for slots with scan_swap_map,
 * we can possibly sleep.
2862306a36Sopenharmony_ci */ 2962306a36Sopenharmony_ci 3062306a36Sopenharmony_ci#include <linux/swap_slots.h> 3162306a36Sopenharmony_ci#include <linux/cpu.h> 3262306a36Sopenharmony_ci#include <linux/cpumask.h> 3362306a36Sopenharmony_ci#include <linux/slab.h> 3462306a36Sopenharmony_ci#include <linux/vmalloc.h> 3562306a36Sopenharmony_ci#include <linux/mutex.h> 3662306a36Sopenharmony_ci#include <linux/mm.h> 3762306a36Sopenharmony_ci 3862306a36Sopenharmony_cistatic DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); 3962306a36Sopenharmony_cistatic bool swap_slot_cache_active; 4062306a36Sopenharmony_cibool swap_slot_cache_enabled; 4162306a36Sopenharmony_cistatic bool swap_slot_cache_initialized; 4262306a36Sopenharmony_cistatic DEFINE_MUTEX(swap_slots_cache_mutex); 4362306a36Sopenharmony_ci/* Serialize swap slots cache enable/disable operations */ 4462306a36Sopenharmony_cistatic DEFINE_MUTEX(swap_slots_cache_enable_mutex); 4562306a36Sopenharmony_ci 4662306a36Sopenharmony_cistatic void __drain_swap_slots_cache(unsigned int type); 4762306a36Sopenharmony_ci 4862306a36Sopenharmony_ci#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled) 4962306a36Sopenharmony_ci#define SLOTS_CACHE 0x1 5062306a36Sopenharmony_ci#define SLOTS_CACHE_RET 0x2 5162306a36Sopenharmony_ci 5262306a36Sopenharmony_cistatic void deactivate_swap_slots_cache(void) 5362306a36Sopenharmony_ci{ 5462306a36Sopenharmony_ci mutex_lock(&swap_slots_cache_mutex); 5562306a36Sopenharmony_ci swap_slot_cache_active = false; 5662306a36Sopenharmony_ci __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); 5762306a36Sopenharmony_ci mutex_unlock(&swap_slots_cache_mutex); 5862306a36Sopenharmony_ci} 5962306a36Sopenharmony_ci 6062306a36Sopenharmony_cistatic void reactivate_swap_slots_cache(void) 6162306a36Sopenharmony_ci{ 6262306a36Sopenharmony_ci mutex_lock(&swap_slots_cache_mutex); 6362306a36Sopenharmony_ci swap_slot_cache_active = true; 6462306a36Sopenharmony_ci mutex_unlock(&swap_slots_cache_mutex); 
6562306a36Sopenharmony_ci} 6662306a36Sopenharmony_ci 6762306a36Sopenharmony_ci/* Must not be called with cpu hot plug lock */ 6862306a36Sopenharmony_civoid disable_swap_slots_cache_lock(void) 6962306a36Sopenharmony_ci{ 7062306a36Sopenharmony_ci mutex_lock(&swap_slots_cache_enable_mutex); 7162306a36Sopenharmony_ci swap_slot_cache_enabled = false; 7262306a36Sopenharmony_ci if (swap_slot_cache_initialized) { 7362306a36Sopenharmony_ci /* serialize with cpu hotplug operations */ 7462306a36Sopenharmony_ci cpus_read_lock(); 7562306a36Sopenharmony_ci __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); 7662306a36Sopenharmony_ci cpus_read_unlock(); 7762306a36Sopenharmony_ci } 7862306a36Sopenharmony_ci} 7962306a36Sopenharmony_ci 8062306a36Sopenharmony_cistatic void __reenable_swap_slots_cache(void) 8162306a36Sopenharmony_ci{ 8262306a36Sopenharmony_ci swap_slot_cache_enabled = has_usable_swap(); 8362306a36Sopenharmony_ci} 8462306a36Sopenharmony_ci 8562306a36Sopenharmony_civoid reenable_swap_slots_cache_unlock(void) 8662306a36Sopenharmony_ci{ 8762306a36Sopenharmony_ci __reenable_swap_slots_cache(); 8862306a36Sopenharmony_ci mutex_unlock(&swap_slots_cache_enable_mutex); 8962306a36Sopenharmony_ci} 9062306a36Sopenharmony_ci 9162306a36Sopenharmony_cistatic bool check_cache_active(void) 9262306a36Sopenharmony_ci{ 9362306a36Sopenharmony_ci long pages; 9462306a36Sopenharmony_ci 9562306a36Sopenharmony_ci if (!swap_slot_cache_enabled) 9662306a36Sopenharmony_ci return false; 9762306a36Sopenharmony_ci 9862306a36Sopenharmony_ci pages = get_nr_swap_pages(); 9962306a36Sopenharmony_ci if (!swap_slot_cache_active) { 10062306a36Sopenharmony_ci if (pages > num_online_cpus() * 10162306a36Sopenharmony_ci THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE) 10262306a36Sopenharmony_ci reactivate_swap_slots_cache(); 10362306a36Sopenharmony_ci goto out; 10462306a36Sopenharmony_ci } 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_ci /* if global pool of slot caches too low, deactivate cache */ 
10762306a36Sopenharmony_ci if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE) 10862306a36Sopenharmony_ci deactivate_swap_slots_cache(); 10962306a36Sopenharmony_ciout: 11062306a36Sopenharmony_ci return swap_slot_cache_active; 11162306a36Sopenharmony_ci} 11262306a36Sopenharmony_ci 11362306a36Sopenharmony_cistatic int alloc_swap_slot_cache(unsigned int cpu) 11462306a36Sopenharmony_ci{ 11562306a36Sopenharmony_ci struct swap_slots_cache *cache; 11662306a36Sopenharmony_ci swp_entry_t *slots, *slots_ret; 11762306a36Sopenharmony_ci 11862306a36Sopenharmony_ci /* 11962306a36Sopenharmony_ci * Do allocation outside swap_slots_cache_mutex 12062306a36Sopenharmony_ci * as kvzalloc could trigger reclaim and folio_alloc_swap, 12162306a36Sopenharmony_ci * which can lock swap_slots_cache_mutex. 12262306a36Sopenharmony_ci */ 12362306a36Sopenharmony_ci slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), 12462306a36Sopenharmony_ci GFP_KERNEL); 12562306a36Sopenharmony_ci if (!slots) 12662306a36Sopenharmony_ci return -ENOMEM; 12762306a36Sopenharmony_ci 12862306a36Sopenharmony_ci slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), 12962306a36Sopenharmony_ci GFP_KERNEL); 13062306a36Sopenharmony_ci if (!slots_ret) { 13162306a36Sopenharmony_ci kvfree(slots); 13262306a36Sopenharmony_ci return -ENOMEM; 13362306a36Sopenharmony_ci } 13462306a36Sopenharmony_ci 13562306a36Sopenharmony_ci mutex_lock(&swap_slots_cache_mutex); 13662306a36Sopenharmony_ci cache = &per_cpu(swp_slots, cpu); 13762306a36Sopenharmony_ci if (cache->slots || cache->slots_ret) { 13862306a36Sopenharmony_ci /* cache already allocated */ 13962306a36Sopenharmony_ci mutex_unlock(&swap_slots_cache_mutex); 14062306a36Sopenharmony_ci 14162306a36Sopenharmony_ci kvfree(slots); 14262306a36Sopenharmony_ci kvfree(slots_ret); 14362306a36Sopenharmony_ci 14462306a36Sopenharmony_ci return 0; 14562306a36Sopenharmony_ci } 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_ci if (!cache->lock_initialized) 
{ 14862306a36Sopenharmony_ci mutex_init(&cache->alloc_lock); 14962306a36Sopenharmony_ci spin_lock_init(&cache->free_lock); 15062306a36Sopenharmony_ci cache->lock_initialized = true; 15162306a36Sopenharmony_ci } 15262306a36Sopenharmony_ci cache->nr = 0; 15362306a36Sopenharmony_ci cache->cur = 0; 15462306a36Sopenharmony_ci cache->n_ret = 0; 15562306a36Sopenharmony_ci /* 15662306a36Sopenharmony_ci * We initialized alloc_lock and free_lock earlier. We use 15762306a36Sopenharmony_ci * !cache->slots or !cache->slots_ret to know if it is safe to acquire 15862306a36Sopenharmony_ci * the corresponding lock and use the cache. Memory barrier below 15962306a36Sopenharmony_ci * ensures the assumption. 16062306a36Sopenharmony_ci */ 16162306a36Sopenharmony_ci mb(); 16262306a36Sopenharmony_ci cache->slots = slots; 16362306a36Sopenharmony_ci cache->slots_ret = slots_ret; 16462306a36Sopenharmony_ci mutex_unlock(&swap_slots_cache_mutex); 16562306a36Sopenharmony_ci return 0; 16662306a36Sopenharmony_ci} 16762306a36Sopenharmony_ci 16862306a36Sopenharmony_cistatic void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, 16962306a36Sopenharmony_ci bool free_slots) 17062306a36Sopenharmony_ci{ 17162306a36Sopenharmony_ci struct swap_slots_cache *cache; 17262306a36Sopenharmony_ci swp_entry_t *slots = NULL; 17362306a36Sopenharmony_ci 17462306a36Sopenharmony_ci cache = &per_cpu(swp_slots, cpu); 17562306a36Sopenharmony_ci if ((type & SLOTS_CACHE) && cache->slots) { 17662306a36Sopenharmony_ci mutex_lock(&cache->alloc_lock); 17762306a36Sopenharmony_ci swapcache_free_entries(cache->slots + cache->cur, cache->nr); 17862306a36Sopenharmony_ci cache->cur = 0; 17962306a36Sopenharmony_ci cache->nr = 0; 18062306a36Sopenharmony_ci if (free_slots && cache->slots) { 18162306a36Sopenharmony_ci kvfree(cache->slots); 18262306a36Sopenharmony_ci cache->slots = NULL; 18362306a36Sopenharmony_ci } 18462306a36Sopenharmony_ci mutex_unlock(&cache->alloc_lock); 18562306a36Sopenharmony_ci } 
18662306a36Sopenharmony_ci if ((type & SLOTS_CACHE_RET) && cache->slots_ret) { 18762306a36Sopenharmony_ci spin_lock_irq(&cache->free_lock); 18862306a36Sopenharmony_ci swapcache_free_entries(cache->slots_ret, cache->n_ret); 18962306a36Sopenharmony_ci cache->n_ret = 0; 19062306a36Sopenharmony_ci if (free_slots && cache->slots_ret) { 19162306a36Sopenharmony_ci slots = cache->slots_ret; 19262306a36Sopenharmony_ci cache->slots_ret = NULL; 19362306a36Sopenharmony_ci } 19462306a36Sopenharmony_ci spin_unlock_irq(&cache->free_lock); 19562306a36Sopenharmony_ci kvfree(slots); 19662306a36Sopenharmony_ci } 19762306a36Sopenharmony_ci} 19862306a36Sopenharmony_ci 19962306a36Sopenharmony_cistatic void __drain_swap_slots_cache(unsigned int type) 20062306a36Sopenharmony_ci{ 20162306a36Sopenharmony_ci unsigned int cpu; 20262306a36Sopenharmony_ci 20362306a36Sopenharmony_ci /* 20462306a36Sopenharmony_ci * This function is called during 20562306a36Sopenharmony_ci * 1) swapoff, when we have to make sure no 20662306a36Sopenharmony_ci * left over slots are in cache when we remove 20762306a36Sopenharmony_ci * a swap device; 20862306a36Sopenharmony_ci * 2) disabling of swap slot cache, when we run low 20962306a36Sopenharmony_ci * on swap slots when allocating memory and need 21062306a36Sopenharmony_ci * to return swap slots to global pool. 21162306a36Sopenharmony_ci * 21262306a36Sopenharmony_ci * We cannot acquire cpu hot plug lock here as 21362306a36Sopenharmony_ci * this function can be invoked in the cpu 21462306a36Sopenharmony_ci * hot plug path: 21562306a36Sopenharmony_ci * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback 21662306a36Sopenharmony_ci * -> memory allocation -> direct reclaim -> folio_alloc_swap 21762306a36Sopenharmony_ci * -> drain_swap_slots_cache 21862306a36Sopenharmony_ci * 21962306a36Sopenharmony_ci * Hence the loop over current online cpu below could miss cpu that 22062306a36Sopenharmony_ci * is being brought online but not yet marked as online. 
22162306a36Sopenharmony_ci * That is okay as we do not schedule and run anything on a 22262306a36Sopenharmony_ci * cpu before it has been marked online. Hence, we will not 22362306a36Sopenharmony_ci * fill any swap slots in slots cache of such cpu. 22462306a36Sopenharmony_ci * There are no slots on such cpu that need to be drained. 22562306a36Sopenharmony_ci */ 22662306a36Sopenharmony_ci for_each_online_cpu(cpu) 22762306a36Sopenharmony_ci drain_slots_cache_cpu(cpu, type, false); 22862306a36Sopenharmony_ci} 22962306a36Sopenharmony_ci 23062306a36Sopenharmony_cistatic int free_slot_cache(unsigned int cpu) 23162306a36Sopenharmony_ci{ 23262306a36Sopenharmony_ci mutex_lock(&swap_slots_cache_mutex); 23362306a36Sopenharmony_ci drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true); 23462306a36Sopenharmony_ci mutex_unlock(&swap_slots_cache_mutex); 23562306a36Sopenharmony_ci return 0; 23662306a36Sopenharmony_ci} 23762306a36Sopenharmony_ci 23862306a36Sopenharmony_civoid enable_swap_slots_cache(void) 23962306a36Sopenharmony_ci{ 24062306a36Sopenharmony_ci mutex_lock(&swap_slots_cache_enable_mutex); 24162306a36Sopenharmony_ci if (!swap_slot_cache_initialized) { 24262306a36Sopenharmony_ci int ret; 24362306a36Sopenharmony_ci 24462306a36Sopenharmony_ci ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", 24562306a36Sopenharmony_ci alloc_swap_slot_cache, free_slot_cache); 24662306a36Sopenharmony_ci if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating " 24762306a36Sopenharmony_ci "without swap slots cache.\n", __func__)) 24862306a36Sopenharmony_ci goto out_unlock; 24962306a36Sopenharmony_ci 25062306a36Sopenharmony_ci swap_slot_cache_initialized = true; 25162306a36Sopenharmony_ci } 25262306a36Sopenharmony_ci 25362306a36Sopenharmony_ci __reenable_swap_slots_cache(); 25462306a36Sopenharmony_ciout_unlock: 25562306a36Sopenharmony_ci mutex_unlock(&swap_slots_cache_enable_mutex); 25662306a36Sopenharmony_ci} 25762306a36Sopenharmony_ci 
25862306a36Sopenharmony_ci/* called with swap slot cache's alloc lock held */ 25962306a36Sopenharmony_cistatic int refill_swap_slots_cache(struct swap_slots_cache *cache) 26062306a36Sopenharmony_ci{ 26162306a36Sopenharmony_ci if (!use_swap_slot_cache) 26262306a36Sopenharmony_ci return 0; 26362306a36Sopenharmony_ci 26462306a36Sopenharmony_ci cache->cur = 0; 26562306a36Sopenharmony_ci if (swap_slot_cache_active) 26662306a36Sopenharmony_ci cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, 26762306a36Sopenharmony_ci cache->slots, 1); 26862306a36Sopenharmony_ci 26962306a36Sopenharmony_ci return cache->nr; 27062306a36Sopenharmony_ci} 27162306a36Sopenharmony_ci 27262306a36Sopenharmony_civoid free_swap_slot(swp_entry_t entry) 27362306a36Sopenharmony_ci{ 27462306a36Sopenharmony_ci struct swap_slots_cache *cache; 27562306a36Sopenharmony_ci 27662306a36Sopenharmony_ci cache = raw_cpu_ptr(&swp_slots); 27762306a36Sopenharmony_ci if (likely(use_swap_slot_cache && cache->slots_ret)) { 27862306a36Sopenharmony_ci spin_lock_irq(&cache->free_lock); 27962306a36Sopenharmony_ci /* Swap slots cache may be deactivated before acquiring lock */ 28062306a36Sopenharmony_ci if (!use_swap_slot_cache || !cache->slots_ret) { 28162306a36Sopenharmony_ci spin_unlock_irq(&cache->free_lock); 28262306a36Sopenharmony_ci goto direct_free; 28362306a36Sopenharmony_ci } 28462306a36Sopenharmony_ci if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) { 28562306a36Sopenharmony_ci /* 28662306a36Sopenharmony_ci * Return slots to global pool. 28762306a36Sopenharmony_ci * The current swap_map value is SWAP_HAS_CACHE. 
28862306a36Sopenharmony_ci * Set it to 0 to indicate it is available for 28962306a36Sopenharmony_ci * allocation in global pool 29062306a36Sopenharmony_ci */ 29162306a36Sopenharmony_ci swapcache_free_entries(cache->slots_ret, cache->n_ret); 29262306a36Sopenharmony_ci cache->n_ret = 0; 29362306a36Sopenharmony_ci } 29462306a36Sopenharmony_ci cache->slots_ret[cache->n_ret++] = entry; 29562306a36Sopenharmony_ci spin_unlock_irq(&cache->free_lock); 29662306a36Sopenharmony_ci } else { 29762306a36Sopenharmony_cidirect_free: 29862306a36Sopenharmony_ci swapcache_free_entries(&entry, 1); 29962306a36Sopenharmony_ci } 30062306a36Sopenharmony_ci} 30162306a36Sopenharmony_ci 30262306a36Sopenharmony_ciswp_entry_t folio_alloc_swap(struct folio *folio) 30362306a36Sopenharmony_ci{ 30462306a36Sopenharmony_ci swp_entry_t entry; 30562306a36Sopenharmony_ci struct swap_slots_cache *cache; 30662306a36Sopenharmony_ci 30762306a36Sopenharmony_ci entry.val = 0; 30862306a36Sopenharmony_ci 30962306a36Sopenharmony_ci if (folio_test_large(folio)) { 31062306a36Sopenharmony_ci if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported()) 31162306a36Sopenharmony_ci get_swap_pages(1, &entry, folio_nr_pages(folio)); 31262306a36Sopenharmony_ci goto out; 31362306a36Sopenharmony_ci } 31462306a36Sopenharmony_ci 31562306a36Sopenharmony_ci /* 31662306a36Sopenharmony_ci * Preemption is allowed here, because we may sleep 31762306a36Sopenharmony_ci * in refill_swap_slots_cache(). But it is safe, because 31862306a36Sopenharmony_ci * accesses to the per-CPU data structure are protected by the 31962306a36Sopenharmony_ci * mutex cache->alloc_lock. 32062306a36Sopenharmony_ci * 32162306a36Sopenharmony_ci * The alloc path here does not touch cache->slots_ret 32262306a36Sopenharmony_ci * so cache->free_lock is not taken. 
32362306a36Sopenharmony_ci */ 32462306a36Sopenharmony_ci cache = raw_cpu_ptr(&swp_slots); 32562306a36Sopenharmony_ci 32662306a36Sopenharmony_ci if (likely(check_cache_active() && cache->slots)) { 32762306a36Sopenharmony_ci mutex_lock(&cache->alloc_lock); 32862306a36Sopenharmony_ci if (cache->slots) { 32962306a36Sopenharmony_cirepeat: 33062306a36Sopenharmony_ci if (cache->nr) { 33162306a36Sopenharmony_ci entry = cache->slots[cache->cur]; 33262306a36Sopenharmony_ci cache->slots[cache->cur++].val = 0; 33362306a36Sopenharmony_ci cache->nr--; 33462306a36Sopenharmony_ci } else if (refill_swap_slots_cache(cache)) { 33562306a36Sopenharmony_ci goto repeat; 33662306a36Sopenharmony_ci } 33762306a36Sopenharmony_ci } 33862306a36Sopenharmony_ci mutex_unlock(&cache->alloc_lock); 33962306a36Sopenharmony_ci if (entry.val) 34062306a36Sopenharmony_ci goto out; 34162306a36Sopenharmony_ci } 34262306a36Sopenharmony_ci 34362306a36Sopenharmony_ci get_swap_pages(1, &entry, 1); 34462306a36Sopenharmony_ciout: 34562306a36Sopenharmony_ci if (mem_cgroup_try_charge_swap(folio, entry)) { 34662306a36Sopenharmony_ci put_swap_folio(folio, entry); 34762306a36Sopenharmony_ci entry.val = 0; 34862306a36Sopenharmony_ci } 34962306a36Sopenharmony_ci return entry; 35062306a36Sopenharmony_ci} 351