18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * linux/mm/swapfile.c 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 68c2ecf20Sopenharmony_ci * Swap reorganised 29.12.95, Stephen Tweedie 78c2ecf20Sopenharmony_ci */ 88c2ecf20Sopenharmony_ci 98c2ecf20Sopenharmony_ci#include <linux/mm.h> 108c2ecf20Sopenharmony_ci#include <linux/sched/mm.h> 118c2ecf20Sopenharmony_ci#include <linux/sched/task.h> 128c2ecf20Sopenharmony_ci#include <linux/hugetlb.h> 138c2ecf20Sopenharmony_ci#include <linux/mman.h> 148c2ecf20Sopenharmony_ci#include <linux/slab.h> 158c2ecf20Sopenharmony_ci#include <linux/kernel_stat.h> 168c2ecf20Sopenharmony_ci#include <linux/swap.h> 178c2ecf20Sopenharmony_ci#include <linux/vmalloc.h> 188c2ecf20Sopenharmony_ci#include <linux/pagemap.h> 198c2ecf20Sopenharmony_ci#include <linux/namei.h> 208c2ecf20Sopenharmony_ci#include <linux/shmem_fs.h> 218c2ecf20Sopenharmony_ci#include <linux/blkdev.h> 228c2ecf20Sopenharmony_ci#include <linux/random.h> 238c2ecf20Sopenharmony_ci#include <linux/writeback.h> 248c2ecf20Sopenharmony_ci#include <linux/proc_fs.h> 258c2ecf20Sopenharmony_ci#include <linux/seq_file.h> 268c2ecf20Sopenharmony_ci#include <linux/init.h> 278c2ecf20Sopenharmony_ci#include <linux/ksm.h> 288c2ecf20Sopenharmony_ci#include <linux/rmap.h> 298c2ecf20Sopenharmony_ci#include <linux/security.h> 308c2ecf20Sopenharmony_ci#include <linux/backing-dev.h> 318c2ecf20Sopenharmony_ci#include <linux/mutex.h> 328c2ecf20Sopenharmony_ci#include <linux/capability.h> 338c2ecf20Sopenharmony_ci#include <linux/syscalls.h> 348c2ecf20Sopenharmony_ci#include <linux/memcontrol.h> 358c2ecf20Sopenharmony_ci#include <linux/poll.h> 368c2ecf20Sopenharmony_ci#include <linux/oom.h> 378c2ecf20Sopenharmony_ci#include <linux/frontswap.h> 388c2ecf20Sopenharmony_ci#include <linux/swapfile.h> 398c2ecf20Sopenharmony_ci#include <linux/export.h> 
408c2ecf20Sopenharmony_ci#include <linux/swap_slots.h> 418c2ecf20Sopenharmony_ci#include <linux/sort.h> 428c2ecf20Sopenharmony_ci 438c2ecf20Sopenharmony_ci#include <asm/tlbflush.h> 448c2ecf20Sopenharmony_ci#include <linux/swapops.h> 458c2ecf20Sopenharmony_ci#include <linux/swap_cgroup.h> 468c2ecf20Sopenharmony_ci#include <linux/zswapd.h> 478c2ecf20Sopenharmony_ci 488c2ecf20Sopenharmony_cistatic bool swap_count_continued(struct swap_info_struct *, pgoff_t, 498c2ecf20Sopenharmony_ci unsigned char); 508c2ecf20Sopenharmony_cistatic void free_swap_count_continuations(struct swap_info_struct *); 518c2ecf20Sopenharmony_cistatic sector_t map_swap_entry(swp_entry_t, struct block_device**); 528c2ecf20Sopenharmony_ci 538c2ecf20Sopenharmony_ciDEFINE_SPINLOCK(swap_lock); 548c2ecf20Sopenharmony_cistatic unsigned int nr_swapfiles; 558c2ecf20Sopenharmony_ciatomic_long_t nr_swap_pages; 568c2ecf20Sopenharmony_ci/* 578c2ecf20Sopenharmony_ci * Some modules use swappable objects and may try to swap them out under 588c2ecf20Sopenharmony_ci * memory pressure (via the shrinker). Before doing so, they may wish to 598c2ecf20Sopenharmony_ci * check to see if any swap space is available. 608c2ecf20Sopenharmony_ci */ 618c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(nr_swap_pages); 628c2ecf20Sopenharmony_ci/* protected with swap_lock. 
reading in vm_swap_full() doesn't need lock */ 638c2ecf20Sopenharmony_cilong total_swap_pages; 648c2ecf20Sopenharmony_cistatic int least_priority = -1; 658c2ecf20Sopenharmony_ci 668c2ecf20Sopenharmony_cistatic const char Bad_file[] = "Bad swap file entry "; 678c2ecf20Sopenharmony_cistatic const char Unused_file[] = "Unused swap file entry "; 688c2ecf20Sopenharmony_cistatic const char Bad_offset[] = "Bad swap offset entry "; 698c2ecf20Sopenharmony_cistatic const char Unused_offset[] = "Unused swap offset entry "; 708c2ecf20Sopenharmony_ci 718c2ecf20Sopenharmony_ci/* 728c2ecf20Sopenharmony_ci * all active swap_info_structs 738c2ecf20Sopenharmony_ci * protected with swap_lock, and ordered by priority. 748c2ecf20Sopenharmony_ci */ 758c2ecf20Sopenharmony_ciPLIST_HEAD(swap_active_head); 768c2ecf20Sopenharmony_ci 778c2ecf20Sopenharmony_ci/* 788c2ecf20Sopenharmony_ci * all available (active, not full) swap_info_structs 798c2ecf20Sopenharmony_ci * protected with swap_avail_lock, ordered by priority. 808c2ecf20Sopenharmony_ci * This is used by get_swap_page() instead of swap_active_head 818c2ecf20Sopenharmony_ci * because swap_active_head includes all swap_info_structs, 828c2ecf20Sopenharmony_ci * but get_swap_page() doesn't need to look at full ones. 838c2ecf20Sopenharmony_ci * This uses its own lock instead of swap_lock because when a 848c2ecf20Sopenharmony_ci * swap_info_struct changes between not-full/full, it needs to 858c2ecf20Sopenharmony_ci * add/remove itself to/from this list, but the swap_info_struct->lock 868c2ecf20Sopenharmony_ci * is held and the locking order requires swap_lock to be taken 878c2ecf20Sopenharmony_ci * before any swap_info_struct->lock. 
888c2ecf20Sopenharmony_ci */ 898c2ecf20Sopenharmony_cistatic struct plist_head *swap_avail_heads; 908c2ecf20Sopenharmony_cistatic DEFINE_SPINLOCK(swap_avail_lock); 918c2ecf20Sopenharmony_ci 928c2ecf20Sopenharmony_cistruct swap_info_struct *swap_info[MAX_SWAPFILES]; 938c2ecf20Sopenharmony_ci 948c2ecf20Sopenharmony_cistatic DEFINE_MUTEX(swapon_mutex); 958c2ecf20Sopenharmony_ci 968c2ecf20Sopenharmony_cistatic DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); 978c2ecf20Sopenharmony_ci/* Activity counter to indicate that a swapon or swapoff has occurred */ 988c2ecf20Sopenharmony_cistatic atomic_t proc_poll_event = ATOMIC_INIT(0); 998c2ecf20Sopenharmony_ci 1008c2ecf20Sopenharmony_ciatomic_t nr_rotate_swap = ATOMIC_INIT(0); 1018c2ecf20Sopenharmony_ci 1028c2ecf20Sopenharmony_cistatic struct swap_info_struct *swap_type_to_swap_info(int type) 1038c2ecf20Sopenharmony_ci{ 1048c2ecf20Sopenharmony_ci if (type >= READ_ONCE(nr_swapfiles)) 1058c2ecf20Sopenharmony_ci return NULL; 1068c2ecf20Sopenharmony_ci 1078c2ecf20Sopenharmony_ci smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. 
*/ 1088c2ecf20Sopenharmony_ci return READ_ONCE(swap_info[type]); 1098c2ecf20Sopenharmony_ci} 1108c2ecf20Sopenharmony_ci 1118c2ecf20Sopenharmony_cistatic inline unsigned char swap_count(unsigned char ent) 1128c2ecf20Sopenharmony_ci{ 1138c2ecf20Sopenharmony_ci return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ 1148c2ecf20Sopenharmony_ci} 1158c2ecf20Sopenharmony_ci 1168c2ecf20Sopenharmony_ci/* Reclaim the swap entry anyway if possible */ 1178c2ecf20Sopenharmony_ci#define TTRS_ANYWAY 0x1 1188c2ecf20Sopenharmony_ci/* 1198c2ecf20Sopenharmony_ci * Reclaim the swap entry if there are no more mappings of the 1208c2ecf20Sopenharmony_ci * corresponding page 1218c2ecf20Sopenharmony_ci */ 1228c2ecf20Sopenharmony_ci#define TTRS_UNMAPPED 0x2 1238c2ecf20Sopenharmony_ci/* Reclaim the swap entry if swap is getting full*/ 1248c2ecf20Sopenharmony_ci#define TTRS_FULL 0x4 1258c2ecf20Sopenharmony_ci 1268c2ecf20Sopenharmony_ci/* returns 1 if swap entry is freed */ 1278c2ecf20Sopenharmony_cistatic int __try_to_reclaim_swap(struct swap_info_struct *si, 1288c2ecf20Sopenharmony_ci unsigned long offset, unsigned long flags) 1298c2ecf20Sopenharmony_ci{ 1308c2ecf20Sopenharmony_ci swp_entry_t entry = swp_entry(si->type, offset); 1318c2ecf20Sopenharmony_ci struct page *page; 1328c2ecf20Sopenharmony_ci int ret = 0; 1338c2ecf20Sopenharmony_ci 1348c2ecf20Sopenharmony_ci page = find_get_page(swap_address_space(entry), offset); 1358c2ecf20Sopenharmony_ci if (!page) 1368c2ecf20Sopenharmony_ci return 0; 1378c2ecf20Sopenharmony_ci /* 1388c2ecf20Sopenharmony_ci * When this function is called from scan_swap_map_slots() and it's 1398c2ecf20Sopenharmony_ci * called by vmscan.c at reclaiming pages. So, we hold a lock on a page, 1408c2ecf20Sopenharmony_ci * here. We have to use trylock for avoiding deadlock. This is a special 1418c2ecf20Sopenharmony_ci * case and you should use try_to_free_swap() with explicit lock_page() 1428c2ecf20Sopenharmony_ci * in usual operations. 
1438c2ecf20Sopenharmony_ci */ 1448c2ecf20Sopenharmony_ci if (trylock_page(page)) { 1458c2ecf20Sopenharmony_ci if ((flags & TTRS_ANYWAY) || 1468c2ecf20Sopenharmony_ci ((flags & TTRS_UNMAPPED) && !page_mapped(page)) || 1478c2ecf20Sopenharmony_ci ((flags & TTRS_FULL) && mem_cgroup_swap_full(page))) 1488c2ecf20Sopenharmony_ci ret = try_to_free_swap(page); 1498c2ecf20Sopenharmony_ci unlock_page(page); 1508c2ecf20Sopenharmony_ci } 1518c2ecf20Sopenharmony_ci put_page(page); 1528c2ecf20Sopenharmony_ci return ret; 1538c2ecf20Sopenharmony_ci} 1548c2ecf20Sopenharmony_ci 1558c2ecf20Sopenharmony_cistatic inline struct swap_extent *first_se(struct swap_info_struct *sis) 1568c2ecf20Sopenharmony_ci{ 1578c2ecf20Sopenharmony_ci struct rb_node *rb = rb_first(&sis->swap_extent_root); 1588c2ecf20Sopenharmony_ci return rb_entry(rb, struct swap_extent, rb_node); 1598c2ecf20Sopenharmony_ci} 1608c2ecf20Sopenharmony_ci 1618c2ecf20Sopenharmony_cistatic inline struct swap_extent *next_se(struct swap_extent *se) 1628c2ecf20Sopenharmony_ci{ 1638c2ecf20Sopenharmony_ci struct rb_node *rb = rb_next(&se->rb_node); 1648c2ecf20Sopenharmony_ci return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL; 1658c2ecf20Sopenharmony_ci} 1668c2ecf20Sopenharmony_ci 1678c2ecf20Sopenharmony_ci/* 1688c2ecf20Sopenharmony_ci * swapon tell device that all the old swap contents can be discarded, 1698c2ecf20Sopenharmony_ci * to allow the swap device to optimize its wear-levelling. 1708c2ecf20Sopenharmony_ci */ 1718c2ecf20Sopenharmony_cistatic int discard_swap(struct swap_info_struct *si) 1728c2ecf20Sopenharmony_ci{ 1738c2ecf20Sopenharmony_ci struct swap_extent *se; 1748c2ecf20Sopenharmony_ci sector_t start_block; 1758c2ecf20Sopenharmony_ci sector_t nr_blocks; 1768c2ecf20Sopenharmony_ci int err = 0; 1778c2ecf20Sopenharmony_ci 1788c2ecf20Sopenharmony_ci /* Do not discard the swap header page! 
*/ 1798c2ecf20Sopenharmony_ci se = first_se(si); 1808c2ecf20Sopenharmony_ci start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); 1818c2ecf20Sopenharmony_ci nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); 1828c2ecf20Sopenharmony_ci if (nr_blocks) { 1838c2ecf20Sopenharmony_ci err = blkdev_issue_discard(si->bdev, start_block, 1848c2ecf20Sopenharmony_ci nr_blocks, GFP_KERNEL, 0); 1858c2ecf20Sopenharmony_ci if (err) 1868c2ecf20Sopenharmony_ci return err; 1878c2ecf20Sopenharmony_ci cond_resched(); 1888c2ecf20Sopenharmony_ci } 1898c2ecf20Sopenharmony_ci 1908c2ecf20Sopenharmony_ci for (se = next_se(se); se; se = next_se(se)) { 1918c2ecf20Sopenharmony_ci start_block = se->start_block << (PAGE_SHIFT - 9); 1928c2ecf20Sopenharmony_ci nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 1938c2ecf20Sopenharmony_ci 1948c2ecf20Sopenharmony_ci err = blkdev_issue_discard(si->bdev, start_block, 1958c2ecf20Sopenharmony_ci nr_blocks, GFP_KERNEL, 0); 1968c2ecf20Sopenharmony_ci if (err) 1978c2ecf20Sopenharmony_ci break; 1988c2ecf20Sopenharmony_ci 1998c2ecf20Sopenharmony_ci cond_resched(); 2008c2ecf20Sopenharmony_ci } 2018c2ecf20Sopenharmony_ci return err; /* That will often be -EOPNOTSUPP */ 2028c2ecf20Sopenharmony_ci} 2038c2ecf20Sopenharmony_ci 2048c2ecf20Sopenharmony_cistatic struct swap_extent * 2058c2ecf20Sopenharmony_cioffset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) 2068c2ecf20Sopenharmony_ci{ 2078c2ecf20Sopenharmony_ci struct swap_extent *se; 2088c2ecf20Sopenharmony_ci struct rb_node *rb; 2098c2ecf20Sopenharmony_ci 2108c2ecf20Sopenharmony_ci rb = sis->swap_extent_root.rb_node; 2118c2ecf20Sopenharmony_ci while (rb) { 2128c2ecf20Sopenharmony_ci se = rb_entry(rb, struct swap_extent, rb_node); 2138c2ecf20Sopenharmony_ci if (offset < se->start_page) 2148c2ecf20Sopenharmony_ci rb = rb->rb_left; 2158c2ecf20Sopenharmony_ci else if (offset >= se->start_page + se->nr_pages) 2168c2ecf20Sopenharmony_ci rb = rb->rb_right; 2178c2ecf20Sopenharmony_ci 
else 2188c2ecf20Sopenharmony_ci return se; 2198c2ecf20Sopenharmony_ci } 2208c2ecf20Sopenharmony_ci /* It *must* be present */ 2218c2ecf20Sopenharmony_ci BUG(); 2228c2ecf20Sopenharmony_ci} 2238c2ecf20Sopenharmony_ci 2248c2ecf20Sopenharmony_cisector_t swap_page_sector(struct page *page) 2258c2ecf20Sopenharmony_ci{ 2268c2ecf20Sopenharmony_ci struct swap_info_struct *sis = page_swap_info(page); 2278c2ecf20Sopenharmony_ci struct swap_extent *se; 2288c2ecf20Sopenharmony_ci sector_t sector; 2298c2ecf20Sopenharmony_ci pgoff_t offset; 2308c2ecf20Sopenharmony_ci 2318c2ecf20Sopenharmony_ci offset = __page_file_index(page); 2328c2ecf20Sopenharmony_ci se = offset_to_swap_extent(sis, offset); 2338c2ecf20Sopenharmony_ci sector = se->start_block + (offset - se->start_page); 2348c2ecf20Sopenharmony_ci return sector << (PAGE_SHIFT - 9); 2358c2ecf20Sopenharmony_ci} 2368c2ecf20Sopenharmony_ci 2378c2ecf20Sopenharmony_ci/* 2388c2ecf20Sopenharmony_ci * swap allocation tell device that a cluster of swap can now be discarded, 2398c2ecf20Sopenharmony_ci * to allow the swap device to optimize its wear-levelling. 
2408c2ecf20Sopenharmony_ci */ 2418c2ecf20Sopenharmony_cistatic void discard_swap_cluster(struct swap_info_struct *si, 2428c2ecf20Sopenharmony_ci pgoff_t start_page, pgoff_t nr_pages) 2438c2ecf20Sopenharmony_ci{ 2448c2ecf20Sopenharmony_ci struct swap_extent *se = offset_to_swap_extent(si, start_page); 2458c2ecf20Sopenharmony_ci 2468c2ecf20Sopenharmony_ci while (nr_pages) { 2478c2ecf20Sopenharmony_ci pgoff_t offset = start_page - se->start_page; 2488c2ecf20Sopenharmony_ci sector_t start_block = se->start_block + offset; 2498c2ecf20Sopenharmony_ci sector_t nr_blocks = se->nr_pages - offset; 2508c2ecf20Sopenharmony_ci 2518c2ecf20Sopenharmony_ci if (nr_blocks > nr_pages) 2528c2ecf20Sopenharmony_ci nr_blocks = nr_pages; 2538c2ecf20Sopenharmony_ci start_page += nr_blocks; 2548c2ecf20Sopenharmony_ci nr_pages -= nr_blocks; 2558c2ecf20Sopenharmony_ci 2568c2ecf20Sopenharmony_ci start_block <<= PAGE_SHIFT - 9; 2578c2ecf20Sopenharmony_ci nr_blocks <<= PAGE_SHIFT - 9; 2588c2ecf20Sopenharmony_ci if (blkdev_issue_discard(si->bdev, start_block, 2598c2ecf20Sopenharmony_ci nr_blocks, GFP_NOIO, 0)) 2608c2ecf20Sopenharmony_ci break; 2618c2ecf20Sopenharmony_ci 2628c2ecf20Sopenharmony_ci se = next_se(se); 2638c2ecf20Sopenharmony_ci } 2648c2ecf20Sopenharmony_ci} 2658c2ecf20Sopenharmony_ci 2668c2ecf20Sopenharmony_ci#ifdef CONFIG_THP_SWAP 2678c2ecf20Sopenharmony_ci#define SWAPFILE_CLUSTER HPAGE_PMD_NR 2688c2ecf20Sopenharmony_ci 2698c2ecf20Sopenharmony_ci#define swap_entry_size(size) (size) 2708c2ecf20Sopenharmony_ci#else 2718c2ecf20Sopenharmony_ci#define SWAPFILE_CLUSTER 256 2728c2ecf20Sopenharmony_ci 2738c2ecf20Sopenharmony_ci/* 2748c2ecf20Sopenharmony_ci * Define swap_entry_size() as constant to let compiler to optimize 2758c2ecf20Sopenharmony_ci * out some code if !CONFIG_THP_SWAP 2768c2ecf20Sopenharmony_ci */ 2778c2ecf20Sopenharmony_ci#define swap_entry_size(size) 1 2788c2ecf20Sopenharmony_ci#endif 2798c2ecf20Sopenharmony_ci#define LATENCY_LIMIT 256 2808c2ecf20Sopenharmony_ci 
2818c2ecf20Sopenharmony_cistatic inline void cluster_set_flag(struct swap_cluster_info *info, 2828c2ecf20Sopenharmony_ci unsigned int flag) 2838c2ecf20Sopenharmony_ci{ 2848c2ecf20Sopenharmony_ci info->flags = flag; 2858c2ecf20Sopenharmony_ci} 2868c2ecf20Sopenharmony_ci 2878c2ecf20Sopenharmony_cistatic inline unsigned int cluster_count(struct swap_cluster_info *info) 2888c2ecf20Sopenharmony_ci{ 2898c2ecf20Sopenharmony_ci return info->data; 2908c2ecf20Sopenharmony_ci} 2918c2ecf20Sopenharmony_ci 2928c2ecf20Sopenharmony_cistatic inline void cluster_set_count(struct swap_cluster_info *info, 2938c2ecf20Sopenharmony_ci unsigned int c) 2948c2ecf20Sopenharmony_ci{ 2958c2ecf20Sopenharmony_ci info->data = c; 2968c2ecf20Sopenharmony_ci} 2978c2ecf20Sopenharmony_ci 2988c2ecf20Sopenharmony_cistatic inline void cluster_set_count_flag(struct swap_cluster_info *info, 2998c2ecf20Sopenharmony_ci unsigned int c, unsigned int f) 3008c2ecf20Sopenharmony_ci{ 3018c2ecf20Sopenharmony_ci info->flags = f; 3028c2ecf20Sopenharmony_ci info->data = c; 3038c2ecf20Sopenharmony_ci} 3048c2ecf20Sopenharmony_ci 3058c2ecf20Sopenharmony_cistatic inline unsigned int cluster_next(struct swap_cluster_info *info) 3068c2ecf20Sopenharmony_ci{ 3078c2ecf20Sopenharmony_ci return info->data; 3088c2ecf20Sopenharmony_ci} 3098c2ecf20Sopenharmony_ci 3108c2ecf20Sopenharmony_cistatic inline void cluster_set_next(struct swap_cluster_info *info, 3118c2ecf20Sopenharmony_ci unsigned int n) 3128c2ecf20Sopenharmony_ci{ 3138c2ecf20Sopenharmony_ci info->data = n; 3148c2ecf20Sopenharmony_ci} 3158c2ecf20Sopenharmony_ci 3168c2ecf20Sopenharmony_cistatic inline void cluster_set_next_flag(struct swap_cluster_info *info, 3178c2ecf20Sopenharmony_ci unsigned int n, unsigned int f) 3188c2ecf20Sopenharmony_ci{ 3198c2ecf20Sopenharmony_ci info->flags = f; 3208c2ecf20Sopenharmony_ci info->data = n; 3218c2ecf20Sopenharmony_ci} 3228c2ecf20Sopenharmony_ci 3238c2ecf20Sopenharmony_cistatic inline bool cluster_is_free(struct swap_cluster_info 
*info) 3248c2ecf20Sopenharmony_ci{ 3258c2ecf20Sopenharmony_ci return info->flags & CLUSTER_FLAG_FREE; 3268c2ecf20Sopenharmony_ci} 3278c2ecf20Sopenharmony_ci 3288c2ecf20Sopenharmony_cistatic inline bool cluster_is_null(struct swap_cluster_info *info) 3298c2ecf20Sopenharmony_ci{ 3308c2ecf20Sopenharmony_ci return info->flags & CLUSTER_FLAG_NEXT_NULL; 3318c2ecf20Sopenharmony_ci} 3328c2ecf20Sopenharmony_ci 3338c2ecf20Sopenharmony_cistatic inline void cluster_set_null(struct swap_cluster_info *info) 3348c2ecf20Sopenharmony_ci{ 3358c2ecf20Sopenharmony_ci info->flags = CLUSTER_FLAG_NEXT_NULL; 3368c2ecf20Sopenharmony_ci info->data = 0; 3378c2ecf20Sopenharmony_ci} 3388c2ecf20Sopenharmony_ci 3398c2ecf20Sopenharmony_cistatic inline bool cluster_is_huge(struct swap_cluster_info *info) 3408c2ecf20Sopenharmony_ci{ 3418c2ecf20Sopenharmony_ci if (IS_ENABLED(CONFIG_THP_SWAP)) 3428c2ecf20Sopenharmony_ci return info->flags & CLUSTER_FLAG_HUGE; 3438c2ecf20Sopenharmony_ci return false; 3448c2ecf20Sopenharmony_ci} 3458c2ecf20Sopenharmony_ci 3468c2ecf20Sopenharmony_cistatic inline void cluster_clear_huge(struct swap_cluster_info *info) 3478c2ecf20Sopenharmony_ci{ 3488c2ecf20Sopenharmony_ci info->flags &= ~CLUSTER_FLAG_HUGE; 3498c2ecf20Sopenharmony_ci} 3508c2ecf20Sopenharmony_ci 3518c2ecf20Sopenharmony_cistatic inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, 3528c2ecf20Sopenharmony_ci unsigned long offset) 3538c2ecf20Sopenharmony_ci{ 3548c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 3558c2ecf20Sopenharmony_ci 3568c2ecf20Sopenharmony_ci ci = si->cluster_info; 3578c2ecf20Sopenharmony_ci if (ci) { 3588c2ecf20Sopenharmony_ci ci += offset / SWAPFILE_CLUSTER; 3598c2ecf20Sopenharmony_ci spin_lock(&ci->lock); 3608c2ecf20Sopenharmony_ci } 3618c2ecf20Sopenharmony_ci return ci; 3628c2ecf20Sopenharmony_ci} 3638c2ecf20Sopenharmony_ci 3648c2ecf20Sopenharmony_cistatic inline void unlock_cluster(struct swap_cluster_info *ci) 3658c2ecf20Sopenharmony_ci{ 
3668c2ecf20Sopenharmony_ci if (ci) 3678c2ecf20Sopenharmony_ci spin_unlock(&ci->lock); 3688c2ecf20Sopenharmony_ci} 3698c2ecf20Sopenharmony_ci 3708c2ecf20Sopenharmony_ci/* 3718c2ecf20Sopenharmony_ci * Determine the locking method in use for this device. Return 3728c2ecf20Sopenharmony_ci * swap_cluster_info if SSD-style cluster-based locking is in place. 3738c2ecf20Sopenharmony_ci */ 3748c2ecf20Sopenharmony_cistatic inline struct swap_cluster_info *lock_cluster_or_swap_info( 3758c2ecf20Sopenharmony_ci struct swap_info_struct *si, unsigned long offset) 3768c2ecf20Sopenharmony_ci{ 3778c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 3788c2ecf20Sopenharmony_ci 3798c2ecf20Sopenharmony_ci /* Try to use fine-grained SSD-style locking if available: */ 3808c2ecf20Sopenharmony_ci ci = lock_cluster(si, offset); 3818c2ecf20Sopenharmony_ci /* Otherwise, fall back to traditional, coarse locking: */ 3828c2ecf20Sopenharmony_ci if (!ci) 3838c2ecf20Sopenharmony_ci spin_lock(&si->lock); 3848c2ecf20Sopenharmony_ci 3858c2ecf20Sopenharmony_ci return ci; 3868c2ecf20Sopenharmony_ci} 3878c2ecf20Sopenharmony_ci 3888c2ecf20Sopenharmony_cistatic inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, 3898c2ecf20Sopenharmony_ci struct swap_cluster_info *ci) 3908c2ecf20Sopenharmony_ci{ 3918c2ecf20Sopenharmony_ci if (ci) 3928c2ecf20Sopenharmony_ci unlock_cluster(ci); 3938c2ecf20Sopenharmony_ci else 3948c2ecf20Sopenharmony_ci spin_unlock(&si->lock); 3958c2ecf20Sopenharmony_ci} 3968c2ecf20Sopenharmony_ci 3978c2ecf20Sopenharmony_cistatic inline bool cluster_list_empty(struct swap_cluster_list *list) 3988c2ecf20Sopenharmony_ci{ 3998c2ecf20Sopenharmony_ci return cluster_is_null(&list->head); 4008c2ecf20Sopenharmony_ci} 4018c2ecf20Sopenharmony_ci 4028c2ecf20Sopenharmony_cistatic inline unsigned int cluster_list_first(struct swap_cluster_list *list) 4038c2ecf20Sopenharmony_ci{ 4048c2ecf20Sopenharmony_ci return cluster_next(&list->head); 4058c2ecf20Sopenharmony_ci} 
4068c2ecf20Sopenharmony_ci 4078c2ecf20Sopenharmony_cistatic void cluster_list_init(struct swap_cluster_list *list) 4088c2ecf20Sopenharmony_ci{ 4098c2ecf20Sopenharmony_ci cluster_set_null(&list->head); 4108c2ecf20Sopenharmony_ci cluster_set_null(&list->tail); 4118c2ecf20Sopenharmony_ci} 4128c2ecf20Sopenharmony_ci 4138c2ecf20Sopenharmony_cistatic void cluster_list_add_tail(struct swap_cluster_list *list, 4148c2ecf20Sopenharmony_ci struct swap_cluster_info *ci, 4158c2ecf20Sopenharmony_ci unsigned int idx) 4168c2ecf20Sopenharmony_ci{ 4178c2ecf20Sopenharmony_ci if (cluster_list_empty(list)) { 4188c2ecf20Sopenharmony_ci cluster_set_next_flag(&list->head, idx, 0); 4198c2ecf20Sopenharmony_ci cluster_set_next_flag(&list->tail, idx, 0); 4208c2ecf20Sopenharmony_ci } else { 4218c2ecf20Sopenharmony_ci struct swap_cluster_info *ci_tail; 4228c2ecf20Sopenharmony_ci unsigned int tail = cluster_next(&list->tail); 4238c2ecf20Sopenharmony_ci 4248c2ecf20Sopenharmony_ci /* 4258c2ecf20Sopenharmony_ci * Nested cluster lock, but both cluster locks are 4268c2ecf20Sopenharmony_ci * only acquired when we held swap_info_struct->lock 4278c2ecf20Sopenharmony_ci */ 4288c2ecf20Sopenharmony_ci ci_tail = ci + tail; 4298c2ecf20Sopenharmony_ci spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); 4308c2ecf20Sopenharmony_ci cluster_set_next(ci_tail, idx); 4318c2ecf20Sopenharmony_ci spin_unlock(&ci_tail->lock); 4328c2ecf20Sopenharmony_ci cluster_set_next_flag(&list->tail, idx, 0); 4338c2ecf20Sopenharmony_ci } 4348c2ecf20Sopenharmony_ci} 4358c2ecf20Sopenharmony_ci 4368c2ecf20Sopenharmony_cistatic unsigned int cluster_list_del_first(struct swap_cluster_list *list, 4378c2ecf20Sopenharmony_ci struct swap_cluster_info *ci) 4388c2ecf20Sopenharmony_ci{ 4398c2ecf20Sopenharmony_ci unsigned int idx; 4408c2ecf20Sopenharmony_ci 4418c2ecf20Sopenharmony_ci idx = cluster_next(&list->head); 4428c2ecf20Sopenharmony_ci if (cluster_next(&list->tail) == idx) { 4438c2ecf20Sopenharmony_ci cluster_set_null(&list->head); 
4448c2ecf20Sopenharmony_ci cluster_set_null(&list->tail); 4458c2ecf20Sopenharmony_ci } else 4468c2ecf20Sopenharmony_ci cluster_set_next_flag(&list->head, 4478c2ecf20Sopenharmony_ci cluster_next(&ci[idx]), 0); 4488c2ecf20Sopenharmony_ci 4498c2ecf20Sopenharmony_ci return idx; 4508c2ecf20Sopenharmony_ci} 4518c2ecf20Sopenharmony_ci 4528c2ecf20Sopenharmony_ci/* Add a cluster to discard list and schedule it to do discard */ 4538c2ecf20Sopenharmony_cistatic void swap_cluster_schedule_discard(struct swap_info_struct *si, 4548c2ecf20Sopenharmony_ci unsigned int idx) 4558c2ecf20Sopenharmony_ci{ 4568c2ecf20Sopenharmony_ci /* 4578c2ecf20Sopenharmony_ci * If scan_swap_map() can't find a free cluster, it will check 4588c2ecf20Sopenharmony_ci * si->swap_map directly. To make sure the discarding cluster isn't 4598c2ecf20Sopenharmony_ci * taken by scan_swap_map(), mark the swap entries bad (occupied). It 4608c2ecf20Sopenharmony_ci * will be cleared after discard 4618c2ecf20Sopenharmony_ci */ 4628c2ecf20Sopenharmony_ci memset(si->swap_map + idx * SWAPFILE_CLUSTER, 4638c2ecf20Sopenharmony_ci SWAP_MAP_BAD, SWAPFILE_CLUSTER); 4648c2ecf20Sopenharmony_ci 4658c2ecf20Sopenharmony_ci cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); 4668c2ecf20Sopenharmony_ci 4678c2ecf20Sopenharmony_ci schedule_work(&si->discard_work); 4688c2ecf20Sopenharmony_ci} 4698c2ecf20Sopenharmony_ci 4708c2ecf20Sopenharmony_cistatic void __free_cluster(struct swap_info_struct *si, unsigned long idx) 4718c2ecf20Sopenharmony_ci{ 4728c2ecf20Sopenharmony_ci struct swap_cluster_info *ci = si->cluster_info; 4738c2ecf20Sopenharmony_ci 4748c2ecf20Sopenharmony_ci cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); 4758c2ecf20Sopenharmony_ci cluster_list_add_tail(&si->free_clusters, ci, idx); 4768c2ecf20Sopenharmony_ci} 4778c2ecf20Sopenharmony_ci 4788c2ecf20Sopenharmony_ci/* 4798c2ecf20Sopenharmony_ci * Doing discard actually. 
After a cluster discard is finished, the cluster 4808c2ecf20Sopenharmony_ci * will be added to free cluster list. caller should hold si->lock. 4818c2ecf20Sopenharmony_ci*/ 4828c2ecf20Sopenharmony_cistatic void swap_do_scheduled_discard(struct swap_info_struct *si) 4838c2ecf20Sopenharmony_ci{ 4848c2ecf20Sopenharmony_ci struct swap_cluster_info *info, *ci; 4858c2ecf20Sopenharmony_ci unsigned int idx; 4868c2ecf20Sopenharmony_ci 4878c2ecf20Sopenharmony_ci info = si->cluster_info; 4888c2ecf20Sopenharmony_ci 4898c2ecf20Sopenharmony_ci while (!cluster_list_empty(&si->discard_clusters)) { 4908c2ecf20Sopenharmony_ci idx = cluster_list_del_first(&si->discard_clusters, info); 4918c2ecf20Sopenharmony_ci spin_unlock(&si->lock); 4928c2ecf20Sopenharmony_ci 4938c2ecf20Sopenharmony_ci discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, 4948c2ecf20Sopenharmony_ci SWAPFILE_CLUSTER); 4958c2ecf20Sopenharmony_ci 4968c2ecf20Sopenharmony_ci spin_lock(&si->lock); 4978c2ecf20Sopenharmony_ci ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); 4988c2ecf20Sopenharmony_ci __free_cluster(si, idx); 4998c2ecf20Sopenharmony_ci memset(si->swap_map + idx * SWAPFILE_CLUSTER, 5008c2ecf20Sopenharmony_ci 0, SWAPFILE_CLUSTER); 5018c2ecf20Sopenharmony_ci unlock_cluster(ci); 5028c2ecf20Sopenharmony_ci } 5038c2ecf20Sopenharmony_ci} 5048c2ecf20Sopenharmony_ci 5058c2ecf20Sopenharmony_cistatic void swap_discard_work(struct work_struct *work) 5068c2ecf20Sopenharmony_ci{ 5078c2ecf20Sopenharmony_ci struct swap_info_struct *si; 5088c2ecf20Sopenharmony_ci 5098c2ecf20Sopenharmony_ci si = container_of(work, struct swap_info_struct, discard_work); 5108c2ecf20Sopenharmony_ci 5118c2ecf20Sopenharmony_ci spin_lock(&si->lock); 5128c2ecf20Sopenharmony_ci swap_do_scheduled_discard(si); 5138c2ecf20Sopenharmony_ci spin_unlock(&si->lock); 5148c2ecf20Sopenharmony_ci} 5158c2ecf20Sopenharmony_ci 5168c2ecf20Sopenharmony_cistatic void alloc_cluster(struct swap_info_struct *si, unsigned long idx) 5178c2ecf20Sopenharmony_ci{ 
5188c2ecf20Sopenharmony_ci struct swap_cluster_info *ci = si->cluster_info; 5198c2ecf20Sopenharmony_ci 5208c2ecf20Sopenharmony_ci VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); 5218c2ecf20Sopenharmony_ci cluster_list_del_first(&si->free_clusters, ci); 5228c2ecf20Sopenharmony_ci cluster_set_count_flag(ci + idx, 0, 0); 5238c2ecf20Sopenharmony_ci} 5248c2ecf20Sopenharmony_ci 5258c2ecf20Sopenharmony_cistatic void free_cluster(struct swap_info_struct *si, unsigned long idx) 5268c2ecf20Sopenharmony_ci{ 5278c2ecf20Sopenharmony_ci struct swap_cluster_info *ci = si->cluster_info + idx; 5288c2ecf20Sopenharmony_ci 5298c2ecf20Sopenharmony_ci VM_BUG_ON(cluster_count(ci) != 0); 5308c2ecf20Sopenharmony_ci /* 5318c2ecf20Sopenharmony_ci * If the swap is discardable, prepare discard the cluster 5328c2ecf20Sopenharmony_ci * instead of free it immediately. The cluster will be freed 5338c2ecf20Sopenharmony_ci * after discard. 5348c2ecf20Sopenharmony_ci */ 5358c2ecf20Sopenharmony_ci if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == 5368c2ecf20Sopenharmony_ci (SWP_WRITEOK | SWP_PAGE_DISCARD)) { 5378c2ecf20Sopenharmony_ci swap_cluster_schedule_discard(si, idx); 5388c2ecf20Sopenharmony_ci return; 5398c2ecf20Sopenharmony_ci } 5408c2ecf20Sopenharmony_ci 5418c2ecf20Sopenharmony_ci __free_cluster(si, idx); 5428c2ecf20Sopenharmony_ci} 5438c2ecf20Sopenharmony_ci 5448c2ecf20Sopenharmony_ci/* 5458c2ecf20Sopenharmony_ci * The cluster corresponding to page_nr will be used. The cluster will be 5468c2ecf20Sopenharmony_ci * removed from free cluster list and its usage counter will be increased. 
5478c2ecf20Sopenharmony_ci */ 5488c2ecf20Sopenharmony_cistatic void inc_cluster_info_page(struct swap_info_struct *p, 5498c2ecf20Sopenharmony_ci struct swap_cluster_info *cluster_info, unsigned long page_nr) 5508c2ecf20Sopenharmony_ci{ 5518c2ecf20Sopenharmony_ci unsigned long idx = page_nr / SWAPFILE_CLUSTER; 5528c2ecf20Sopenharmony_ci 5538c2ecf20Sopenharmony_ci if (!cluster_info) 5548c2ecf20Sopenharmony_ci return; 5558c2ecf20Sopenharmony_ci if (cluster_is_free(&cluster_info[idx])) 5568c2ecf20Sopenharmony_ci alloc_cluster(p, idx); 5578c2ecf20Sopenharmony_ci 5588c2ecf20Sopenharmony_ci VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); 5598c2ecf20Sopenharmony_ci cluster_set_count(&cluster_info[idx], 5608c2ecf20Sopenharmony_ci cluster_count(&cluster_info[idx]) + 1); 5618c2ecf20Sopenharmony_ci} 5628c2ecf20Sopenharmony_ci 5638c2ecf20Sopenharmony_ci/* 5648c2ecf20Sopenharmony_ci * The cluster corresponding to page_nr decreases one usage. If the usage 5658c2ecf20Sopenharmony_ci * counter becomes 0, which means no page in the cluster is in using, we can 5668c2ecf20Sopenharmony_ci * optionally discard the cluster and add it to free cluster list. 
5678c2ecf20Sopenharmony_ci */ 5688c2ecf20Sopenharmony_cistatic void dec_cluster_info_page(struct swap_info_struct *p, 5698c2ecf20Sopenharmony_ci struct swap_cluster_info *cluster_info, unsigned long page_nr) 5708c2ecf20Sopenharmony_ci{ 5718c2ecf20Sopenharmony_ci unsigned long idx = page_nr / SWAPFILE_CLUSTER; 5728c2ecf20Sopenharmony_ci 5738c2ecf20Sopenharmony_ci if (!cluster_info) 5748c2ecf20Sopenharmony_ci return; 5758c2ecf20Sopenharmony_ci 5768c2ecf20Sopenharmony_ci VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); 5778c2ecf20Sopenharmony_ci cluster_set_count(&cluster_info[idx], 5788c2ecf20Sopenharmony_ci cluster_count(&cluster_info[idx]) - 1); 5798c2ecf20Sopenharmony_ci 5808c2ecf20Sopenharmony_ci if (cluster_count(&cluster_info[idx]) == 0) 5818c2ecf20Sopenharmony_ci free_cluster(p, idx); 5828c2ecf20Sopenharmony_ci} 5838c2ecf20Sopenharmony_ci 5848c2ecf20Sopenharmony_ci/* 5858c2ecf20Sopenharmony_ci * It's possible scan_swap_map() uses a free cluster in the middle of free 5868c2ecf20Sopenharmony_ci * cluster list. Avoiding such abuse to avoid list corruption. 
5878c2ecf20Sopenharmony_ci */ 5888c2ecf20Sopenharmony_cistatic bool 5898c2ecf20Sopenharmony_ciscan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, 5908c2ecf20Sopenharmony_ci unsigned long offset) 5918c2ecf20Sopenharmony_ci{ 5928c2ecf20Sopenharmony_ci struct percpu_cluster *percpu_cluster; 5938c2ecf20Sopenharmony_ci bool conflict; 5948c2ecf20Sopenharmony_ci 5958c2ecf20Sopenharmony_ci offset /= SWAPFILE_CLUSTER; 5968c2ecf20Sopenharmony_ci conflict = !cluster_list_empty(&si->free_clusters) && 5978c2ecf20Sopenharmony_ci offset != cluster_list_first(&si->free_clusters) && 5988c2ecf20Sopenharmony_ci cluster_is_free(&si->cluster_info[offset]); 5998c2ecf20Sopenharmony_ci 6008c2ecf20Sopenharmony_ci if (!conflict) 6018c2ecf20Sopenharmony_ci return false; 6028c2ecf20Sopenharmony_ci 6038c2ecf20Sopenharmony_ci percpu_cluster = this_cpu_ptr(si->percpu_cluster); 6048c2ecf20Sopenharmony_ci cluster_set_null(&percpu_cluster->index); 6058c2ecf20Sopenharmony_ci return true; 6068c2ecf20Sopenharmony_ci} 6078c2ecf20Sopenharmony_ci 6088c2ecf20Sopenharmony_ci/* 6098c2ecf20Sopenharmony_ci * Try to get a swap entry from current cpu's swap entry pool (a cluster). This 6108c2ecf20Sopenharmony_ci * might involve allocating a new cluster for current CPU too. 
6118c2ecf20Sopenharmony_ci */ 6128c2ecf20Sopenharmony_cistatic bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, 6138c2ecf20Sopenharmony_ci unsigned long *offset, unsigned long *scan_base) 6148c2ecf20Sopenharmony_ci{ 6158c2ecf20Sopenharmony_ci struct percpu_cluster *cluster; 6168c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 6178c2ecf20Sopenharmony_ci unsigned long tmp, max; 6188c2ecf20Sopenharmony_ci 6198c2ecf20Sopenharmony_cinew_cluster: 6208c2ecf20Sopenharmony_ci cluster = this_cpu_ptr(si->percpu_cluster); 6218c2ecf20Sopenharmony_ci if (cluster_is_null(&cluster->index)) { 6228c2ecf20Sopenharmony_ci if (!cluster_list_empty(&si->free_clusters)) { 6238c2ecf20Sopenharmony_ci cluster->index = si->free_clusters.head; 6248c2ecf20Sopenharmony_ci cluster->next = cluster_next(&cluster->index) * 6258c2ecf20Sopenharmony_ci SWAPFILE_CLUSTER; 6268c2ecf20Sopenharmony_ci } else if (!cluster_list_empty(&si->discard_clusters)) { 6278c2ecf20Sopenharmony_ci /* 6288c2ecf20Sopenharmony_ci * we don't have free cluster but have some clusters in 6298c2ecf20Sopenharmony_ci * discarding, do discard now and reclaim them, then 6308c2ecf20Sopenharmony_ci * reread cluster_next_cpu since we dropped si->lock 6318c2ecf20Sopenharmony_ci */ 6328c2ecf20Sopenharmony_ci swap_do_scheduled_discard(si); 6338c2ecf20Sopenharmony_ci *scan_base = this_cpu_read(*si->cluster_next_cpu); 6348c2ecf20Sopenharmony_ci *offset = *scan_base; 6358c2ecf20Sopenharmony_ci goto new_cluster; 6368c2ecf20Sopenharmony_ci } else 6378c2ecf20Sopenharmony_ci return false; 6388c2ecf20Sopenharmony_ci } 6398c2ecf20Sopenharmony_ci 6408c2ecf20Sopenharmony_ci /* 6418c2ecf20Sopenharmony_ci * Other CPUs can use our cluster if they can't find a free cluster, 6428c2ecf20Sopenharmony_ci * check if there is still free entry in the cluster 6438c2ecf20Sopenharmony_ci */ 6448c2ecf20Sopenharmony_ci tmp = cluster->next; 6458c2ecf20Sopenharmony_ci max = min_t(unsigned long, si->max, 6468c2ecf20Sopenharmony_ci 
(cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); 6478c2ecf20Sopenharmony_ci if (tmp < max) { 6488c2ecf20Sopenharmony_ci ci = lock_cluster(si, tmp); 6498c2ecf20Sopenharmony_ci while (tmp < max) { 6508c2ecf20Sopenharmony_ci if (!si->swap_map[tmp]) 6518c2ecf20Sopenharmony_ci break; 6528c2ecf20Sopenharmony_ci tmp++; 6538c2ecf20Sopenharmony_ci } 6548c2ecf20Sopenharmony_ci unlock_cluster(ci); 6558c2ecf20Sopenharmony_ci } 6568c2ecf20Sopenharmony_ci if (tmp >= max) { 6578c2ecf20Sopenharmony_ci cluster_set_null(&cluster->index); 6588c2ecf20Sopenharmony_ci goto new_cluster; 6598c2ecf20Sopenharmony_ci } 6608c2ecf20Sopenharmony_ci cluster->next = tmp + 1; 6618c2ecf20Sopenharmony_ci *offset = tmp; 6628c2ecf20Sopenharmony_ci *scan_base = tmp; 6638c2ecf20Sopenharmony_ci return true; 6648c2ecf20Sopenharmony_ci} 6658c2ecf20Sopenharmony_ci 6668c2ecf20Sopenharmony_cistatic void __del_from_avail_list(struct swap_info_struct *p) 6678c2ecf20Sopenharmony_ci{ 6688c2ecf20Sopenharmony_ci int nid; 6698c2ecf20Sopenharmony_ci 6708c2ecf20Sopenharmony_ci assert_spin_locked(&p->lock); 6718c2ecf20Sopenharmony_ci for_each_node(nid) 6728c2ecf20Sopenharmony_ci plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); 6738c2ecf20Sopenharmony_ci} 6748c2ecf20Sopenharmony_ci 6758c2ecf20Sopenharmony_cistatic void del_from_avail_list(struct swap_info_struct *p) 6768c2ecf20Sopenharmony_ci{ 6778c2ecf20Sopenharmony_ci spin_lock(&swap_avail_lock); 6788c2ecf20Sopenharmony_ci __del_from_avail_list(p); 6798c2ecf20Sopenharmony_ci spin_unlock(&swap_avail_lock); 6808c2ecf20Sopenharmony_ci} 6818c2ecf20Sopenharmony_ci 6828c2ecf20Sopenharmony_cistatic void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, 6838c2ecf20Sopenharmony_ci unsigned int nr_entries) 6848c2ecf20Sopenharmony_ci{ 6858c2ecf20Sopenharmony_ci unsigned int end = offset + nr_entries - 1; 6868c2ecf20Sopenharmony_ci 6878c2ecf20Sopenharmony_ci if (offset == si->lowest_bit) 6888c2ecf20Sopenharmony_ci si->lowest_bit += nr_entries; 
6898c2ecf20Sopenharmony_ci if (end == si->highest_bit) 6908c2ecf20Sopenharmony_ci WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); 6918c2ecf20Sopenharmony_ci si->inuse_pages += nr_entries; 6928c2ecf20Sopenharmony_ci if (si->inuse_pages == si->pages) { 6938c2ecf20Sopenharmony_ci si->lowest_bit = si->max; 6948c2ecf20Sopenharmony_ci si->highest_bit = 0; 6958c2ecf20Sopenharmony_ci del_from_avail_list(si); 6968c2ecf20Sopenharmony_ci } 6978c2ecf20Sopenharmony_ci} 6988c2ecf20Sopenharmony_ci 6998c2ecf20Sopenharmony_cistatic void add_to_avail_list(struct swap_info_struct *p) 7008c2ecf20Sopenharmony_ci{ 7018c2ecf20Sopenharmony_ci int nid; 7028c2ecf20Sopenharmony_ci 7038c2ecf20Sopenharmony_ci spin_lock(&swap_avail_lock); 7048c2ecf20Sopenharmony_ci for_each_node(nid) { 7058c2ecf20Sopenharmony_ci WARN_ON(!plist_node_empty(&p->avail_lists[nid])); 7068c2ecf20Sopenharmony_ci plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); 7078c2ecf20Sopenharmony_ci } 7088c2ecf20Sopenharmony_ci spin_unlock(&swap_avail_lock); 7098c2ecf20Sopenharmony_ci} 7108c2ecf20Sopenharmony_ci 7118c2ecf20Sopenharmony_cistatic void swap_range_free(struct swap_info_struct *si, unsigned long offset, 7128c2ecf20Sopenharmony_ci unsigned int nr_entries) 7138c2ecf20Sopenharmony_ci{ 7148c2ecf20Sopenharmony_ci unsigned long begin = offset; 7158c2ecf20Sopenharmony_ci unsigned long end = offset + nr_entries - 1; 7168c2ecf20Sopenharmony_ci void (*swap_slot_free_notify)(struct block_device *, unsigned long); 7178c2ecf20Sopenharmony_ci 7188c2ecf20Sopenharmony_ci if (offset < si->lowest_bit) 7198c2ecf20Sopenharmony_ci si->lowest_bit = offset; 7208c2ecf20Sopenharmony_ci if (end > si->highest_bit) { 7218c2ecf20Sopenharmony_ci bool was_full = !si->highest_bit; 7228c2ecf20Sopenharmony_ci 7238c2ecf20Sopenharmony_ci WRITE_ONCE(si->highest_bit, end); 7248c2ecf20Sopenharmony_ci if (was_full && (si->flags & SWP_WRITEOK)) 7258c2ecf20Sopenharmony_ci add_to_avail_list(si); 7268c2ecf20Sopenharmony_ci } 
7278c2ecf20Sopenharmony_ci atomic_long_add(nr_entries, &nr_swap_pages); 7288c2ecf20Sopenharmony_ci si->inuse_pages -= nr_entries; 7298c2ecf20Sopenharmony_ci if (si->flags & SWP_BLKDEV) 7308c2ecf20Sopenharmony_ci swap_slot_free_notify = 7318c2ecf20Sopenharmony_ci si->bdev->bd_disk->fops->swap_slot_free_notify; 7328c2ecf20Sopenharmony_ci else 7338c2ecf20Sopenharmony_ci swap_slot_free_notify = NULL; 7348c2ecf20Sopenharmony_ci while (offset <= end) { 7358c2ecf20Sopenharmony_ci arch_swap_invalidate_page(si->type, offset); 7368c2ecf20Sopenharmony_ci frontswap_invalidate_page(si->type, offset); 7378c2ecf20Sopenharmony_ci if (swap_slot_free_notify) 7388c2ecf20Sopenharmony_ci swap_slot_free_notify(si->bdev, offset); 7398c2ecf20Sopenharmony_ci offset++; 7408c2ecf20Sopenharmony_ci } 7418c2ecf20Sopenharmony_ci clear_shadow_from_swap_cache(si->type, begin, end); 7428c2ecf20Sopenharmony_ci} 7438c2ecf20Sopenharmony_ci 7448c2ecf20Sopenharmony_cistatic void set_cluster_next(struct swap_info_struct *si, unsigned long next) 7458c2ecf20Sopenharmony_ci{ 7468c2ecf20Sopenharmony_ci unsigned long prev; 7478c2ecf20Sopenharmony_ci 7488c2ecf20Sopenharmony_ci if (!(si->flags & SWP_SOLIDSTATE)) { 7498c2ecf20Sopenharmony_ci si->cluster_next = next; 7508c2ecf20Sopenharmony_ci return; 7518c2ecf20Sopenharmony_ci } 7528c2ecf20Sopenharmony_ci 7538c2ecf20Sopenharmony_ci prev = this_cpu_read(*si->cluster_next_cpu); 7548c2ecf20Sopenharmony_ci /* 7558c2ecf20Sopenharmony_ci * Cross the swap address space size aligned trunk, choose 7568c2ecf20Sopenharmony_ci * another trunk randomly to avoid lock contention on swap 7578c2ecf20Sopenharmony_ci * address space if possible. 
7588c2ecf20Sopenharmony_ci */ 7598c2ecf20Sopenharmony_ci if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != 7608c2ecf20Sopenharmony_ci (next >> SWAP_ADDRESS_SPACE_SHIFT)) { 7618c2ecf20Sopenharmony_ci /* No free swap slots available */ 7628c2ecf20Sopenharmony_ci if (si->highest_bit <= si->lowest_bit) 7638c2ecf20Sopenharmony_ci return; 7648c2ecf20Sopenharmony_ci next = si->lowest_bit + 7658c2ecf20Sopenharmony_ci prandom_u32_max(si->highest_bit - si->lowest_bit + 1); 7668c2ecf20Sopenharmony_ci next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); 7678c2ecf20Sopenharmony_ci next = max_t(unsigned int, next, si->lowest_bit); 7688c2ecf20Sopenharmony_ci } 7698c2ecf20Sopenharmony_ci this_cpu_write(*si->cluster_next_cpu, next); 7708c2ecf20Sopenharmony_ci} 7718c2ecf20Sopenharmony_ci 7728c2ecf20Sopenharmony_cistatic int scan_swap_map_slots(struct swap_info_struct *si, 7738c2ecf20Sopenharmony_ci unsigned char usage, int nr, 7748c2ecf20Sopenharmony_ci swp_entry_t slots[]) 7758c2ecf20Sopenharmony_ci{ 7768c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 7778c2ecf20Sopenharmony_ci unsigned long offset; 7788c2ecf20Sopenharmony_ci unsigned long scan_base; 7798c2ecf20Sopenharmony_ci unsigned long last_in_cluster = 0; 7808c2ecf20Sopenharmony_ci int latency_ration = LATENCY_LIMIT; 7818c2ecf20Sopenharmony_ci int n_ret = 0; 7828c2ecf20Sopenharmony_ci bool scanned_many = false; 7838c2ecf20Sopenharmony_ci 7848c2ecf20Sopenharmony_ci /* 7858c2ecf20Sopenharmony_ci * We try to cluster swap pages by allocating them sequentially 7868c2ecf20Sopenharmony_ci * in swap. Once we've allocated SWAPFILE_CLUSTER pages this 7878c2ecf20Sopenharmony_ci * way, however, we resort to first-free allocation, starting 7888c2ecf20Sopenharmony_ci * a new cluster. This prevents us from scattering swap pages 7898c2ecf20Sopenharmony_ci * all over the entire swap partition, so that we reduce 7908c2ecf20Sopenharmony_ci * overall disk seek times between swap pages. 
-- sct 7918c2ecf20Sopenharmony_ci * But we do now try to find an empty cluster. -Andrea 7928c2ecf20Sopenharmony_ci * And we let swap pages go all over an SSD partition. Hugh 7938c2ecf20Sopenharmony_ci */ 7948c2ecf20Sopenharmony_ci 7958c2ecf20Sopenharmony_ci si->flags += SWP_SCANNING; 7968c2ecf20Sopenharmony_ci /* 7978c2ecf20Sopenharmony_ci * Use percpu scan base for SSD to reduce lock contention on 7988c2ecf20Sopenharmony_ci * cluster and swap cache. For HDD, sequential access is more 7998c2ecf20Sopenharmony_ci * important. 8008c2ecf20Sopenharmony_ci */ 8018c2ecf20Sopenharmony_ci if (si->flags & SWP_SOLIDSTATE) 8028c2ecf20Sopenharmony_ci scan_base = this_cpu_read(*si->cluster_next_cpu); 8038c2ecf20Sopenharmony_ci else 8048c2ecf20Sopenharmony_ci scan_base = si->cluster_next; 8058c2ecf20Sopenharmony_ci offset = scan_base; 8068c2ecf20Sopenharmony_ci 8078c2ecf20Sopenharmony_ci /* SSD algorithm */ 8088c2ecf20Sopenharmony_ci if (si->cluster_info) { 8098c2ecf20Sopenharmony_ci if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) 8108c2ecf20Sopenharmony_ci goto scan; 8118c2ecf20Sopenharmony_ci } else if (unlikely(!si->cluster_nr--)) { 8128c2ecf20Sopenharmony_ci if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 8138c2ecf20Sopenharmony_ci si->cluster_nr = SWAPFILE_CLUSTER - 1; 8148c2ecf20Sopenharmony_ci goto checks; 8158c2ecf20Sopenharmony_ci } 8168c2ecf20Sopenharmony_ci 8178c2ecf20Sopenharmony_ci spin_unlock(&si->lock); 8188c2ecf20Sopenharmony_ci 8198c2ecf20Sopenharmony_ci /* 8208c2ecf20Sopenharmony_ci * If seek is expensive, start searching for new cluster from 8218c2ecf20Sopenharmony_ci * start of partition, to minimize the span of allocated swap. 8228c2ecf20Sopenharmony_ci * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info 8238c2ecf20Sopenharmony_ci * case, just handled by scan_swap_map_try_ssd_cluster() above. 
8248c2ecf20Sopenharmony_ci */ 8258c2ecf20Sopenharmony_ci scan_base = offset = si->lowest_bit; 8268c2ecf20Sopenharmony_ci last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 8278c2ecf20Sopenharmony_ci 8288c2ecf20Sopenharmony_ci /* Locate the first empty (unaligned) cluster */ 8298c2ecf20Sopenharmony_ci for (; last_in_cluster <= si->highest_bit; offset++) { 8308c2ecf20Sopenharmony_ci if (si->swap_map[offset]) 8318c2ecf20Sopenharmony_ci last_in_cluster = offset + SWAPFILE_CLUSTER; 8328c2ecf20Sopenharmony_ci else if (offset == last_in_cluster) { 8338c2ecf20Sopenharmony_ci spin_lock(&si->lock); 8348c2ecf20Sopenharmony_ci offset -= SWAPFILE_CLUSTER - 1; 8358c2ecf20Sopenharmony_ci si->cluster_next = offset; 8368c2ecf20Sopenharmony_ci si->cluster_nr = SWAPFILE_CLUSTER - 1; 8378c2ecf20Sopenharmony_ci goto checks; 8388c2ecf20Sopenharmony_ci } 8398c2ecf20Sopenharmony_ci if (unlikely(--latency_ration < 0)) { 8408c2ecf20Sopenharmony_ci cond_resched(); 8418c2ecf20Sopenharmony_ci latency_ration = LATENCY_LIMIT; 8428c2ecf20Sopenharmony_ci } 8438c2ecf20Sopenharmony_ci } 8448c2ecf20Sopenharmony_ci 8458c2ecf20Sopenharmony_ci offset = scan_base; 8468c2ecf20Sopenharmony_ci spin_lock(&si->lock); 8478c2ecf20Sopenharmony_ci si->cluster_nr = SWAPFILE_CLUSTER - 1; 8488c2ecf20Sopenharmony_ci } 8498c2ecf20Sopenharmony_ci 8508c2ecf20Sopenharmony_cichecks: 8518c2ecf20Sopenharmony_ci if (si->cluster_info) { 8528c2ecf20Sopenharmony_ci while (scan_swap_map_ssd_cluster_conflict(si, offset)) { 8538c2ecf20Sopenharmony_ci /* take a break if we already got some slots */ 8548c2ecf20Sopenharmony_ci if (n_ret) 8558c2ecf20Sopenharmony_ci goto done; 8568c2ecf20Sopenharmony_ci if (!scan_swap_map_try_ssd_cluster(si, &offset, 8578c2ecf20Sopenharmony_ci &scan_base)) 8588c2ecf20Sopenharmony_ci goto scan; 8598c2ecf20Sopenharmony_ci } 8608c2ecf20Sopenharmony_ci } 8618c2ecf20Sopenharmony_ci if (!(si->flags & SWP_WRITEOK)) 8628c2ecf20Sopenharmony_ci goto no_page; 8638c2ecf20Sopenharmony_ci if (!si->highest_bit) 
8648c2ecf20Sopenharmony_ci goto no_page; 8658c2ecf20Sopenharmony_ci if (offset > si->highest_bit) 8668c2ecf20Sopenharmony_ci scan_base = offset = si->lowest_bit; 8678c2ecf20Sopenharmony_ci 8688c2ecf20Sopenharmony_ci ci = lock_cluster(si, offset); 8698c2ecf20Sopenharmony_ci /* reuse swap entry of cache-only swap if not busy. */ 8708c2ecf20Sopenharmony_ci if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 8718c2ecf20Sopenharmony_ci int swap_was_freed; 8728c2ecf20Sopenharmony_ci unlock_cluster(ci); 8738c2ecf20Sopenharmony_ci spin_unlock(&si->lock); 8748c2ecf20Sopenharmony_ci swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); 8758c2ecf20Sopenharmony_ci spin_lock(&si->lock); 8768c2ecf20Sopenharmony_ci /* entry was freed successfully, try to use this again */ 8778c2ecf20Sopenharmony_ci if (swap_was_freed) 8788c2ecf20Sopenharmony_ci goto checks; 8798c2ecf20Sopenharmony_ci goto scan; /* check next one */ 8808c2ecf20Sopenharmony_ci } 8818c2ecf20Sopenharmony_ci 8828c2ecf20Sopenharmony_ci if (si->swap_map[offset]) { 8838c2ecf20Sopenharmony_ci unlock_cluster(ci); 8848c2ecf20Sopenharmony_ci if (!n_ret) 8858c2ecf20Sopenharmony_ci goto scan; 8868c2ecf20Sopenharmony_ci else 8878c2ecf20Sopenharmony_ci goto done; 8888c2ecf20Sopenharmony_ci } 8898c2ecf20Sopenharmony_ci WRITE_ONCE(si->swap_map[offset], usage); 8908c2ecf20Sopenharmony_ci inc_cluster_info_page(si, si->cluster_info, offset); 8918c2ecf20Sopenharmony_ci unlock_cluster(ci); 8928c2ecf20Sopenharmony_ci 8938c2ecf20Sopenharmony_ci swap_range_alloc(si, offset, 1); 8948c2ecf20Sopenharmony_ci slots[n_ret++] = swp_entry(si->type, offset); 8958c2ecf20Sopenharmony_ci 8968c2ecf20Sopenharmony_ci /* got enough slots or reach max slots? 
*/ 8978c2ecf20Sopenharmony_ci if ((n_ret == nr) || (offset >= si->highest_bit)) 8988c2ecf20Sopenharmony_ci goto done; 8998c2ecf20Sopenharmony_ci 9008c2ecf20Sopenharmony_ci /* search for next available slot */ 9018c2ecf20Sopenharmony_ci 9028c2ecf20Sopenharmony_ci /* time to take a break? */ 9038c2ecf20Sopenharmony_ci if (unlikely(--latency_ration < 0)) { 9048c2ecf20Sopenharmony_ci if (n_ret) 9058c2ecf20Sopenharmony_ci goto done; 9068c2ecf20Sopenharmony_ci spin_unlock(&si->lock); 9078c2ecf20Sopenharmony_ci cond_resched(); 9088c2ecf20Sopenharmony_ci spin_lock(&si->lock); 9098c2ecf20Sopenharmony_ci latency_ration = LATENCY_LIMIT; 9108c2ecf20Sopenharmony_ci } 9118c2ecf20Sopenharmony_ci 9128c2ecf20Sopenharmony_ci /* try to get more slots in cluster */ 9138c2ecf20Sopenharmony_ci if (si->cluster_info) { 9148c2ecf20Sopenharmony_ci if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) 9158c2ecf20Sopenharmony_ci goto checks; 9168c2ecf20Sopenharmony_ci } else if (si->cluster_nr && !si->swap_map[++offset]) { 9178c2ecf20Sopenharmony_ci /* non-ssd case, still more slots in cluster? */ 9188c2ecf20Sopenharmony_ci --si->cluster_nr; 9198c2ecf20Sopenharmony_ci goto checks; 9208c2ecf20Sopenharmony_ci } 9218c2ecf20Sopenharmony_ci 9228c2ecf20Sopenharmony_ci /* 9238c2ecf20Sopenharmony_ci * Even if there's no free clusters available (fragmented), 9248c2ecf20Sopenharmony_ci * try to scan a little more quickly with lock held unless we 9258c2ecf20Sopenharmony_ci * have scanned too many slots already. 
9268c2ecf20Sopenharmony_ci */ 9278c2ecf20Sopenharmony_ci if (!scanned_many) { 9288c2ecf20Sopenharmony_ci unsigned long scan_limit; 9298c2ecf20Sopenharmony_ci 9308c2ecf20Sopenharmony_ci if (offset < scan_base) 9318c2ecf20Sopenharmony_ci scan_limit = scan_base; 9328c2ecf20Sopenharmony_ci else 9338c2ecf20Sopenharmony_ci scan_limit = si->highest_bit; 9348c2ecf20Sopenharmony_ci for (; offset <= scan_limit && --latency_ration > 0; 9358c2ecf20Sopenharmony_ci offset++) { 9368c2ecf20Sopenharmony_ci if (!si->swap_map[offset]) 9378c2ecf20Sopenharmony_ci goto checks; 9388c2ecf20Sopenharmony_ci } 9398c2ecf20Sopenharmony_ci } 9408c2ecf20Sopenharmony_ci 9418c2ecf20Sopenharmony_cidone: 9428c2ecf20Sopenharmony_ci set_cluster_next(si, offset + 1); 9438c2ecf20Sopenharmony_ci si->flags -= SWP_SCANNING; 9448c2ecf20Sopenharmony_ci return n_ret; 9458c2ecf20Sopenharmony_ci 9468c2ecf20Sopenharmony_ciscan: 9478c2ecf20Sopenharmony_ci spin_unlock(&si->lock); 9488c2ecf20Sopenharmony_ci while (++offset <= READ_ONCE(si->highest_bit)) { 9498c2ecf20Sopenharmony_ci if (data_race(!si->swap_map[offset])) { 9508c2ecf20Sopenharmony_ci spin_lock(&si->lock); 9518c2ecf20Sopenharmony_ci goto checks; 9528c2ecf20Sopenharmony_ci } 9538c2ecf20Sopenharmony_ci if (vm_swap_full() && 9548c2ecf20Sopenharmony_ci READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { 9558c2ecf20Sopenharmony_ci spin_lock(&si->lock); 9568c2ecf20Sopenharmony_ci goto checks; 9578c2ecf20Sopenharmony_ci } 9588c2ecf20Sopenharmony_ci if (unlikely(--latency_ration < 0)) { 9598c2ecf20Sopenharmony_ci cond_resched(); 9608c2ecf20Sopenharmony_ci latency_ration = LATENCY_LIMIT; 9618c2ecf20Sopenharmony_ci scanned_many = true; 9628c2ecf20Sopenharmony_ci } 9638c2ecf20Sopenharmony_ci } 9648c2ecf20Sopenharmony_ci offset = si->lowest_bit; 9658c2ecf20Sopenharmony_ci while (offset < scan_base) { 9668c2ecf20Sopenharmony_ci if (data_race(!si->swap_map[offset])) { 9678c2ecf20Sopenharmony_ci spin_lock(&si->lock); 9688c2ecf20Sopenharmony_ci goto checks; 
9698c2ecf20Sopenharmony_ci } 9708c2ecf20Sopenharmony_ci if (vm_swap_full() && 9718c2ecf20Sopenharmony_ci READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { 9728c2ecf20Sopenharmony_ci spin_lock(&si->lock); 9738c2ecf20Sopenharmony_ci goto checks; 9748c2ecf20Sopenharmony_ci } 9758c2ecf20Sopenharmony_ci if (unlikely(--latency_ration < 0)) { 9768c2ecf20Sopenharmony_ci cond_resched(); 9778c2ecf20Sopenharmony_ci latency_ration = LATENCY_LIMIT; 9788c2ecf20Sopenharmony_ci scanned_many = true; 9798c2ecf20Sopenharmony_ci } 9808c2ecf20Sopenharmony_ci offset++; 9818c2ecf20Sopenharmony_ci } 9828c2ecf20Sopenharmony_ci spin_lock(&si->lock); 9838c2ecf20Sopenharmony_ci 9848c2ecf20Sopenharmony_cino_page: 9858c2ecf20Sopenharmony_ci si->flags -= SWP_SCANNING; 9868c2ecf20Sopenharmony_ci return n_ret; 9878c2ecf20Sopenharmony_ci} 9888c2ecf20Sopenharmony_ci 9898c2ecf20Sopenharmony_cistatic int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) 9908c2ecf20Sopenharmony_ci{ 9918c2ecf20Sopenharmony_ci unsigned long idx; 9928c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 9938c2ecf20Sopenharmony_ci unsigned long offset, i; 9948c2ecf20Sopenharmony_ci unsigned char *map; 9958c2ecf20Sopenharmony_ci 9968c2ecf20Sopenharmony_ci /* 9978c2ecf20Sopenharmony_ci * Should not even be attempting cluster allocations when huge 9988c2ecf20Sopenharmony_ci * page swap is disabled. Warn and fail the allocation. 
9998c2ecf20Sopenharmony_ci */ 10008c2ecf20Sopenharmony_ci if (!IS_ENABLED(CONFIG_THP_SWAP)) { 10018c2ecf20Sopenharmony_ci VM_WARN_ON_ONCE(1); 10028c2ecf20Sopenharmony_ci return 0; 10038c2ecf20Sopenharmony_ci } 10048c2ecf20Sopenharmony_ci 10058c2ecf20Sopenharmony_ci if (cluster_list_empty(&si->free_clusters)) 10068c2ecf20Sopenharmony_ci return 0; 10078c2ecf20Sopenharmony_ci 10088c2ecf20Sopenharmony_ci idx = cluster_list_first(&si->free_clusters); 10098c2ecf20Sopenharmony_ci offset = idx * SWAPFILE_CLUSTER; 10108c2ecf20Sopenharmony_ci ci = lock_cluster(si, offset); 10118c2ecf20Sopenharmony_ci alloc_cluster(si, idx); 10128c2ecf20Sopenharmony_ci cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE); 10138c2ecf20Sopenharmony_ci 10148c2ecf20Sopenharmony_ci map = si->swap_map + offset; 10158c2ecf20Sopenharmony_ci for (i = 0; i < SWAPFILE_CLUSTER; i++) 10168c2ecf20Sopenharmony_ci map[i] = SWAP_HAS_CACHE; 10178c2ecf20Sopenharmony_ci unlock_cluster(ci); 10188c2ecf20Sopenharmony_ci swap_range_alloc(si, offset, SWAPFILE_CLUSTER); 10198c2ecf20Sopenharmony_ci *slot = swp_entry(si->type, offset); 10208c2ecf20Sopenharmony_ci 10218c2ecf20Sopenharmony_ci return 1; 10228c2ecf20Sopenharmony_ci} 10238c2ecf20Sopenharmony_ci 10248c2ecf20Sopenharmony_cistatic void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) 10258c2ecf20Sopenharmony_ci{ 10268c2ecf20Sopenharmony_ci unsigned long offset = idx * SWAPFILE_CLUSTER; 10278c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 10288c2ecf20Sopenharmony_ci 10298c2ecf20Sopenharmony_ci ci = lock_cluster(si, offset); 10308c2ecf20Sopenharmony_ci memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); 10318c2ecf20Sopenharmony_ci cluster_set_count_flag(ci, 0, 0); 10328c2ecf20Sopenharmony_ci free_cluster(si, idx); 10338c2ecf20Sopenharmony_ci unlock_cluster(ci); 10348c2ecf20Sopenharmony_ci swap_range_free(si, offset, SWAPFILE_CLUSTER); 10358c2ecf20Sopenharmony_ci} 10368c2ecf20Sopenharmony_ci 10378c2ecf20Sopenharmony_cistatic 
unsigned long scan_swap_map(struct swap_info_struct *si, 10388c2ecf20Sopenharmony_ci unsigned char usage) 10398c2ecf20Sopenharmony_ci{ 10408c2ecf20Sopenharmony_ci swp_entry_t entry; 10418c2ecf20Sopenharmony_ci int n_ret; 10428c2ecf20Sopenharmony_ci 10438c2ecf20Sopenharmony_ci n_ret = scan_swap_map_slots(si, usage, 1, &entry); 10448c2ecf20Sopenharmony_ci 10458c2ecf20Sopenharmony_ci if (n_ret) 10468c2ecf20Sopenharmony_ci return swp_offset(entry); 10478c2ecf20Sopenharmony_ci else 10488c2ecf20Sopenharmony_ci return 0; 10498c2ecf20Sopenharmony_ci 10508c2ecf20Sopenharmony_ci} 10518c2ecf20Sopenharmony_ci 10528c2ecf20Sopenharmony_ciint get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) 10538c2ecf20Sopenharmony_ci{ 10548c2ecf20Sopenharmony_ci unsigned long size = swap_entry_size(entry_size); 10558c2ecf20Sopenharmony_ci struct swap_info_struct *si, *next; 10568c2ecf20Sopenharmony_ci long avail_pgs; 10578c2ecf20Sopenharmony_ci int n_ret = 0; 10588c2ecf20Sopenharmony_ci int node; 10598c2ecf20Sopenharmony_ci 10608c2ecf20Sopenharmony_ci /* Only single cluster request supported */ 10618c2ecf20Sopenharmony_ci WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); 10628c2ecf20Sopenharmony_ci 10638c2ecf20Sopenharmony_ci spin_lock(&swap_avail_lock); 10648c2ecf20Sopenharmony_ci 10658c2ecf20Sopenharmony_ci avail_pgs = atomic_long_read(&nr_swap_pages) / size; 10668c2ecf20Sopenharmony_ci if (avail_pgs <= 0) { 10678c2ecf20Sopenharmony_ci spin_unlock(&swap_avail_lock); 10688c2ecf20Sopenharmony_ci goto noswap; 10698c2ecf20Sopenharmony_ci } 10708c2ecf20Sopenharmony_ci 10718c2ecf20Sopenharmony_ci n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); 10728c2ecf20Sopenharmony_ci 10738c2ecf20Sopenharmony_ci atomic_long_sub(n_goal * size, &nr_swap_pages); 10748c2ecf20Sopenharmony_ci 10758c2ecf20Sopenharmony_cistart_over: 10768c2ecf20Sopenharmony_ci node = numa_node_id(); 10778c2ecf20Sopenharmony_ci plist_for_each_entry_safe(si, next, &swap_avail_heads[node], 
avail_lists[node]) { 10788c2ecf20Sopenharmony_ci /* requeue si to after same-priority siblings */ 10798c2ecf20Sopenharmony_ci plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); 10808c2ecf20Sopenharmony_ci spin_unlock(&swap_avail_lock); 10818c2ecf20Sopenharmony_ci spin_lock(&si->lock); 10828c2ecf20Sopenharmony_ci if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { 10838c2ecf20Sopenharmony_ci spin_lock(&swap_avail_lock); 10848c2ecf20Sopenharmony_ci if (plist_node_empty(&si->avail_lists[node])) { 10858c2ecf20Sopenharmony_ci spin_unlock(&si->lock); 10868c2ecf20Sopenharmony_ci goto nextsi; 10878c2ecf20Sopenharmony_ci } 10888c2ecf20Sopenharmony_ci WARN(!si->highest_bit, 10898c2ecf20Sopenharmony_ci "swap_info %d in list but !highest_bit\n", 10908c2ecf20Sopenharmony_ci si->type); 10918c2ecf20Sopenharmony_ci WARN(!(si->flags & SWP_WRITEOK), 10928c2ecf20Sopenharmony_ci "swap_info %d in list but !SWP_WRITEOK\n", 10938c2ecf20Sopenharmony_ci si->type); 10948c2ecf20Sopenharmony_ci __del_from_avail_list(si); 10958c2ecf20Sopenharmony_ci spin_unlock(&si->lock); 10968c2ecf20Sopenharmony_ci goto nextsi; 10978c2ecf20Sopenharmony_ci } 10988c2ecf20Sopenharmony_ci if (size == SWAPFILE_CLUSTER) { 10998c2ecf20Sopenharmony_ci if (si->flags & SWP_BLKDEV) 11008c2ecf20Sopenharmony_ci n_ret = swap_alloc_cluster(si, swp_entries); 11018c2ecf20Sopenharmony_ci } else 11028c2ecf20Sopenharmony_ci n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, 11038c2ecf20Sopenharmony_ci n_goal, swp_entries); 11048c2ecf20Sopenharmony_ci spin_unlock(&si->lock); 11058c2ecf20Sopenharmony_ci if (n_ret || size == SWAPFILE_CLUSTER) 11068c2ecf20Sopenharmony_ci goto check_out; 11078c2ecf20Sopenharmony_ci pr_debug("scan_swap_map of si %d failed to find offset\n", 11088c2ecf20Sopenharmony_ci si->type); 11098c2ecf20Sopenharmony_ci cond_resched(); 11108c2ecf20Sopenharmony_ci 11118c2ecf20Sopenharmony_ci spin_lock(&swap_avail_lock); 11128c2ecf20Sopenharmony_cinextsi: 11138c2ecf20Sopenharmony_ci /* 
11148c2ecf20Sopenharmony_ci * if we got here, it's likely that si was almost full before, 11158c2ecf20Sopenharmony_ci * and since scan_swap_map() can drop the si->lock, multiple 11168c2ecf20Sopenharmony_ci * callers probably all tried to get a page from the same si 11178c2ecf20Sopenharmony_ci * and it filled up before we could get one; or, the si filled 11188c2ecf20Sopenharmony_ci * up between us dropping swap_avail_lock and taking si->lock. 11198c2ecf20Sopenharmony_ci * Since we dropped the swap_avail_lock, the swap_avail_head 11208c2ecf20Sopenharmony_ci * list may have been modified; so if next is still in the 11218c2ecf20Sopenharmony_ci * swap_avail_head list then try it, otherwise start over 11228c2ecf20Sopenharmony_ci * if we have not gotten any slots. 11238c2ecf20Sopenharmony_ci */ 11248c2ecf20Sopenharmony_ci if (plist_node_empty(&next->avail_lists[node])) 11258c2ecf20Sopenharmony_ci goto start_over; 11268c2ecf20Sopenharmony_ci } 11278c2ecf20Sopenharmony_ci 11288c2ecf20Sopenharmony_ci spin_unlock(&swap_avail_lock); 11298c2ecf20Sopenharmony_ci 11308c2ecf20Sopenharmony_cicheck_out: 11318c2ecf20Sopenharmony_ci if (n_ret < n_goal) 11328c2ecf20Sopenharmony_ci atomic_long_add((long)(n_goal - n_ret) * size, 11338c2ecf20Sopenharmony_ci &nr_swap_pages); 11348c2ecf20Sopenharmony_cinoswap: 11358c2ecf20Sopenharmony_ci return n_ret; 11368c2ecf20Sopenharmony_ci} 11378c2ecf20Sopenharmony_ci 11388c2ecf20Sopenharmony_ci/* The only caller of this function is now suspend routine */ 11398c2ecf20Sopenharmony_ciswp_entry_t get_swap_page_of_type(int type) 11408c2ecf20Sopenharmony_ci{ 11418c2ecf20Sopenharmony_ci struct swap_info_struct *si = swap_type_to_swap_info(type); 11428c2ecf20Sopenharmony_ci pgoff_t offset; 11438c2ecf20Sopenharmony_ci 11448c2ecf20Sopenharmony_ci if (!si) 11458c2ecf20Sopenharmony_ci goto fail; 11468c2ecf20Sopenharmony_ci 11478c2ecf20Sopenharmony_ci spin_lock(&si->lock); 11488c2ecf20Sopenharmony_ci if (si->flags & SWP_WRITEOK) { 11498c2ecf20Sopenharmony_ci /* 
This is called for allocating swap entry, not cache */ 11508c2ecf20Sopenharmony_ci offset = scan_swap_map(si, 1); 11518c2ecf20Sopenharmony_ci if (offset) { 11528c2ecf20Sopenharmony_ci atomic_long_dec(&nr_swap_pages); 11538c2ecf20Sopenharmony_ci spin_unlock(&si->lock); 11548c2ecf20Sopenharmony_ci return swp_entry(type, offset); 11558c2ecf20Sopenharmony_ci } 11568c2ecf20Sopenharmony_ci } 11578c2ecf20Sopenharmony_ci spin_unlock(&si->lock); 11588c2ecf20Sopenharmony_cifail: 11598c2ecf20Sopenharmony_ci return (swp_entry_t) {0}; 11608c2ecf20Sopenharmony_ci} 11618c2ecf20Sopenharmony_ci 11628c2ecf20Sopenharmony_cistatic struct swap_info_struct *__swap_info_get(swp_entry_t entry) 11638c2ecf20Sopenharmony_ci{ 11648c2ecf20Sopenharmony_ci struct swap_info_struct *p; 11658c2ecf20Sopenharmony_ci unsigned long offset; 11668c2ecf20Sopenharmony_ci 11678c2ecf20Sopenharmony_ci if (!entry.val) 11688c2ecf20Sopenharmony_ci goto out; 11698c2ecf20Sopenharmony_ci p = swp_swap_info(entry); 11708c2ecf20Sopenharmony_ci if (!p) 11718c2ecf20Sopenharmony_ci goto bad_nofile; 11728c2ecf20Sopenharmony_ci if (data_race(!(p->flags & SWP_USED))) 11738c2ecf20Sopenharmony_ci goto bad_device; 11748c2ecf20Sopenharmony_ci offset = swp_offset(entry); 11758c2ecf20Sopenharmony_ci if (offset >= p->max) 11768c2ecf20Sopenharmony_ci goto bad_offset; 11778c2ecf20Sopenharmony_ci return p; 11788c2ecf20Sopenharmony_ci 11798c2ecf20Sopenharmony_cibad_offset: 11808c2ecf20Sopenharmony_ci pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val); 11818c2ecf20Sopenharmony_ci goto out; 11828c2ecf20Sopenharmony_cibad_device: 11838c2ecf20Sopenharmony_ci pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val); 11848c2ecf20Sopenharmony_ci goto out; 11858c2ecf20Sopenharmony_cibad_nofile: 11868c2ecf20Sopenharmony_ci pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val); 11878c2ecf20Sopenharmony_ciout: 11888c2ecf20Sopenharmony_ci return NULL; 11898c2ecf20Sopenharmony_ci} 11908c2ecf20Sopenharmony_ci 
11918c2ecf20Sopenharmony_cistatic struct swap_info_struct *_swap_info_get(swp_entry_t entry) 11928c2ecf20Sopenharmony_ci{ 11938c2ecf20Sopenharmony_ci struct swap_info_struct *p; 11948c2ecf20Sopenharmony_ci 11958c2ecf20Sopenharmony_ci p = __swap_info_get(entry); 11968c2ecf20Sopenharmony_ci if (!p) 11978c2ecf20Sopenharmony_ci goto out; 11988c2ecf20Sopenharmony_ci if (data_race(!p->swap_map[swp_offset(entry)])) 11998c2ecf20Sopenharmony_ci goto bad_free; 12008c2ecf20Sopenharmony_ci return p; 12018c2ecf20Sopenharmony_ci 12028c2ecf20Sopenharmony_cibad_free: 12038c2ecf20Sopenharmony_ci pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val); 12048c2ecf20Sopenharmony_ciout: 12058c2ecf20Sopenharmony_ci return NULL; 12068c2ecf20Sopenharmony_ci} 12078c2ecf20Sopenharmony_ci 12088c2ecf20Sopenharmony_cistatic struct swap_info_struct *swap_info_get(swp_entry_t entry) 12098c2ecf20Sopenharmony_ci{ 12108c2ecf20Sopenharmony_ci struct swap_info_struct *p; 12118c2ecf20Sopenharmony_ci 12128c2ecf20Sopenharmony_ci p = _swap_info_get(entry); 12138c2ecf20Sopenharmony_ci if (p) 12148c2ecf20Sopenharmony_ci spin_lock(&p->lock); 12158c2ecf20Sopenharmony_ci return p; 12168c2ecf20Sopenharmony_ci} 12178c2ecf20Sopenharmony_ci 12188c2ecf20Sopenharmony_cistatic struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, 12198c2ecf20Sopenharmony_ci struct swap_info_struct *q) 12208c2ecf20Sopenharmony_ci{ 12218c2ecf20Sopenharmony_ci struct swap_info_struct *p; 12228c2ecf20Sopenharmony_ci 12238c2ecf20Sopenharmony_ci p = _swap_info_get(entry); 12248c2ecf20Sopenharmony_ci 12258c2ecf20Sopenharmony_ci if (p != q) { 12268c2ecf20Sopenharmony_ci if (q != NULL) 12278c2ecf20Sopenharmony_ci spin_unlock(&q->lock); 12288c2ecf20Sopenharmony_ci if (p != NULL) 12298c2ecf20Sopenharmony_ci spin_lock(&p->lock); 12308c2ecf20Sopenharmony_ci } 12318c2ecf20Sopenharmony_ci return p; 12328c2ecf20Sopenharmony_ci} 12338c2ecf20Sopenharmony_ci 12348c2ecf20Sopenharmony_cistatic unsigned char __swap_entry_free_locked(struct 
swap_info_struct *p, 12358c2ecf20Sopenharmony_ci unsigned long offset, 12368c2ecf20Sopenharmony_ci unsigned char usage) 12378c2ecf20Sopenharmony_ci{ 12388c2ecf20Sopenharmony_ci unsigned char count; 12398c2ecf20Sopenharmony_ci unsigned char has_cache; 12408c2ecf20Sopenharmony_ci 12418c2ecf20Sopenharmony_ci count = p->swap_map[offset]; 12428c2ecf20Sopenharmony_ci 12438c2ecf20Sopenharmony_ci has_cache = count & SWAP_HAS_CACHE; 12448c2ecf20Sopenharmony_ci count &= ~SWAP_HAS_CACHE; 12458c2ecf20Sopenharmony_ci 12468c2ecf20Sopenharmony_ci if (usage == SWAP_HAS_CACHE) { 12478c2ecf20Sopenharmony_ci VM_BUG_ON(!has_cache); 12488c2ecf20Sopenharmony_ci has_cache = 0; 12498c2ecf20Sopenharmony_ci } else if (count == SWAP_MAP_SHMEM) { 12508c2ecf20Sopenharmony_ci /* 12518c2ecf20Sopenharmony_ci * Or we could insist on shmem.c using a special 12528c2ecf20Sopenharmony_ci * swap_shmem_free() and free_shmem_swap_and_cache()... 12538c2ecf20Sopenharmony_ci */ 12548c2ecf20Sopenharmony_ci count = 0; 12558c2ecf20Sopenharmony_ci } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { 12568c2ecf20Sopenharmony_ci if (count == COUNT_CONTINUED) { 12578c2ecf20Sopenharmony_ci if (swap_count_continued(p, offset, count)) 12588c2ecf20Sopenharmony_ci count = SWAP_MAP_MAX | COUNT_CONTINUED; 12598c2ecf20Sopenharmony_ci else 12608c2ecf20Sopenharmony_ci count = SWAP_MAP_MAX; 12618c2ecf20Sopenharmony_ci } else 12628c2ecf20Sopenharmony_ci count--; 12638c2ecf20Sopenharmony_ci } 12648c2ecf20Sopenharmony_ci 12658c2ecf20Sopenharmony_ci usage = count | has_cache; 12668c2ecf20Sopenharmony_ci if (usage) 12678c2ecf20Sopenharmony_ci WRITE_ONCE(p->swap_map[offset], usage); 12688c2ecf20Sopenharmony_ci else 12698c2ecf20Sopenharmony_ci WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE); 12708c2ecf20Sopenharmony_ci 12718c2ecf20Sopenharmony_ci return usage; 12728c2ecf20Sopenharmony_ci} 12738c2ecf20Sopenharmony_ci 12748c2ecf20Sopenharmony_ci/* 12758c2ecf20Sopenharmony_ci * Note that when only holding the PTL, swapoff might 
 * succeed immediately
 * after freeing a swap entry.  Therefore, immediately after
 * __swap_entry_free(), the swap info might become stale and should not
 * be touched without a prior get_swap_device().
 *
 * Check whether swap entry is valid in the swap device.  If so,
 * return pointer to swap_info_struct, and keep the swap entry valid
 * by preventing the swap device from being swapped off, until
 * put_swap_device() is called.  Otherwise return NULL.
 *
 * The entirety of the RCU read critical section must come before the
 * return from or after the call to synchronize_rcu() in
 * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
 * true, the si->map, si->cluster_info, etc. must be valid in the
 * critical section.
 *
 * Notice that swapoff or swapoff+swapon can still happen before the
 * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
 * in put_swap_device() if there isn't any other way to prevent
 * swapoff, such as page lock, page table lock, etc.  The caller must
 * be prepared for that.  For example, the following situation is
 * possible.
 *
 *   CPU1				CPU2
 *   do_swap_page()
 *     ...
swapoff+swapon 13018c2ecf20Sopenharmony_ci * __read_swap_cache_async() 13028c2ecf20Sopenharmony_ci * swapcache_prepare() 13038c2ecf20Sopenharmony_ci * __swap_duplicate() 13048c2ecf20Sopenharmony_ci * // check swap_map 13058c2ecf20Sopenharmony_ci * // verify PTE not changed 13068c2ecf20Sopenharmony_ci * 13078c2ecf20Sopenharmony_ci * In __swap_duplicate(), the swap_map need to be checked before 13088c2ecf20Sopenharmony_ci * changing partly because the specified swap entry may be for another 13098c2ecf20Sopenharmony_ci * swap device which has been swapoff. And in do_swap_page(), after 13108c2ecf20Sopenharmony_ci * the page is read from the swap device, the PTE is verified not 13118c2ecf20Sopenharmony_ci * changed with the page table locked to check whether the swap device 13128c2ecf20Sopenharmony_ci * has been swapoff or swapoff+swapon. 13138c2ecf20Sopenharmony_ci */ 13148c2ecf20Sopenharmony_cistruct swap_info_struct *get_swap_device(swp_entry_t entry) 13158c2ecf20Sopenharmony_ci{ 13168c2ecf20Sopenharmony_ci struct swap_info_struct *si; 13178c2ecf20Sopenharmony_ci unsigned long offset; 13188c2ecf20Sopenharmony_ci 13198c2ecf20Sopenharmony_ci if (!entry.val) 13208c2ecf20Sopenharmony_ci goto out; 13218c2ecf20Sopenharmony_ci si = swp_swap_info(entry); 13228c2ecf20Sopenharmony_ci if (!si) 13238c2ecf20Sopenharmony_ci goto bad_nofile; 13248c2ecf20Sopenharmony_ci 13258c2ecf20Sopenharmony_ci rcu_read_lock(); 13268c2ecf20Sopenharmony_ci if (data_race(!(si->flags & SWP_VALID))) 13278c2ecf20Sopenharmony_ci goto unlock_out; 13288c2ecf20Sopenharmony_ci offset = swp_offset(entry); 13298c2ecf20Sopenharmony_ci if (offset >= si->max) 13308c2ecf20Sopenharmony_ci goto unlock_out; 13318c2ecf20Sopenharmony_ci 13328c2ecf20Sopenharmony_ci return si; 13338c2ecf20Sopenharmony_cibad_nofile: 13348c2ecf20Sopenharmony_ci pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); 13358c2ecf20Sopenharmony_ciout: 13368c2ecf20Sopenharmony_ci return NULL; 13378c2ecf20Sopenharmony_ciunlock_out: 
13388c2ecf20Sopenharmony_ci rcu_read_unlock(); 13398c2ecf20Sopenharmony_ci return NULL; 13408c2ecf20Sopenharmony_ci} 13418c2ecf20Sopenharmony_ci 13428c2ecf20Sopenharmony_cistatic unsigned char __swap_entry_free(struct swap_info_struct *p, 13438c2ecf20Sopenharmony_ci swp_entry_t entry) 13448c2ecf20Sopenharmony_ci{ 13458c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 13468c2ecf20Sopenharmony_ci unsigned long offset = swp_offset(entry); 13478c2ecf20Sopenharmony_ci unsigned char usage; 13488c2ecf20Sopenharmony_ci 13498c2ecf20Sopenharmony_ci ci = lock_cluster_or_swap_info(p, offset); 13508c2ecf20Sopenharmony_ci usage = __swap_entry_free_locked(p, offset, 1); 13518c2ecf20Sopenharmony_ci unlock_cluster_or_swap_info(p, ci); 13528c2ecf20Sopenharmony_ci if (!usage) 13538c2ecf20Sopenharmony_ci free_swap_slot(entry); 13548c2ecf20Sopenharmony_ci 13558c2ecf20Sopenharmony_ci return usage; 13568c2ecf20Sopenharmony_ci} 13578c2ecf20Sopenharmony_ci 13588c2ecf20Sopenharmony_cistatic void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) 13598c2ecf20Sopenharmony_ci{ 13608c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 13618c2ecf20Sopenharmony_ci unsigned long offset = swp_offset(entry); 13628c2ecf20Sopenharmony_ci unsigned char count; 13638c2ecf20Sopenharmony_ci 13648c2ecf20Sopenharmony_ci ci = lock_cluster(p, offset); 13658c2ecf20Sopenharmony_ci count = p->swap_map[offset]; 13668c2ecf20Sopenharmony_ci VM_BUG_ON(count != SWAP_HAS_CACHE); 13678c2ecf20Sopenharmony_ci p->swap_map[offset] = 0; 13688c2ecf20Sopenharmony_ci dec_cluster_info_page(p, p->cluster_info, offset); 13698c2ecf20Sopenharmony_ci unlock_cluster(ci); 13708c2ecf20Sopenharmony_ci 13718c2ecf20Sopenharmony_ci mem_cgroup_uncharge_swap(entry, 1); 13728c2ecf20Sopenharmony_ci swap_range_free(p, offset, 1); 13738c2ecf20Sopenharmony_ci} 13748c2ecf20Sopenharmony_ci 13758c2ecf20Sopenharmony_ci/* 13768c2ecf20Sopenharmony_ci * Caller has made sure that the swap device corresponding to entry 
13778c2ecf20Sopenharmony_ci * is still around or has not been recycled. 13788c2ecf20Sopenharmony_ci */ 13798c2ecf20Sopenharmony_civoid swap_free(swp_entry_t entry) 13808c2ecf20Sopenharmony_ci{ 13818c2ecf20Sopenharmony_ci struct swap_info_struct *p; 13828c2ecf20Sopenharmony_ci 13838c2ecf20Sopenharmony_ci p = _swap_info_get(entry); 13848c2ecf20Sopenharmony_ci if (p) 13858c2ecf20Sopenharmony_ci __swap_entry_free(p, entry); 13868c2ecf20Sopenharmony_ci} 13878c2ecf20Sopenharmony_ci 13888c2ecf20Sopenharmony_ci/* 13898c2ecf20Sopenharmony_ci * Called after dropping swapcache to decrease refcnt to swap entries. 13908c2ecf20Sopenharmony_ci */ 13918c2ecf20Sopenharmony_civoid put_swap_page(struct page *page, swp_entry_t entry) 13928c2ecf20Sopenharmony_ci{ 13938c2ecf20Sopenharmony_ci unsigned long offset = swp_offset(entry); 13948c2ecf20Sopenharmony_ci unsigned long idx = offset / SWAPFILE_CLUSTER; 13958c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 13968c2ecf20Sopenharmony_ci struct swap_info_struct *si; 13978c2ecf20Sopenharmony_ci unsigned char *map; 13988c2ecf20Sopenharmony_ci unsigned int i, free_entries = 0; 13998c2ecf20Sopenharmony_ci unsigned char val; 14008c2ecf20Sopenharmony_ci int size = swap_entry_size(thp_nr_pages(page)); 14018c2ecf20Sopenharmony_ci 14028c2ecf20Sopenharmony_ci si = _swap_info_get(entry); 14038c2ecf20Sopenharmony_ci if (!si) 14048c2ecf20Sopenharmony_ci return; 14058c2ecf20Sopenharmony_ci 14068c2ecf20Sopenharmony_ci ci = lock_cluster_or_swap_info(si, offset); 14078c2ecf20Sopenharmony_ci if (size == SWAPFILE_CLUSTER) { 14088c2ecf20Sopenharmony_ci VM_BUG_ON(!cluster_is_huge(ci)); 14098c2ecf20Sopenharmony_ci map = si->swap_map + offset; 14108c2ecf20Sopenharmony_ci for (i = 0; i < SWAPFILE_CLUSTER; i++) { 14118c2ecf20Sopenharmony_ci val = map[i]; 14128c2ecf20Sopenharmony_ci VM_BUG_ON(!(val & SWAP_HAS_CACHE)); 14138c2ecf20Sopenharmony_ci if (val == SWAP_HAS_CACHE) 14148c2ecf20Sopenharmony_ci free_entries++; 14158c2ecf20Sopenharmony_ci } 
14168c2ecf20Sopenharmony_ci cluster_clear_huge(ci); 14178c2ecf20Sopenharmony_ci if (free_entries == SWAPFILE_CLUSTER) { 14188c2ecf20Sopenharmony_ci unlock_cluster_or_swap_info(si, ci); 14198c2ecf20Sopenharmony_ci spin_lock(&si->lock); 14208c2ecf20Sopenharmony_ci mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); 14218c2ecf20Sopenharmony_ci swap_free_cluster(si, idx); 14228c2ecf20Sopenharmony_ci spin_unlock(&si->lock); 14238c2ecf20Sopenharmony_ci return; 14248c2ecf20Sopenharmony_ci } 14258c2ecf20Sopenharmony_ci } 14268c2ecf20Sopenharmony_ci for (i = 0; i < size; i++, entry.val++) { 14278c2ecf20Sopenharmony_ci if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { 14288c2ecf20Sopenharmony_ci unlock_cluster_or_swap_info(si, ci); 14298c2ecf20Sopenharmony_ci free_swap_slot(entry); 14308c2ecf20Sopenharmony_ci if (i == size - 1) 14318c2ecf20Sopenharmony_ci return; 14328c2ecf20Sopenharmony_ci lock_cluster_or_swap_info(si, offset); 14338c2ecf20Sopenharmony_ci } 14348c2ecf20Sopenharmony_ci } 14358c2ecf20Sopenharmony_ci unlock_cluster_or_swap_info(si, ci); 14368c2ecf20Sopenharmony_ci} 14378c2ecf20Sopenharmony_ci 14388c2ecf20Sopenharmony_ci#ifdef CONFIG_THP_SWAP 14398c2ecf20Sopenharmony_ciint split_swap_cluster(swp_entry_t entry) 14408c2ecf20Sopenharmony_ci{ 14418c2ecf20Sopenharmony_ci struct swap_info_struct *si; 14428c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 14438c2ecf20Sopenharmony_ci unsigned long offset = swp_offset(entry); 14448c2ecf20Sopenharmony_ci 14458c2ecf20Sopenharmony_ci si = _swap_info_get(entry); 14468c2ecf20Sopenharmony_ci if (!si) 14478c2ecf20Sopenharmony_ci return -EBUSY; 14488c2ecf20Sopenharmony_ci ci = lock_cluster(si, offset); 14498c2ecf20Sopenharmony_ci cluster_clear_huge(ci); 14508c2ecf20Sopenharmony_ci unlock_cluster(ci); 14518c2ecf20Sopenharmony_ci return 0; 14528c2ecf20Sopenharmony_ci} 14538c2ecf20Sopenharmony_ci#endif 14548c2ecf20Sopenharmony_ci 14558c2ecf20Sopenharmony_cistatic int swp_entry_cmp(const void *ent1, const void 
*ent2) 14568c2ecf20Sopenharmony_ci{ 14578c2ecf20Sopenharmony_ci const swp_entry_t *e1 = ent1, *e2 = ent2; 14588c2ecf20Sopenharmony_ci 14598c2ecf20Sopenharmony_ci return (int)swp_type(*e1) - (int)swp_type(*e2); 14608c2ecf20Sopenharmony_ci} 14618c2ecf20Sopenharmony_ci 14628c2ecf20Sopenharmony_civoid swapcache_free_entries(swp_entry_t *entries, int n) 14638c2ecf20Sopenharmony_ci{ 14648c2ecf20Sopenharmony_ci struct swap_info_struct *p, *prev; 14658c2ecf20Sopenharmony_ci int i; 14668c2ecf20Sopenharmony_ci 14678c2ecf20Sopenharmony_ci if (n <= 0) 14688c2ecf20Sopenharmony_ci return; 14698c2ecf20Sopenharmony_ci 14708c2ecf20Sopenharmony_ci prev = NULL; 14718c2ecf20Sopenharmony_ci p = NULL; 14728c2ecf20Sopenharmony_ci 14738c2ecf20Sopenharmony_ci /* 14748c2ecf20Sopenharmony_ci * Sort swap entries by swap device, so each lock is only taken once. 14758c2ecf20Sopenharmony_ci * nr_swapfiles isn't absolutely correct, but the overhead of sort() is 14768c2ecf20Sopenharmony_ci * so low that it isn't necessary to optimize further. 14778c2ecf20Sopenharmony_ci */ 14788c2ecf20Sopenharmony_ci if (nr_swapfiles > 1) 14798c2ecf20Sopenharmony_ci sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); 14808c2ecf20Sopenharmony_ci for (i = 0; i < n; ++i) { 14818c2ecf20Sopenharmony_ci p = swap_info_get_cont(entries[i], prev); 14828c2ecf20Sopenharmony_ci if (p) 14838c2ecf20Sopenharmony_ci swap_entry_free(p, entries[i]); 14848c2ecf20Sopenharmony_ci prev = p; 14858c2ecf20Sopenharmony_ci } 14868c2ecf20Sopenharmony_ci if (p) 14878c2ecf20Sopenharmony_ci spin_unlock(&p->lock); 14888c2ecf20Sopenharmony_ci} 14898c2ecf20Sopenharmony_ci 14908c2ecf20Sopenharmony_ci/* 14918c2ecf20Sopenharmony_ci * How many references to page are currently swapped out? 14928c2ecf20Sopenharmony_ci * This does not give an exact answer when swap count is continued, 14938c2ecf20Sopenharmony_ci * but does include the high COUNT_CONTINUED flag to allow for that. 
14948c2ecf20Sopenharmony_ci */ 14958c2ecf20Sopenharmony_ciint page_swapcount(struct page *page) 14968c2ecf20Sopenharmony_ci{ 14978c2ecf20Sopenharmony_ci int count = 0; 14988c2ecf20Sopenharmony_ci struct swap_info_struct *p; 14998c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 15008c2ecf20Sopenharmony_ci swp_entry_t entry; 15018c2ecf20Sopenharmony_ci unsigned long offset; 15028c2ecf20Sopenharmony_ci 15038c2ecf20Sopenharmony_ci entry.val = page_private(page); 15048c2ecf20Sopenharmony_ci p = _swap_info_get(entry); 15058c2ecf20Sopenharmony_ci if (p) { 15068c2ecf20Sopenharmony_ci offset = swp_offset(entry); 15078c2ecf20Sopenharmony_ci ci = lock_cluster_or_swap_info(p, offset); 15088c2ecf20Sopenharmony_ci count = swap_count(p->swap_map[offset]); 15098c2ecf20Sopenharmony_ci unlock_cluster_or_swap_info(p, ci); 15108c2ecf20Sopenharmony_ci } 15118c2ecf20Sopenharmony_ci return count; 15128c2ecf20Sopenharmony_ci} 15138c2ecf20Sopenharmony_ci 15148c2ecf20Sopenharmony_ciint __swap_count(swp_entry_t entry) 15158c2ecf20Sopenharmony_ci{ 15168c2ecf20Sopenharmony_ci struct swap_info_struct *si; 15178c2ecf20Sopenharmony_ci pgoff_t offset = swp_offset(entry); 15188c2ecf20Sopenharmony_ci int count = 0; 15198c2ecf20Sopenharmony_ci 15208c2ecf20Sopenharmony_ci si = get_swap_device(entry); 15218c2ecf20Sopenharmony_ci if (si) { 15228c2ecf20Sopenharmony_ci count = swap_count(si->swap_map[offset]); 15238c2ecf20Sopenharmony_ci put_swap_device(si); 15248c2ecf20Sopenharmony_ci } 15258c2ecf20Sopenharmony_ci return count; 15268c2ecf20Sopenharmony_ci} 15278c2ecf20Sopenharmony_ci 15288c2ecf20Sopenharmony_cistatic int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) 15298c2ecf20Sopenharmony_ci{ 15308c2ecf20Sopenharmony_ci int count = 0; 15318c2ecf20Sopenharmony_ci pgoff_t offset = swp_offset(entry); 15328c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 15338c2ecf20Sopenharmony_ci 15348c2ecf20Sopenharmony_ci ci = lock_cluster_or_swap_info(si, offset); 15358c2ecf20Sopenharmony_ci 
count = swap_count(si->swap_map[offset]); 15368c2ecf20Sopenharmony_ci unlock_cluster_or_swap_info(si, ci); 15378c2ecf20Sopenharmony_ci return count; 15388c2ecf20Sopenharmony_ci} 15398c2ecf20Sopenharmony_ci 15408c2ecf20Sopenharmony_ci/* 15418c2ecf20Sopenharmony_ci * How many references to @entry are currently swapped out? 15428c2ecf20Sopenharmony_ci * This does not give an exact answer when swap count is continued, 15438c2ecf20Sopenharmony_ci * but does include the high COUNT_CONTINUED flag to allow for that. 15448c2ecf20Sopenharmony_ci */ 15458c2ecf20Sopenharmony_ciint __swp_swapcount(swp_entry_t entry) 15468c2ecf20Sopenharmony_ci{ 15478c2ecf20Sopenharmony_ci int count = 0; 15488c2ecf20Sopenharmony_ci struct swap_info_struct *si; 15498c2ecf20Sopenharmony_ci 15508c2ecf20Sopenharmony_ci si = get_swap_device(entry); 15518c2ecf20Sopenharmony_ci if (si) { 15528c2ecf20Sopenharmony_ci count = swap_swapcount(si, entry); 15538c2ecf20Sopenharmony_ci put_swap_device(si); 15548c2ecf20Sopenharmony_ci } 15558c2ecf20Sopenharmony_ci return count; 15568c2ecf20Sopenharmony_ci} 15578c2ecf20Sopenharmony_ci 15588c2ecf20Sopenharmony_ci/* 15598c2ecf20Sopenharmony_ci * How many references to @entry are currently swapped out? 15608c2ecf20Sopenharmony_ci * This considers COUNT_CONTINUED so it returns exact answer. 
15618c2ecf20Sopenharmony_ci */ 15628c2ecf20Sopenharmony_ciint swp_swapcount(swp_entry_t entry) 15638c2ecf20Sopenharmony_ci{ 15648c2ecf20Sopenharmony_ci int count, tmp_count, n; 15658c2ecf20Sopenharmony_ci struct swap_info_struct *p; 15668c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 15678c2ecf20Sopenharmony_ci struct page *page; 15688c2ecf20Sopenharmony_ci pgoff_t offset; 15698c2ecf20Sopenharmony_ci unsigned char *map; 15708c2ecf20Sopenharmony_ci 15718c2ecf20Sopenharmony_ci p = _swap_info_get(entry); 15728c2ecf20Sopenharmony_ci if (!p) 15738c2ecf20Sopenharmony_ci return 0; 15748c2ecf20Sopenharmony_ci 15758c2ecf20Sopenharmony_ci offset = swp_offset(entry); 15768c2ecf20Sopenharmony_ci 15778c2ecf20Sopenharmony_ci ci = lock_cluster_or_swap_info(p, offset); 15788c2ecf20Sopenharmony_ci 15798c2ecf20Sopenharmony_ci count = swap_count(p->swap_map[offset]); 15808c2ecf20Sopenharmony_ci if (!(count & COUNT_CONTINUED)) 15818c2ecf20Sopenharmony_ci goto out; 15828c2ecf20Sopenharmony_ci 15838c2ecf20Sopenharmony_ci count &= ~COUNT_CONTINUED; 15848c2ecf20Sopenharmony_ci n = SWAP_MAP_MAX + 1; 15858c2ecf20Sopenharmony_ci 15868c2ecf20Sopenharmony_ci page = vmalloc_to_page(p->swap_map + offset); 15878c2ecf20Sopenharmony_ci offset &= ~PAGE_MASK; 15888c2ecf20Sopenharmony_ci VM_BUG_ON(page_private(page) != SWP_CONTINUED); 15898c2ecf20Sopenharmony_ci 15908c2ecf20Sopenharmony_ci do { 15918c2ecf20Sopenharmony_ci page = list_next_entry(page, lru); 15928c2ecf20Sopenharmony_ci map = kmap_atomic(page); 15938c2ecf20Sopenharmony_ci tmp_count = map[offset]; 15948c2ecf20Sopenharmony_ci kunmap_atomic(map); 15958c2ecf20Sopenharmony_ci 15968c2ecf20Sopenharmony_ci count += (tmp_count & ~COUNT_CONTINUED) * n; 15978c2ecf20Sopenharmony_ci n *= (SWAP_CONT_MAX + 1); 15988c2ecf20Sopenharmony_ci } while (tmp_count & COUNT_CONTINUED); 15998c2ecf20Sopenharmony_ciout: 16008c2ecf20Sopenharmony_ci unlock_cluster_or_swap_info(p, ci); 16018c2ecf20Sopenharmony_ci return count; 16028c2ecf20Sopenharmony_ci} 
16038c2ecf20Sopenharmony_ci 16048c2ecf20Sopenharmony_cistatic bool swap_page_trans_huge_swapped(struct swap_info_struct *si, 16058c2ecf20Sopenharmony_ci swp_entry_t entry) 16068c2ecf20Sopenharmony_ci{ 16078c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 16088c2ecf20Sopenharmony_ci unsigned char *map = si->swap_map; 16098c2ecf20Sopenharmony_ci unsigned long roffset = swp_offset(entry); 16108c2ecf20Sopenharmony_ci unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER); 16118c2ecf20Sopenharmony_ci int i; 16128c2ecf20Sopenharmony_ci bool ret = false; 16138c2ecf20Sopenharmony_ci 16148c2ecf20Sopenharmony_ci ci = lock_cluster_or_swap_info(si, offset); 16158c2ecf20Sopenharmony_ci if (!ci || !cluster_is_huge(ci)) { 16168c2ecf20Sopenharmony_ci if (swap_count(map[roffset])) 16178c2ecf20Sopenharmony_ci ret = true; 16188c2ecf20Sopenharmony_ci goto unlock_out; 16198c2ecf20Sopenharmony_ci } 16208c2ecf20Sopenharmony_ci for (i = 0; i < SWAPFILE_CLUSTER; i++) { 16218c2ecf20Sopenharmony_ci if (swap_count(map[offset + i])) { 16228c2ecf20Sopenharmony_ci ret = true; 16238c2ecf20Sopenharmony_ci break; 16248c2ecf20Sopenharmony_ci } 16258c2ecf20Sopenharmony_ci } 16268c2ecf20Sopenharmony_ciunlock_out: 16278c2ecf20Sopenharmony_ci unlock_cluster_or_swap_info(si, ci); 16288c2ecf20Sopenharmony_ci return ret; 16298c2ecf20Sopenharmony_ci} 16308c2ecf20Sopenharmony_ci 16318c2ecf20Sopenharmony_cistatic bool page_swapped(struct page *page) 16328c2ecf20Sopenharmony_ci{ 16338c2ecf20Sopenharmony_ci swp_entry_t entry; 16348c2ecf20Sopenharmony_ci struct swap_info_struct *si; 16358c2ecf20Sopenharmony_ci 16368c2ecf20Sopenharmony_ci if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) 16378c2ecf20Sopenharmony_ci return page_swapcount(page) != 0; 16388c2ecf20Sopenharmony_ci 16398c2ecf20Sopenharmony_ci page = compound_head(page); 16408c2ecf20Sopenharmony_ci entry.val = page_private(page); 16418c2ecf20Sopenharmony_ci si = _swap_info_get(entry); 16428c2ecf20Sopenharmony_ci if (si) 
16438c2ecf20Sopenharmony_ci return swap_page_trans_huge_swapped(si, entry); 16448c2ecf20Sopenharmony_ci return false; 16458c2ecf20Sopenharmony_ci} 16468c2ecf20Sopenharmony_ci 16478c2ecf20Sopenharmony_cistatic int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, 16488c2ecf20Sopenharmony_ci int *total_swapcount) 16498c2ecf20Sopenharmony_ci{ 16508c2ecf20Sopenharmony_ci int i, map_swapcount, _total_mapcount, _total_swapcount; 16518c2ecf20Sopenharmony_ci unsigned long offset = 0; 16528c2ecf20Sopenharmony_ci struct swap_info_struct *si; 16538c2ecf20Sopenharmony_ci struct swap_cluster_info *ci = NULL; 16548c2ecf20Sopenharmony_ci unsigned char *map = NULL; 16558c2ecf20Sopenharmony_ci int mapcount, swapcount = 0; 16568c2ecf20Sopenharmony_ci 16578c2ecf20Sopenharmony_ci /* hugetlbfs shouldn't call it */ 16588c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(PageHuge(page), page); 16598c2ecf20Sopenharmony_ci 16608c2ecf20Sopenharmony_ci if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { 16618c2ecf20Sopenharmony_ci mapcount = page_trans_huge_mapcount(page, total_mapcount); 16628c2ecf20Sopenharmony_ci if (PageSwapCache(page)) 16638c2ecf20Sopenharmony_ci swapcount = page_swapcount(page); 16648c2ecf20Sopenharmony_ci if (total_swapcount) 16658c2ecf20Sopenharmony_ci *total_swapcount = swapcount; 16668c2ecf20Sopenharmony_ci return mapcount + swapcount; 16678c2ecf20Sopenharmony_ci } 16688c2ecf20Sopenharmony_ci 16698c2ecf20Sopenharmony_ci page = compound_head(page); 16708c2ecf20Sopenharmony_ci 16718c2ecf20Sopenharmony_ci _total_mapcount = _total_swapcount = map_swapcount = 0; 16728c2ecf20Sopenharmony_ci if (PageSwapCache(page)) { 16738c2ecf20Sopenharmony_ci swp_entry_t entry; 16748c2ecf20Sopenharmony_ci 16758c2ecf20Sopenharmony_ci entry.val = page_private(page); 16768c2ecf20Sopenharmony_ci si = _swap_info_get(entry); 16778c2ecf20Sopenharmony_ci if (si) { 16788c2ecf20Sopenharmony_ci map = si->swap_map; 16798c2ecf20Sopenharmony_ci offset = swp_offset(entry); 
16808c2ecf20Sopenharmony_ci } 16818c2ecf20Sopenharmony_ci } 16828c2ecf20Sopenharmony_ci if (map) 16838c2ecf20Sopenharmony_ci ci = lock_cluster(si, offset); 16848c2ecf20Sopenharmony_ci for (i = 0; i < HPAGE_PMD_NR; i++) { 16858c2ecf20Sopenharmony_ci mapcount = atomic_read(&page[i]._mapcount) + 1; 16868c2ecf20Sopenharmony_ci _total_mapcount += mapcount; 16878c2ecf20Sopenharmony_ci if (map) { 16888c2ecf20Sopenharmony_ci swapcount = swap_count(map[offset + i]); 16898c2ecf20Sopenharmony_ci _total_swapcount += swapcount; 16908c2ecf20Sopenharmony_ci } 16918c2ecf20Sopenharmony_ci map_swapcount = max(map_swapcount, mapcount + swapcount); 16928c2ecf20Sopenharmony_ci } 16938c2ecf20Sopenharmony_ci unlock_cluster(ci); 16948c2ecf20Sopenharmony_ci if (PageDoubleMap(page)) { 16958c2ecf20Sopenharmony_ci map_swapcount -= 1; 16968c2ecf20Sopenharmony_ci _total_mapcount -= HPAGE_PMD_NR; 16978c2ecf20Sopenharmony_ci } 16988c2ecf20Sopenharmony_ci mapcount = compound_mapcount(page); 16998c2ecf20Sopenharmony_ci map_swapcount += mapcount; 17008c2ecf20Sopenharmony_ci _total_mapcount += mapcount; 17018c2ecf20Sopenharmony_ci if (total_mapcount) 17028c2ecf20Sopenharmony_ci *total_mapcount = _total_mapcount; 17038c2ecf20Sopenharmony_ci if (total_swapcount) 17048c2ecf20Sopenharmony_ci *total_swapcount = _total_swapcount; 17058c2ecf20Sopenharmony_ci 17068c2ecf20Sopenharmony_ci return map_swapcount; 17078c2ecf20Sopenharmony_ci} 17088c2ecf20Sopenharmony_ci 17098c2ecf20Sopenharmony_ci/* 17108c2ecf20Sopenharmony_ci * We can write to an anon page without COW if there are no other references 17118c2ecf20Sopenharmony_ci * to it. And as a side-effect, free up its swap: because the old content 17128c2ecf20Sopenharmony_ci * on disk will never be read, and seeking back there to write new content 17138c2ecf20Sopenharmony_ci * later would only waste time away from clustering. 
17148c2ecf20Sopenharmony_ci * 17158c2ecf20Sopenharmony_ci * NOTE: total_map_swapcount should not be relied upon by the caller if 17168c2ecf20Sopenharmony_ci * reuse_swap_page() returns false, but it may be always overwritten 17178c2ecf20Sopenharmony_ci * (see the other implementation for CONFIG_SWAP=n). 17188c2ecf20Sopenharmony_ci */ 17198c2ecf20Sopenharmony_cibool reuse_swap_page(struct page *page, int *total_map_swapcount) 17208c2ecf20Sopenharmony_ci{ 17218c2ecf20Sopenharmony_ci int count, total_mapcount, total_swapcount; 17228c2ecf20Sopenharmony_ci 17238c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(!PageLocked(page), page); 17248c2ecf20Sopenharmony_ci if (unlikely(PageKsm(page))) 17258c2ecf20Sopenharmony_ci return false; 17268c2ecf20Sopenharmony_ci count = page_trans_huge_map_swapcount(page, &total_mapcount, 17278c2ecf20Sopenharmony_ci &total_swapcount); 17288c2ecf20Sopenharmony_ci if (total_map_swapcount) 17298c2ecf20Sopenharmony_ci *total_map_swapcount = total_mapcount + total_swapcount; 17308c2ecf20Sopenharmony_ci if (count == 1 && PageSwapCache(page) && 17318c2ecf20Sopenharmony_ci (likely(!PageTransCompound(page)) || 17328c2ecf20Sopenharmony_ci /* The remaining swap count will be freed soon */ 17338c2ecf20Sopenharmony_ci total_swapcount == page_swapcount(page))) { 17348c2ecf20Sopenharmony_ci if (!PageWriteback(page)) { 17358c2ecf20Sopenharmony_ci page = compound_head(page); 17368c2ecf20Sopenharmony_ci delete_from_swap_cache(page); 17378c2ecf20Sopenharmony_ci SetPageDirty(page); 17388c2ecf20Sopenharmony_ci } else { 17398c2ecf20Sopenharmony_ci swp_entry_t entry; 17408c2ecf20Sopenharmony_ci struct swap_info_struct *p; 17418c2ecf20Sopenharmony_ci 17428c2ecf20Sopenharmony_ci entry.val = page_private(page); 17438c2ecf20Sopenharmony_ci p = swap_info_get(entry); 17448c2ecf20Sopenharmony_ci if (p->flags & SWP_STABLE_WRITES) { 17458c2ecf20Sopenharmony_ci spin_unlock(&p->lock); 17468c2ecf20Sopenharmony_ci return false; 17478c2ecf20Sopenharmony_ci } 17488c2ecf20Sopenharmony_ci 
spin_unlock(&p->lock); 17498c2ecf20Sopenharmony_ci } 17508c2ecf20Sopenharmony_ci } 17518c2ecf20Sopenharmony_ci 17528c2ecf20Sopenharmony_ci return count <= 1; 17538c2ecf20Sopenharmony_ci} 17548c2ecf20Sopenharmony_ci 17558c2ecf20Sopenharmony_ci/* 17568c2ecf20Sopenharmony_ci * If swap is getting full, or if there are no more mappings of this page, 17578c2ecf20Sopenharmony_ci * then try_to_free_swap is called to free its swap space. 17588c2ecf20Sopenharmony_ci */ 17598c2ecf20Sopenharmony_ciint try_to_free_swap(struct page *page) 17608c2ecf20Sopenharmony_ci{ 17618c2ecf20Sopenharmony_ci VM_BUG_ON_PAGE(!PageLocked(page), page); 17628c2ecf20Sopenharmony_ci 17638c2ecf20Sopenharmony_ci if (!PageSwapCache(page)) 17648c2ecf20Sopenharmony_ci return 0; 17658c2ecf20Sopenharmony_ci if (PageWriteback(page)) 17668c2ecf20Sopenharmony_ci return 0; 17678c2ecf20Sopenharmony_ci if (page_swapped(page)) 17688c2ecf20Sopenharmony_ci return 0; 17698c2ecf20Sopenharmony_ci 17708c2ecf20Sopenharmony_ci /* 17718c2ecf20Sopenharmony_ci * Once hibernation has begun to create its image of memory, 17728c2ecf20Sopenharmony_ci * there's a danger that one of the calls to try_to_free_swap() 17738c2ecf20Sopenharmony_ci * - most probably a call from __try_to_reclaim_swap() while 17748c2ecf20Sopenharmony_ci * hibernation is allocating its own swap pages for the image, 17758c2ecf20Sopenharmony_ci * but conceivably even a call from memory reclaim - will free 17768c2ecf20Sopenharmony_ci * the swap from a page which has already been recorded in the 17778c2ecf20Sopenharmony_ci * image as a clean swapcache page, and then reuse its swap for 17788c2ecf20Sopenharmony_ci * another page of the image. On waking from hibernation, the 17798c2ecf20Sopenharmony_ci * original page might be freed under memory pressure, then 17808c2ecf20Sopenharmony_ci * later read back in from swap, now with the wrong data. 
17818c2ecf20Sopenharmony_ci * 17828c2ecf20Sopenharmony_ci * Hibernation suspends storage while it is writing the image 17838c2ecf20Sopenharmony_ci * to disk so check that here. 17848c2ecf20Sopenharmony_ci */ 17858c2ecf20Sopenharmony_ci if (pm_suspended_storage()) 17868c2ecf20Sopenharmony_ci return 0; 17878c2ecf20Sopenharmony_ci 17888c2ecf20Sopenharmony_ci page = compound_head(page); 17898c2ecf20Sopenharmony_ci delete_from_swap_cache(page); 17908c2ecf20Sopenharmony_ci SetPageDirty(page); 17918c2ecf20Sopenharmony_ci return 1; 17928c2ecf20Sopenharmony_ci} 17938c2ecf20Sopenharmony_ci 17948c2ecf20Sopenharmony_ci/* 17958c2ecf20Sopenharmony_ci * Free the swap entry like above, but also try to 17968c2ecf20Sopenharmony_ci * free the page cache entry if it is the last user. 17978c2ecf20Sopenharmony_ci */ 17988c2ecf20Sopenharmony_ciint free_swap_and_cache(swp_entry_t entry) 17998c2ecf20Sopenharmony_ci{ 18008c2ecf20Sopenharmony_ci struct swap_info_struct *p; 18018c2ecf20Sopenharmony_ci unsigned char count; 18028c2ecf20Sopenharmony_ci 18038c2ecf20Sopenharmony_ci if (non_swap_entry(entry)) 18048c2ecf20Sopenharmony_ci return 1; 18058c2ecf20Sopenharmony_ci 18068c2ecf20Sopenharmony_ci p = get_swap_device(entry); 18078c2ecf20Sopenharmony_ci if (p) { 18088c2ecf20Sopenharmony_ci if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) { 18098c2ecf20Sopenharmony_ci put_swap_device(p); 18108c2ecf20Sopenharmony_ci return 0; 18118c2ecf20Sopenharmony_ci } 18128c2ecf20Sopenharmony_ci 18138c2ecf20Sopenharmony_ci count = __swap_entry_free(p, entry); 18148c2ecf20Sopenharmony_ci if (count == SWAP_HAS_CACHE && 18158c2ecf20Sopenharmony_ci !swap_page_trans_huge_swapped(p, entry)) 18168c2ecf20Sopenharmony_ci __try_to_reclaim_swap(p, swp_offset(entry), 18178c2ecf20Sopenharmony_ci TTRS_UNMAPPED | TTRS_FULL); 18188c2ecf20Sopenharmony_ci put_swap_device(p); 18198c2ecf20Sopenharmony_ci } 18208c2ecf20Sopenharmony_ci return p != NULL; 18218c2ecf20Sopenharmony_ci} 18228c2ecf20Sopenharmony_ci 
18238c2ecf20Sopenharmony_ci#ifdef CONFIG_HIBERNATION 18248c2ecf20Sopenharmony_ci/* 18258c2ecf20Sopenharmony_ci * Find the swap type that corresponds to given device (if any). 18268c2ecf20Sopenharmony_ci * 18278c2ecf20Sopenharmony_ci * @offset - number of the PAGE_SIZE-sized block of the device, starting 18288c2ecf20Sopenharmony_ci * from 0, in which the swap header is expected to be located. 18298c2ecf20Sopenharmony_ci * 18308c2ecf20Sopenharmony_ci * This is needed for the suspend to disk (aka swsusp). 18318c2ecf20Sopenharmony_ci */ 18328c2ecf20Sopenharmony_ciint swap_type_of(dev_t device, sector_t offset) 18338c2ecf20Sopenharmony_ci{ 18348c2ecf20Sopenharmony_ci int type; 18358c2ecf20Sopenharmony_ci 18368c2ecf20Sopenharmony_ci if (!device) 18378c2ecf20Sopenharmony_ci return -1; 18388c2ecf20Sopenharmony_ci 18398c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 18408c2ecf20Sopenharmony_ci for (type = 0; type < nr_swapfiles; type++) { 18418c2ecf20Sopenharmony_ci struct swap_info_struct *sis = swap_info[type]; 18428c2ecf20Sopenharmony_ci 18438c2ecf20Sopenharmony_ci if (!(sis->flags & SWP_WRITEOK)) 18448c2ecf20Sopenharmony_ci continue; 18458c2ecf20Sopenharmony_ci 18468c2ecf20Sopenharmony_ci if (device == sis->bdev->bd_dev) { 18478c2ecf20Sopenharmony_ci struct swap_extent *se = first_se(sis); 18488c2ecf20Sopenharmony_ci 18498c2ecf20Sopenharmony_ci if (se->start_block == offset) { 18508c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 18518c2ecf20Sopenharmony_ci return type; 18528c2ecf20Sopenharmony_ci } 18538c2ecf20Sopenharmony_ci } 18548c2ecf20Sopenharmony_ci } 18558c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 18568c2ecf20Sopenharmony_ci return -ENODEV; 18578c2ecf20Sopenharmony_ci} 18588c2ecf20Sopenharmony_ci 18598c2ecf20Sopenharmony_ciint find_first_swap(dev_t *device) 18608c2ecf20Sopenharmony_ci{ 18618c2ecf20Sopenharmony_ci int type; 18628c2ecf20Sopenharmony_ci 18638c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 18648c2ecf20Sopenharmony_ci for (type = 0; type < 
nr_swapfiles; type++) { 18658c2ecf20Sopenharmony_ci struct swap_info_struct *sis = swap_info[type]; 18668c2ecf20Sopenharmony_ci 18678c2ecf20Sopenharmony_ci if (!(sis->flags & SWP_WRITEOK)) 18688c2ecf20Sopenharmony_ci continue; 18698c2ecf20Sopenharmony_ci *device = sis->bdev->bd_dev; 18708c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 18718c2ecf20Sopenharmony_ci return type; 18728c2ecf20Sopenharmony_ci } 18738c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 18748c2ecf20Sopenharmony_ci return -ENODEV; 18758c2ecf20Sopenharmony_ci} 18768c2ecf20Sopenharmony_ci 18778c2ecf20Sopenharmony_ci/* 18788c2ecf20Sopenharmony_ci * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev 18798c2ecf20Sopenharmony_ci * corresponding to given index in swap_info (swap type). 18808c2ecf20Sopenharmony_ci */ 18818c2ecf20Sopenharmony_cisector_t swapdev_block(int type, pgoff_t offset) 18828c2ecf20Sopenharmony_ci{ 18838c2ecf20Sopenharmony_ci struct block_device *bdev; 18848c2ecf20Sopenharmony_ci struct swap_info_struct *si = swap_type_to_swap_info(type); 18858c2ecf20Sopenharmony_ci 18868c2ecf20Sopenharmony_ci if (!si || !(si->flags & SWP_WRITEOK)) 18878c2ecf20Sopenharmony_ci return 0; 18888c2ecf20Sopenharmony_ci return map_swap_entry(swp_entry(type, offset), &bdev); 18898c2ecf20Sopenharmony_ci} 18908c2ecf20Sopenharmony_ci 18918c2ecf20Sopenharmony_ci/* 18928c2ecf20Sopenharmony_ci * Return either the total number of swap pages of given type, or the number 18938c2ecf20Sopenharmony_ci * of free pages of that type (depending on @free) 18948c2ecf20Sopenharmony_ci * 18958c2ecf20Sopenharmony_ci * This is needed for software suspend 18968c2ecf20Sopenharmony_ci */ 18978c2ecf20Sopenharmony_ciunsigned int count_swap_pages(int type, int free) 18988c2ecf20Sopenharmony_ci{ 18998c2ecf20Sopenharmony_ci unsigned int n = 0; 19008c2ecf20Sopenharmony_ci 19018c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 19028c2ecf20Sopenharmony_ci if ((unsigned int)type < nr_swapfiles) { 19038c2ecf20Sopenharmony_ci 
struct swap_info_struct *sis = swap_info[type]; 19048c2ecf20Sopenharmony_ci 19058c2ecf20Sopenharmony_ci spin_lock(&sis->lock); 19068c2ecf20Sopenharmony_ci if (sis->flags & SWP_WRITEOK) { 19078c2ecf20Sopenharmony_ci n = sis->pages; 19088c2ecf20Sopenharmony_ci if (free) 19098c2ecf20Sopenharmony_ci n -= sis->inuse_pages; 19108c2ecf20Sopenharmony_ci } 19118c2ecf20Sopenharmony_ci spin_unlock(&sis->lock); 19128c2ecf20Sopenharmony_ci } 19138c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 19148c2ecf20Sopenharmony_ci return n; 19158c2ecf20Sopenharmony_ci} 19168c2ecf20Sopenharmony_ci#endif /* CONFIG_HIBERNATION */ 19178c2ecf20Sopenharmony_ci 19188c2ecf20Sopenharmony_cistatic inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) 19198c2ecf20Sopenharmony_ci{ 19208c2ecf20Sopenharmony_ci return pte_same(pte_swp_clear_flags(pte), swp_pte); 19218c2ecf20Sopenharmony_ci} 19228c2ecf20Sopenharmony_ci 19238c2ecf20Sopenharmony_ci/* 19248c2ecf20Sopenharmony_ci * No need to decide whether this PTE shares the swap entry with others, 19258c2ecf20Sopenharmony_ci * just let do_wp_page work it out if a write is requested later - to 19268c2ecf20Sopenharmony_ci * force COW, vm_page_prot omits write permission from any private vma. 
19278c2ecf20Sopenharmony_ci */ 19288c2ecf20Sopenharmony_cistatic int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 19298c2ecf20Sopenharmony_ci unsigned long addr, swp_entry_t entry, struct page *page) 19308c2ecf20Sopenharmony_ci{ 19318c2ecf20Sopenharmony_ci struct page *swapcache; 19328c2ecf20Sopenharmony_ci spinlock_t *ptl; 19338c2ecf20Sopenharmony_ci pte_t *pte; 19348c2ecf20Sopenharmony_ci int ret = 1; 19358c2ecf20Sopenharmony_ci 19368c2ecf20Sopenharmony_ci swapcache = page; 19378c2ecf20Sopenharmony_ci page = ksm_might_need_to_copy(page, vma, addr); 19388c2ecf20Sopenharmony_ci if (unlikely(!page)) 19398c2ecf20Sopenharmony_ci return -ENOMEM; 19408c2ecf20Sopenharmony_ci 19418c2ecf20Sopenharmony_ci pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 19428c2ecf20Sopenharmony_ci if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { 19438c2ecf20Sopenharmony_ci ret = 0; 19448c2ecf20Sopenharmony_ci goto out; 19458c2ecf20Sopenharmony_ci } 19468c2ecf20Sopenharmony_ci 19478c2ecf20Sopenharmony_ci dec_mm_counter(vma->vm_mm, MM_SWAPENTS); 19488c2ecf20Sopenharmony_ci inc_mm_counter(vma->vm_mm, MM_ANONPAGES); 19498c2ecf20Sopenharmony_ci get_page(page); 19508c2ecf20Sopenharmony_ci set_pte_at(vma->vm_mm, addr, pte, 19518c2ecf20Sopenharmony_ci pte_mkold(mk_pte(page, vma->vm_page_prot))); 19528c2ecf20Sopenharmony_ci if (page == swapcache) { 19538c2ecf20Sopenharmony_ci page_add_anon_rmap(page, vma, addr, false); 19548c2ecf20Sopenharmony_ci } else { /* ksm created a completely new copy */ 19558c2ecf20Sopenharmony_ci page_add_new_anon_rmap(page, vma, addr, false); 19568c2ecf20Sopenharmony_ci lru_cache_add_inactive_or_unevictable(page, vma); 19578c2ecf20Sopenharmony_ci } 19588c2ecf20Sopenharmony_ci swap_free(entry); 19598c2ecf20Sopenharmony_ciout: 19608c2ecf20Sopenharmony_ci pte_unmap_unlock(pte, ptl); 19618c2ecf20Sopenharmony_ci if (page != swapcache) { 19628c2ecf20Sopenharmony_ci unlock_page(page); 19638c2ecf20Sopenharmony_ci put_page(page); 
19648c2ecf20Sopenharmony_ci } 19658c2ecf20Sopenharmony_ci return ret; 19668c2ecf20Sopenharmony_ci} 19678c2ecf20Sopenharmony_ci 19688c2ecf20Sopenharmony_cistatic int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 19698c2ecf20Sopenharmony_ci unsigned long addr, unsigned long end, 19708c2ecf20Sopenharmony_ci unsigned int type, bool frontswap, 19718c2ecf20Sopenharmony_ci unsigned long *fs_pages_to_unuse) 19728c2ecf20Sopenharmony_ci{ 19738c2ecf20Sopenharmony_ci struct page *page; 19748c2ecf20Sopenharmony_ci swp_entry_t entry; 19758c2ecf20Sopenharmony_ci pte_t *pte; 19768c2ecf20Sopenharmony_ci struct swap_info_struct *si; 19778c2ecf20Sopenharmony_ci unsigned long offset; 19788c2ecf20Sopenharmony_ci int ret = 0; 19798c2ecf20Sopenharmony_ci volatile unsigned char *swap_map; 19808c2ecf20Sopenharmony_ci 19818c2ecf20Sopenharmony_ci si = swap_info[type]; 19828c2ecf20Sopenharmony_ci pte = pte_offset_map(pmd, addr); 19838c2ecf20Sopenharmony_ci do { 19848c2ecf20Sopenharmony_ci struct vm_fault vmf; 19858c2ecf20Sopenharmony_ci 19868c2ecf20Sopenharmony_ci if (!is_swap_pte(*pte)) 19878c2ecf20Sopenharmony_ci continue; 19888c2ecf20Sopenharmony_ci 19898c2ecf20Sopenharmony_ci entry = pte_to_swp_entry(*pte); 19908c2ecf20Sopenharmony_ci if (swp_type(entry) != type) 19918c2ecf20Sopenharmony_ci continue; 19928c2ecf20Sopenharmony_ci 19938c2ecf20Sopenharmony_ci offset = swp_offset(entry); 19948c2ecf20Sopenharmony_ci if (frontswap && !frontswap_test(si, offset)) 19958c2ecf20Sopenharmony_ci continue; 19968c2ecf20Sopenharmony_ci 19978c2ecf20Sopenharmony_ci pte_unmap(pte); 19988c2ecf20Sopenharmony_ci swap_map = &si->swap_map[offset]; 19998c2ecf20Sopenharmony_ci page = lookup_swap_cache(entry, vma, addr); 20008c2ecf20Sopenharmony_ci if (!page) { 20018c2ecf20Sopenharmony_ci vmf.vma = vma; 20028c2ecf20Sopenharmony_ci vmf.address = addr; 20038c2ecf20Sopenharmony_ci vmf.pmd = pmd; 20048c2ecf20Sopenharmony_ci page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, 20058c2ecf20Sopenharmony_ci 
&vmf); 20068c2ecf20Sopenharmony_ci } 20078c2ecf20Sopenharmony_ci if (!page) { 20088c2ecf20Sopenharmony_ci if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) 20098c2ecf20Sopenharmony_ci goto try_next; 20108c2ecf20Sopenharmony_ci return -ENOMEM; 20118c2ecf20Sopenharmony_ci } 20128c2ecf20Sopenharmony_ci 20138c2ecf20Sopenharmony_ci lock_page(page); 20148c2ecf20Sopenharmony_ci wait_on_page_writeback(page); 20158c2ecf20Sopenharmony_ci ret = unuse_pte(vma, pmd, addr, entry, page); 20168c2ecf20Sopenharmony_ci if (ret < 0) { 20178c2ecf20Sopenharmony_ci unlock_page(page); 20188c2ecf20Sopenharmony_ci put_page(page); 20198c2ecf20Sopenharmony_ci goto out; 20208c2ecf20Sopenharmony_ci } 20218c2ecf20Sopenharmony_ci 20228c2ecf20Sopenharmony_ci try_to_free_swap(page); 20238c2ecf20Sopenharmony_ci unlock_page(page); 20248c2ecf20Sopenharmony_ci put_page(page); 20258c2ecf20Sopenharmony_ci 20268c2ecf20Sopenharmony_ci if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) { 20278c2ecf20Sopenharmony_ci ret = FRONTSWAP_PAGES_UNUSED; 20288c2ecf20Sopenharmony_ci goto out; 20298c2ecf20Sopenharmony_ci } 20308c2ecf20Sopenharmony_citry_next: 20318c2ecf20Sopenharmony_ci pte = pte_offset_map(pmd, addr); 20328c2ecf20Sopenharmony_ci } while (pte++, addr += PAGE_SIZE, addr != end); 20338c2ecf20Sopenharmony_ci pte_unmap(pte - 1); 20348c2ecf20Sopenharmony_ci 20358c2ecf20Sopenharmony_ci ret = 0; 20368c2ecf20Sopenharmony_ciout: 20378c2ecf20Sopenharmony_ci return ret; 20388c2ecf20Sopenharmony_ci} 20398c2ecf20Sopenharmony_ci 20408c2ecf20Sopenharmony_cistatic inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 20418c2ecf20Sopenharmony_ci unsigned long addr, unsigned long end, 20428c2ecf20Sopenharmony_ci unsigned int type, bool frontswap, 20438c2ecf20Sopenharmony_ci unsigned long *fs_pages_to_unuse) 20448c2ecf20Sopenharmony_ci{ 20458c2ecf20Sopenharmony_ci pmd_t *pmd; 20468c2ecf20Sopenharmony_ci unsigned long next; 20478c2ecf20Sopenharmony_ci int ret; 20488c2ecf20Sopenharmony_ci 
20498c2ecf20Sopenharmony_ci pmd = pmd_offset(pud, addr); 20508c2ecf20Sopenharmony_ci do { 20518c2ecf20Sopenharmony_ci cond_resched(); 20528c2ecf20Sopenharmony_ci next = pmd_addr_end(addr, end); 20538c2ecf20Sopenharmony_ci if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 20548c2ecf20Sopenharmony_ci continue; 20558c2ecf20Sopenharmony_ci ret = unuse_pte_range(vma, pmd, addr, next, type, 20568c2ecf20Sopenharmony_ci frontswap, fs_pages_to_unuse); 20578c2ecf20Sopenharmony_ci if (ret) 20588c2ecf20Sopenharmony_ci return ret; 20598c2ecf20Sopenharmony_ci } while (pmd++, addr = next, addr != end); 20608c2ecf20Sopenharmony_ci return 0; 20618c2ecf20Sopenharmony_ci} 20628c2ecf20Sopenharmony_ci 20638c2ecf20Sopenharmony_cistatic inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, 20648c2ecf20Sopenharmony_ci unsigned long addr, unsigned long end, 20658c2ecf20Sopenharmony_ci unsigned int type, bool frontswap, 20668c2ecf20Sopenharmony_ci unsigned long *fs_pages_to_unuse) 20678c2ecf20Sopenharmony_ci{ 20688c2ecf20Sopenharmony_ci pud_t *pud; 20698c2ecf20Sopenharmony_ci unsigned long next; 20708c2ecf20Sopenharmony_ci int ret; 20718c2ecf20Sopenharmony_ci 20728c2ecf20Sopenharmony_ci pud = pud_offset(p4d, addr); 20738c2ecf20Sopenharmony_ci do { 20748c2ecf20Sopenharmony_ci next = pud_addr_end(addr, end); 20758c2ecf20Sopenharmony_ci if (pud_none_or_clear_bad(pud)) 20768c2ecf20Sopenharmony_ci continue; 20778c2ecf20Sopenharmony_ci ret = unuse_pmd_range(vma, pud, addr, next, type, 20788c2ecf20Sopenharmony_ci frontswap, fs_pages_to_unuse); 20798c2ecf20Sopenharmony_ci if (ret) 20808c2ecf20Sopenharmony_ci return ret; 20818c2ecf20Sopenharmony_ci } while (pud++, addr = next, addr != end); 20828c2ecf20Sopenharmony_ci return 0; 20838c2ecf20Sopenharmony_ci} 20848c2ecf20Sopenharmony_ci 20858c2ecf20Sopenharmony_cistatic inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, 20868c2ecf20Sopenharmony_ci unsigned long addr, unsigned long end, 20878c2ecf20Sopenharmony_ci unsigned int 
type, bool frontswap, 20888c2ecf20Sopenharmony_ci unsigned long *fs_pages_to_unuse) 20898c2ecf20Sopenharmony_ci{ 20908c2ecf20Sopenharmony_ci p4d_t *p4d; 20918c2ecf20Sopenharmony_ci unsigned long next; 20928c2ecf20Sopenharmony_ci int ret; 20938c2ecf20Sopenharmony_ci 20948c2ecf20Sopenharmony_ci p4d = p4d_offset(pgd, addr); 20958c2ecf20Sopenharmony_ci do { 20968c2ecf20Sopenharmony_ci next = p4d_addr_end(addr, end); 20978c2ecf20Sopenharmony_ci if (p4d_none_or_clear_bad(p4d)) 20988c2ecf20Sopenharmony_ci continue; 20998c2ecf20Sopenharmony_ci ret = unuse_pud_range(vma, p4d, addr, next, type, 21008c2ecf20Sopenharmony_ci frontswap, fs_pages_to_unuse); 21018c2ecf20Sopenharmony_ci if (ret) 21028c2ecf20Sopenharmony_ci return ret; 21038c2ecf20Sopenharmony_ci } while (p4d++, addr = next, addr != end); 21048c2ecf20Sopenharmony_ci return 0; 21058c2ecf20Sopenharmony_ci} 21068c2ecf20Sopenharmony_ci 21078c2ecf20Sopenharmony_cistatic int unuse_vma(struct vm_area_struct *vma, unsigned int type, 21088c2ecf20Sopenharmony_ci bool frontswap, unsigned long *fs_pages_to_unuse) 21098c2ecf20Sopenharmony_ci{ 21108c2ecf20Sopenharmony_ci pgd_t *pgd; 21118c2ecf20Sopenharmony_ci unsigned long addr, end, next; 21128c2ecf20Sopenharmony_ci int ret; 21138c2ecf20Sopenharmony_ci 21148c2ecf20Sopenharmony_ci addr = vma->vm_start; 21158c2ecf20Sopenharmony_ci end = vma->vm_end; 21168c2ecf20Sopenharmony_ci 21178c2ecf20Sopenharmony_ci pgd = pgd_offset(vma->vm_mm, addr); 21188c2ecf20Sopenharmony_ci do { 21198c2ecf20Sopenharmony_ci next = pgd_addr_end(addr, end); 21208c2ecf20Sopenharmony_ci if (pgd_none_or_clear_bad(pgd)) 21218c2ecf20Sopenharmony_ci continue; 21228c2ecf20Sopenharmony_ci ret = unuse_p4d_range(vma, pgd, addr, next, type, 21238c2ecf20Sopenharmony_ci frontswap, fs_pages_to_unuse); 21248c2ecf20Sopenharmony_ci if (ret) 21258c2ecf20Sopenharmony_ci return ret; 21268c2ecf20Sopenharmony_ci } while (pgd++, addr = next, addr != end); 21278c2ecf20Sopenharmony_ci return 0; 21288c2ecf20Sopenharmony_ci} 
21298c2ecf20Sopenharmony_ci 21308c2ecf20Sopenharmony_cistatic int unuse_mm(struct mm_struct *mm, unsigned int type, 21318c2ecf20Sopenharmony_ci bool frontswap, unsigned long *fs_pages_to_unuse) 21328c2ecf20Sopenharmony_ci{ 21338c2ecf20Sopenharmony_ci struct vm_area_struct *vma; 21348c2ecf20Sopenharmony_ci int ret = 0; 21358c2ecf20Sopenharmony_ci 21368c2ecf20Sopenharmony_ci mmap_read_lock(mm); 21378c2ecf20Sopenharmony_ci for (vma = mm->mmap; vma; vma = vma->vm_next) { 21388c2ecf20Sopenharmony_ci if (vma->anon_vma) { 21398c2ecf20Sopenharmony_ci ret = unuse_vma(vma, type, frontswap, 21408c2ecf20Sopenharmony_ci fs_pages_to_unuse); 21418c2ecf20Sopenharmony_ci if (ret) 21428c2ecf20Sopenharmony_ci break; 21438c2ecf20Sopenharmony_ci } 21448c2ecf20Sopenharmony_ci cond_resched(); 21458c2ecf20Sopenharmony_ci } 21468c2ecf20Sopenharmony_ci mmap_read_unlock(mm); 21478c2ecf20Sopenharmony_ci return ret; 21488c2ecf20Sopenharmony_ci} 21498c2ecf20Sopenharmony_ci 21508c2ecf20Sopenharmony_ci/* 21518c2ecf20Sopenharmony_ci * Scan swap_map (or frontswap_map if frontswap parameter is true) 21528c2ecf20Sopenharmony_ci * from current position to next entry still in use. Return 0 21538c2ecf20Sopenharmony_ci * if there are no inuse entries after prev till end of the map. 21548c2ecf20Sopenharmony_ci */ 21558c2ecf20Sopenharmony_cistatic unsigned int find_next_to_unuse(struct swap_info_struct *si, 21568c2ecf20Sopenharmony_ci unsigned int prev, bool frontswap) 21578c2ecf20Sopenharmony_ci{ 21588c2ecf20Sopenharmony_ci unsigned int i; 21598c2ecf20Sopenharmony_ci unsigned char count; 21608c2ecf20Sopenharmony_ci 21618c2ecf20Sopenharmony_ci /* 21628c2ecf20Sopenharmony_ci * No need for swap_lock here: we're just looking 21638c2ecf20Sopenharmony_ci * for whether an entry is in use, not modifying it; false 21648c2ecf20Sopenharmony_ci * hits are okay, and sys_swapoff() has already prevented new 21658c2ecf20Sopenharmony_ci * allocations from this area (while holding swap_lock). 
21668c2ecf20Sopenharmony_ci */ 21678c2ecf20Sopenharmony_ci for (i = prev + 1; i < si->max; i++) { 21688c2ecf20Sopenharmony_ci count = READ_ONCE(si->swap_map[i]); 21698c2ecf20Sopenharmony_ci if (count && swap_count(count) != SWAP_MAP_BAD) 21708c2ecf20Sopenharmony_ci if (!frontswap || frontswap_test(si, i)) 21718c2ecf20Sopenharmony_ci break; 21728c2ecf20Sopenharmony_ci if ((i % LATENCY_LIMIT) == 0) 21738c2ecf20Sopenharmony_ci cond_resched(); 21748c2ecf20Sopenharmony_ci } 21758c2ecf20Sopenharmony_ci 21768c2ecf20Sopenharmony_ci if (i == si->max) 21778c2ecf20Sopenharmony_ci i = 0; 21788c2ecf20Sopenharmony_ci 21798c2ecf20Sopenharmony_ci return i; 21808c2ecf20Sopenharmony_ci} 21818c2ecf20Sopenharmony_ci 21828c2ecf20Sopenharmony_ci/* 21838c2ecf20Sopenharmony_ci * If the boolean frontswap is true, only unuse pages_to_unuse pages; 21848c2ecf20Sopenharmony_ci * pages_to_unuse==0 means all pages; ignored if frontswap is false 21858c2ecf20Sopenharmony_ci */ 21868c2ecf20Sopenharmony_ciint try_to_unuse(unsigned int type, bool frontswap, 21878c2ecf20Sopenharmony_ci unsigned long pages_to_unuse) 21888c2ecf20Sopenharmony_ci{ 21898c2ecf20Sopenharmony_ci struct mm_struct *prev_mm; 21908c2ecf20Sopenharmony_ci struct mm_struct *mm; 21918c2ecf20Sopenharmony_ci struct list_head *p; 21928c2ecf20Sopenharmony_ci int retval = 0; 21938c2ecf20Sopenharmony_ci struct swap_info_struct *si = swap_info[type]; 21948c2ecf20Sopenharmony_ci struct page *page; 21958c2ecf20Sopenharmony_ci swp_entry_t entry; 21968c2ecf20Sopenharmony_ci unsigned int i; 21978c2ecf20Sopenharmony_ci 21988c2ecf20Sopenharmony_ci if (!READ_ONCE(si->inuse_pages)) 21998c2ecf20Sopenharmony_ci return 0; 22008c2ecf20Sopenharmony_ci 22018c2ecf20Sopenharmony_ci if (!frontswap) 22028c2ecf20Sopenharmony_ci pages_to_unuse = 0; 22038c2ecf20Sopenharmony_ci 22048c2ecf20Sopenharmony_ciretry: 22058c2ecf20Sopenharmony_ci retval = shmem_unuse(type, frontswap, &pages_to_unuse); 22068c2ecf20Sopenharmony_ci if (retval) 22078c2ecf20Sopenharmony_ci 
goto out; 22088c2ecf20Sopenharmony_ci 22098c2ecf20Sopenharmony_ci prev_mm = &init_mm; 22108c2ecf20Sopenharmony_ci mmget(prev_mm); 22118c2ecf20Sopenharmony_ci 22128c2ecf20Sopenharmony_ci spin_lock(&mmlist_lock); 22138c2ecf20Sopenharmony_ci p = &init_mm.mmlist; 22148c2ecf20Sopenharmony_ci while (READ_ONCE(si->inuse_pages) && 22158c2ecf20Sopenharmony_ci !signal_pending(current) && 22168c2ecf20Sopenharmony_ci (p = p->next) != &init_mm.mmlist) { 22178c2ecf20Sopenharmony_ci 22188c2ecf20Sopenharmony_ci mm = list_entry(p, struct mm_struct, mmlist); 22198c2ecf20Sopenharmony_ci if (!mmget_not_zero(mm)) 22208c2ecf20Sopenharmony_ci continue; 22218c2ecf20Sopenharmony_ci spin_unlock(&mmlist_lock); 22228c2ecf20Sopenharmony_ci mmput(prev_mm); 22238c2ecf20Sopenharmony_ci prev_mm = mm; 22248c2ecf20Sopenharmony_ci retval = unuse_mm(mm, type, frontswap, &pages_to_unuse); 22258c2ecf20Sopenharmony_ci 22268c2ecf20Sopenharmony_ci if (retval) { 22278c2ecf20Sopenharmony_ci mmput(prev_mm); 22288c2ecf20Sopenharmony_ci goto out; 22298c2ecf20Sopenharmony_ci } 22308c2ecf20Sopenharmony_ci 22318c2ecf20Sopenharmony_ci /* 22328c2ecf20Sopenharmony_ci * Make sure that we aren't completely killing 22338c2ecf20Sopenharmony_ci * interactive performance. 
22348c2ecf20Sopenharmony_ci */ 22358c2ecf20Sopenharmony_ci cond_resched(); 22368c2ecf20Sopenharmony_ci spin_lock(&mmlist_lock); 22378c2ecf20Sopenharmony_ci } 22388c2ecf20Sopenharmony_ci spin_unlock(&mmlist_lock); 22398c2ecf20Sopenharmony_ci 22408c2ecf20Sopenharmony_ci mmput(prev_mm); 22418c2ecf20Sopenharmony_ci 22428c2ecf20Sopenharmony_ci i = 0; 22438c2ecf20Sopenharmony_ci while (READ_ONCE(si->inuse_pages) && 22448c2ecf20Sopenharmony_ci !signal_pending(current) && 22458c2ecf20Sopenharmony_ci (i = find_next_to_unuse(si, i, frontswap)) != 0) { 22468c2ecf20Sopenharmony_ci 22478c2ecf20Sopenharmony_ci entry = swp_entry(type, i); 22488c2ecf20Sopenharmony_ci page = find_get_page(swap_address_space(entry), i); 22498c2ecf20Sopenharmony_ci if (!page) 22508c2ecf20Sopenharmony_ci continue; 22518c2ecf20Sopenharmony_ci 22528c2ecf20Sopenharmony_ci /* 22538c2ecf20Sopenharmony_ci * It is conceivable that a racing task removed this page from 22548c2ecf20Sopenharmony_ci * swap cache just before we acquired the page lock. The page 22558c2ecf20Sopenharmony_ci * might even be back in swap cache on another swap area. But 22568c2ecf20Sopenharmony_ci * that is okay, try_to_free_swap() only removes stale pages. 22578c2ecf20Sopenharmony_ci */ 22588c2ecf20Sopenharmony_ci lock_page(page); 22598c2ecf20Sopenharmony_ci wait_on_page_writeback(page); 22608c2ecf20Sopenharmony_ci try_to_free_swap(page); 22618c2ecf20Sopenharmony_ci unlock_page(page); 22628c2ecf20Sopenharmony_ci put_page(page); 22638c2ecf20Sopenharmony_ci 22648c2ecf20Sopenharmony_ci /* 22658c2ecf20Sopenharmony_ci * For frontswap, we just need to unuse pages_to_unuse, if 22668c2ecf20Sopenharmony_ci * it was specified. Need not check frontswap again here as 22678c2ecf20Sopenharmony_ci * we already zeroed out pages_to_unuse if not frontswap. 
22688c2ecf20Sopenharmony_ci */ 22698c2ecf20Sopenharmony_ci if (pages_to_unuse && --pages_to_unuse == 0) 22708c2ecf20Sopenharmony_ci goto out; 22718c2ecf20Sopenharmony_ci } 22728c2ecf20Sopenharmony_ci 22738c2ecf20Sopenharmony_ci /* 22748c2ecf20Sopenharmony_ci * Lets check again to see if there are still swap entries in the map. 22758c2ecf20Sopenharmony_ci * If yes, we would need to do retry the unuse logic again. 22768c2ecf20Sopenharmony_ci * Under global memory pressure, swap entries can be reinserted back 22778c2ecf20Sopenharmony_ci * into process space after the mmlist loop above passes over them. 22788c2ecf20Sopenharmony_ci * 22798c2ecf20Sopenharmony_ci * Limit the number of retries? No: when mmget_not_zero() above fails, 22808c2ecf20Sopenharmony_ci * that mm is likely to be freeing swap from exit_mmap(), which proceeds 22818c2ecf20Sopenharmony_ci * at its own independent pace; and even shmem_writepage() could have 22828c2ecf20Sopenharmony_ci * been preempted after get_swap_page(), temporarily hiding that swap. 22838c2ecf20Sopenharmony_ci * It's easy and robust (though cpu-intensive) just to keep retrying. 22848c2ecf20Sopenharmony_ci */ 22858c2ecf20Sopenharmony_ci if (READ_ONCE(si->inuse_pages)) { 22868c2ecf20Sopenharmony_ci if (!signal_pending(current)) 22878c2ecf20Sopenharmony_ci goto retry; 22888c2ecf20Sopenharmony_ci retval = -EINTR; 22898c2ecf20Sopenharmony_ci } 22908c2ecf20Sopenharmony_ciout: 22918c2ecf20Sopenharmony_ci return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval; 22928c2ecf20Sopenharmony_ci} 22938c2ecf20Sopenharmony_ci 22948c2ecf20Sopenharmony_ci/* 22958c2ecf20Sopenharmony_ci * After a successful try_to_unuse, if no swap is now in use, we know 22968c2ecf20Sopenharmony_ci * we can empty the mmlist. swap_lock must be held on entry and exit. 22978c2ecf20Sopenharmony_ci * Note that mmlist_lock nests inside swap_lock, and an mm must be 22988c2ecf20Sopenharmony_ci * added to the mmlist just after page_duplicate - before would be racy. 
22998c2ecf20Sopenharmony_ci */ 23008c2ecf20Sopenharmony_cistatic void drain_mmlist(void) 23018c2ecf20Sopenharmony_ci{ 23028c2ecf20Sopenharmony_ci struct list_head *p, *next; 23038c2ecf20Sopenharmony_ci unsigned int type; 23048c2ecf20Sopenharmony_ci 23058c2ecf20Sopenharmony_ci for (type = 0; type < nr_swapfiles; type++) 23068c2ecf20Sopenharmony_ci if (swap_info[type]->inuse_pages) 23078c2ecf20Sopenharmony_ci return; 23088c2ecf20Sopenharmony_ci spin_lock(&mmlist_lock); 23098c2ecf20Sopenharmony_ci list_for_each_safe(p, next, &init_mm.mmlist) 23108c2ecf20Sopenharmony_ci list_del_init(p); 23118c2ecf20Sopenharmony_ci spin_unlock(&mmlist_lock); 23128c2ecf20Sopenharmony_ci} 23138c2ecf20Sopenharmony_ci 23148c2ecf20Sopenharmony_ci/* 23158c2ecf20Sopenharmony_ci * Use this swapdev's extent info to locate the (PAGE_SIZE) block which 23168c2ecf20Sopenharmony_ci * corresponds to page offset for the specified swap entry. 23178c2ecf20Sopenharmony_ci * Note that the type of this function is sector_t, but it returns page offset 23188c2ecf20Sopenharmony_ci * into the bdev, not sector offset. 23198c2ecf20Sopenharmony_ci */ 23208c2ecf20Sopenharmony_cistatic sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) 23218c2ecf20Sopenharmony_ci{ 23228c2ecf20Sopenharmony_ci struct swap_info_struct *sis; 23238c2ecf20Sopenharmony_ci struct swap_extent *se; 23248c2ecf20Sopenharmony_ci pgoff_t offset; 23258c2ecf20Sopenharmony_ci 23268c2ecf20Sopenharmony_ci sis = swp_swap_info(entry); 23278c2ecf20Sopenharmony_ci *bdev = sis->bdev; 23288c2ecf20Sopenharmony_ci 23298c2ecf20Sopenharmony_ci offset = swp_offset(entry); 23308c2ecf20Sopenharmony_ci se = offset_to_swap_extent(sis, offset); 23318c2ecf20Sopenharmony_ci return se->start_block + (offset - se->start_page); 23328c2ecf20Sopenharmony_ci} 23338c2ecf20Sopenharmony_ci 23348c2ecf20Sopenharmony_ci/* 23358c2ecf20Sopenharmony_ci * Returns the page offset into bdev for the specified page's swap entry. 
23368c2ecf20Sopenharmony_ci */ 23378c2ecf20Sopenharmony_cisector_t map_swap_page(struct page *page, struct block_device **bdev) 23388c2ecf20Sopenharmony_ci{ 23398c2ecf20Sopenharmony_ci swp_entry_t entry; 23408c2ecf20Sopenharmony_ci entry.val = page_private(page); 23418c2ecf20Sopenharmony_ci return map_swap_entry(entry, bdev); 23428c2ecf20Sopenharmony_ci} 23438c2ecf20Sopenharmony_ci 23448c2ecf20Sopenharmony_ci/* 23458c2ecf20Sopenharmony_ci * Free all of a swapdev's extent information 23468c2ecf20Sopenharmony_ci */ 23478c2ecf20Sopenharmony_cistatic void destroy_swap_extents(struct swap_info_struct *sis) 23488c2ecf20Sopenharmony_ci{ 23498c2ecf20Sopenharmony_ci while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { 23508c2ecf20Sopenharmony_ci struct rb_node *rb = sis->swap_extent_root.rb_node; 23518c2ecf20Sopenharmony_ci struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); 23528c2ecf20Sopenharmony_ci 23538c2ecf20Sopenharmony_ci rb_erase(rb, &sis->swap_extent_root); 23548c2ecf20Sopenharmony_ci kfree(se); 23558c2ecf20Sopenharmony_ci } 23568c2ecf20Sopenharmony_ci 23578c2ecf20Sopenharmony_ci if (sis->flags & SWP_ACTIVATED) { 23588c2ecf20Sopenharmony_ci struct file *swap_file = sis->swap_file; 23598c2ecf20Sopenharmony_ci struct address_space *mapping = swap_file->f_mapping; 23608c2ecf20Sopenharmony_ci 23618c2ecf20Sopenharmony_ci sis->flags &= ~SWP_ACTIVATED; 23628c2ecf20Sopenharmony_ci if (mapping->a_ops->swap_deactivate) 23638c2ecf20Sopenharmony_ci mapping->a_ops->swap_deactivate(swap_file); 23648c2ecf20Sopenharmony_ci } 23658c2ecf20Sopenharmony_ci} 23668c2ecf20Sopenharmony_ci 23678c2ecf20Sopenharmony_ci/* 23688c2ecf20Sopenharmony_ci * Add a block range (and the corresponding page range) into this swapdev's 23698c2ecf20Sopenharmony_ci * extent tree. 23708c2ecf20Sopenharmony_ci * 23718c2ecf20Sopenharmony_ci * This function rather assumes that it is called in ascending page order. 
23728c2ecf20Sopenharmony_ci */ 23738c2ecf20Sopenharmony_ciint 23748c2ecf20Sopenharmony_ciadd_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 23758c2ecf20Sopenharmony_ci unsigned long nr_pages, sector_t start_block) 23768c2ecf20Sopenharmony_ci{ 23778c2ecf20Sopenharmony_ci struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; 23788c2ecf20Sopenharmony_ci struct swap_extent *se; 23798c2ecf20Sopenharmony_ci struct swap_extent *new_se; 23808c2ecf20Sopenharmony_ci 23818c2ecf20Sopenharmony_ci /* 23828c2ecf20Sopenharmony_ci * place the new node at the right most since the 23838c2ecf20Sopenharmony_ci * function is called in ascending page order. 23848c2ecf20Sopenharmony_ci */ 23858c2ecf20Sopenharmony_ci while (*link) { 23868c2ecf20Sopenharmony_ci parent = *link; 23878c2ecf20Sopenharmony_ci link = &parent->rb_right; 23888c2ecf20Sopenharmony_ci } 23898c2ecf20Sopenharmony_ci 23908c2ecf20Sopenharmony_ci if (parent) { 23918c2ecf20Sopenharmony_ci se = rb_entry(parent, struct swap_extent, rb_node); 23928c2ecf20Sopenharmony_ci BUG_ON(se->start_page + se->nr_pages != start_page); 23938c2ecf20Sopenharmony_ci if (se->start_block + se->nr_pages == start_block) { 23948c2ecf20Sopenharmony_ci /* Merge it */ 23958c2ecf20Sopenharmony_ci se->nr_pages += nr_pages; 23968c2ecf20Sopenharmony_ci return 0; 23978c2ecf20Sopenharmony_ci } 23988c2ecf20Sopenharmony_ci } 23998c2ecf20Sopenharmony_ci 24008c2ecf20Sopenharmony_ci /* No merge, insert a new extent. 
*/ 24018c2ecf20Sopenharmony_ci new_se = kmalloc(sizeof(*se), GFP_KERNEL); 24028c2ecf20Sopenharmony_ci if (new_se == NULL) 24038c2ecf20Sopenharmony_ci return -ENOMEM; 24048c2ecf20Sopenharmony_ci new_se->start_page = start_page; 24058c2ecf20Sopenharmony_ci new_se->nr_pages = nr_pages; 24068c2ecf20Sopenharmony_ci new_se->start_block = start_block; 24078c2ecf20Sopenharmony_ci 24088c2ecf20Sopenharmony_ci rb_link_node(&new_se->rb_node, parent, link); 24098c2ecf20Sopenharmony_ci rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); 24108c2ecf20Sopenharmony_ci return 1; 24118c2ecf20Sopenharmony_ci} 24128c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(add_swap_extent); 24138c2ecf20Sopenharmony_ci 24148c2ecf20Sopenharmony_ci/* 24158c2ecf20Sopenharmony_ci * A `swap extent' is a simple thing which maps a contiguous range of pages 24168c2ecf20Sopenharmony_ci * onto a contiguous range of disk blocks. An ordered list of swap extents 24178c2ecf20Sopenharmony_ci * is built at swapon time and is then used at swap_writepage/swap_readpage 24188c2ecf20Sopenharmony_ci * time for locating where on disk a page belongs. 24198c2ecf20Sopenharmony_ci * 24208c2ecf20Sopenharmony_ci * If the swapfile is an S_ISBLK block device, a single extent is installed. 24218c2ecf20Sopenharmony_ci * This is done so that the main operating code can treat S_ISBLK and S_ISREG 24228c2ecf20Sopenharmony_ci * swap files identically. 24238c2ecf20Sopenharmony_ci * 24248c2ecf20Sopenharmony_ci * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap 24258c2ecf20Sopenharmony_ci * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK 24268c2ecf20Sopenharmony_ci * swapfiles are handled *identically* after swapon time. 24278c2ecf20Sopenharmony_ci * 24288c2ecf20Sopenharmony_ci * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks 24298c2ecf20Sopenharmony_ci * and will parse them into an ordered extent list, in PAGE_SIZE chunks. 
 * If some stray blocks are found which do not fall within the PAGE_SIZE
 * alignment requirements, they are simply tossed out - we will never use
 * those blocks for swapping.
 *
 * For all swap devices we set S_SWAPFILE across the life of the swapon.  This
 * prevents users from writing to the swap device, which would corrupt memory.
 *
 * The amount of disk space which a single swap extent represents varies.
 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
 * extents.  They are kept in the rbtree rooted at sis->swap_extent_root
 * (see add_swap_extent()).  NOTE: this comment historically described a
 * list walk that cached the previous search location in `curr_swap_extent'
 * and started new searches from there; with that scheme the average number
 * of iterations in map_swap_page() was measured at about 0.3 per page.
 * - akpm.
24438c2ecf20Sopenharmony_ci */ 24448c2ecf20Sopenharmony_cistatic int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 24458c2ecf20Sopenharmony_ci{ 24468c2ecf20Sopenharmony_ci struct file *swap_file = sis->swap_file; 24478c2ecf20Sopenharmony_ci struct address_space *mapping = swap_file->f_mapping; 24488c2ecf20Sopenharmony_ci struct inode *inode = mapping->host; 24498c2ecf20Sopenharmony_ci int ret; 24508c2ecf20Sopenharmony_ci 24518c2ecf20Sopenharmony_ci if (S_ISBLK(inode->i_mode)) { 24528c2ecf20Sopenharmony_ci ret = add_swap_extent(sis, 0, sis->max, 0); 24538c2ecf20Sopenharmony_ci *span = sis->pages; 24548c2ecf20Sopenharmony_ci return ret; 24558c2ecf20Sopenharmony_ci } 24568c2ecf20Sopenharmony_ci 24578c2ecf20Sopenharmony_ci if (mapping->a_ops->swap_activate) { 24588c2ecf20Sopenharmony_ci ret = mapping->a_ops->swap_activate(sis, swap_file, span); 24598c2ecf20Sopenharmony_ci if (ret >= 0) 24608c2ecf20Sopenharmony_ci sis->flags |= SWP_ACTIVATED; 24618c2ecf20Sopenharmony_ci if (!ret) { 24628c2ecf20Sopenharmony_ci sis->flags |= SWP_FS_OPS; 24638c2ecf20Sopenharmony_ci ret = add_swap_extent(sis, 0, sis->max, 0); 24648c2ecf20Sopenharmony_ci *span = sis->pages; 24658c2ecf20Sopenharmony_ci } 24668c2ecf20Sopenharmony_ci return ret; 24678c2ecf20Sopenharmony_ci } 24688c2ecf20Sopenharmony_ci 24698c2ecf20Sopenharmony_ci return generic_swapfile_activate(sis, swap_file, span); 24708c2ecf20Sopenharmony_ci} 24718c2ecf20Sopenharmony_ci 24728c2ecf20Sopenharmony_cistatic int swap_node(struct swap_info_struct *p) 24738c2ecf20Sopenharmony_ci{ 24748c2ecf20Sopenharmony_ci struct block_device *bdev; 24758c2ecf20Sopenharmony_ci 24768c2ecf20Sopenharmony_ci if (p->bdev) 24778c2ecf20Sopenharmony_ci bdev = p->bdev; 24788c2ecf20Sopenharmony_ci else 24798c2ecf20Sopenharmony_ci bdev = p->swap_file->f_inode->i_sb->s_bdev; 24808c2ecf20Sopenharmony_ci 24818c2ecf20Sopenharmony_ci return bdev ? 
bdev->bd_disk->node_id : NUMA_NO_NODE; 24828c2ecf20Sopenharmony_ci} 24838c2ecf20Sopenharmony_ci 24848c2ecf20Sopenharmony_cistatic void setup_swap_info(struct swap_info_struct *p, int prio, 24858c2ecf20Sopenharmony_ci unsigned char *swap_map, 24868c2ecf20Sopenharmony_ci struct swap_cluster_info *cluster_info) 24878c2ecf20Sopenharmony_ci{ 24888c2ecf20Sopenharmony_ci int i; 24898c2ecf20Sopenharmony_ci 24908c2ecf20Sopenharmony_ci if (prio >= 0) 24918c2ecf20Sopenharmony_ci p->prio = prio; 24928c2ecf20Sopenharmony_ci else 24938c2ecf20Sopenharmony_ci p->prio = --least_priority; 24948c2ecf20Sopenharmony_ci /* 24958c2ecf20Sopenharmony_ci * the plist prio is negated because plist ordering is 24968c2ecf20Sopenharmony_ci * low-to-high, while swap ordering is high-to-low 24978c2ecf20Sopenharmony_ci */ 24988c2ecf20Sopenharmony_ci p->list.prio = -p->prio; 24998c2ecf20Sopenharmony_ci for_each_node(i) { 25008c2ecf20Sopenharmony_ci if (p->prio >= 0) 25018c2ecf20Sopenharmony_ci p->avail_lists[i].prio = -p->prio; 25028c2ecf20Sopenharmony_ci else { 25038c2ecf20Sopenharmony_ci if (swap_node(p) == i) 25048c2ecf20Sopenharmony_ci p->avail_lists[i].prio = 1; 25058c2ecf20Sopenharmony_ci else 25068c2ecf20Sopenharmony_ci p->avail_lists[i].prio = -p->prio; 25078c2ecf20Sopenharmony_ci } 25088c2ecf20Sopenharmony_ci } 25098c2ecf20Sopenharmony_ci p->swap_map = swap_map; 25108c2ecf20Sopenharmony_ci p->cluster_info = cluster_info; 25118c2ecf20Sopenharmony_ci} 25128c2ecf20Sopenharmony_ci 25138c2ecf20Sopenharmony_cistatic void _enable_swap_info(struct swap_info_struct *p) 25148c2ecf20Sopenharmony_ci{ 25158c2ecf20Sopenharmony_ci p->flags |= SWP_WRITEOK | SWP_VALID; 25168c2ecf20Sopenharmony_ci atomic_long_add(p->pages, &nr_swap_pages); 25178c2ecf20Sopenharmony_ci total_swap_pages += p->pages; 25188c2ecf20Sopenharmony_ci 25198c2ecf20Sopenharmony_ci assert_spin_locked(&swap_lock); 25208c2ecf20Sopenharmony_ci /* 25218c2ecf20Sopenharmony_ci * both lists are plists, and thus priority ordered. 
25228c2ecf20Sopenharmony_ci * swap_active_head needs to be priority ordered for swapoff(), 25238c2ecf20Sopenharmony_ci * which on removal of any swap_info_struct with an auto-assigned 25248c2ecf20Sopenharmony_ci * (i.e. negative) priority increments the auto-assigned priority 25258c2ecf20Sopenharmony_ci * of any lower-priority swap_info_structs. 25268c2ecf20Sopenharmony_ci * swap_avail_head needs to be priority ordered for get_swap_page(), 25278c2ecf20Sopenharmony_ci * which allocates swap pages from the highest available priority 25288c2ecf20Sopenharmony_ci * swap_info_struct. 25298c2ecf20Sopenharmony_ci */ 25308c2ecf20Sopenharmony_ci plist_add(&p->list, &swap_active_head); 25318c2ecf20Sopenharmony_ci add_to_avail_list(p); 25328c2ecf20Sopenharmony_ci} 25338c2ecf20Sopenharmony_ci 25348c2ecf20Sopenharmony_cistatic void enable_swap_info(struct swap_info_struct *p, int prio, 25358c2ecf20Sopenharmony_ci unsigned char *swap_map, 25368c2ecf20Sopenharmony_ci struct swap_cluster_info *cluster_info, 25378c2ecf20Sopenharmony_ci unsigned long *frontswap_map) 25388c2ecf20Sopenharmony_ci{ 25398c2ecf20Sopenharmony_ci frontswap_init(p->type, frontswap_map); 25408c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 25418c2ecf20Sopenharmony_ci spin_lock(&p->lock); 25428c2ecf20Sopenharmony_ci setup_swap_info(p, prio, swap_map, cluster_info); 25438c2ecf20Sopenharmony_ci spin_unlock(&p->lock); 25448c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 25458c2ecf20Sopenharmony_ci /* 25468c2ecf20Sopenharmony_ci * Guarantee swap_map, cluster_info, etc. 
fields are valid 25478c2ecf20Sopenharmony_ci * between get/put_swap_device() if SWP_VALID bit is set 25488c2ecf20Sopenharmony_ci */ 25498c2ecf20Sopenharmony_ci synchronize_rcu(); 25508c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 25518c2ecf20Sopenharmony_ci spin_lock(&p->lock); 25528c2ecf20Sopenharmony_ci _enable_swap_info(p); 25538c2ecf20Sopenharmony_ci spin_unlock(&p->lock); 25548c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 25558c2ecf20Sopenharmony_ci} 25568c2ecf20Sopenharmony_ci 25578c2ecf20Sopenharmony_cistatic void reinsert_swap_info(struct swap_info_struct *p) 25588c2ecf20Sopenharmony_ci{ 25598c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 25608c2ecf20Sopenharmony_ci spin_lock(&p->lock); 25618c2ecf20Sopenharmony_ci setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); 25628c2ecf20Sopenharmony_ci _enable_swap_info(p); 25638c2ecf20Sopenharmony_ci spin_unlock(&p->lock); 25648c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 25658c2ecf20Sopenharmony_ci} 25668c2ecf20Sopenharmony_ci 25678c2ecf20Sopenharmony_cibool has_usable_swap(void) 25688c2ecf20Sopenharmony_ci{ 25698c2ecf20Sopenharmony_ci bool ret = true; 25708c2ecf20Sopenharmony_ci 25718c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 25728c2ecf20Sopenharmony_ci if (plist_head_empty(&swap_active_head)) 25738c2ecf20Sopenharmony_ci ret = false; 25748c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 25758c2ecf20Sopenharmony_ci return ret; 25768c2ecf20Sopenharmony_ci} 25778c2ecf20Sopenharmony_ci 25788c2ecf20Sopenharmony_ciSYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 25798c2ecf20Sopenharmony_ci{ 25808c2ecf20Sopenharmony_ci struct swap_info_struct *p = NULL; 25818c2ecf20Sopenharmony_ci unsigned char *swap_map; 25828c2ecf20Sopenharmony_ci struct swap_cluster_info *cluster_info; 25838c2ecf20Sopenharmony_ci unsigned long *frontswap_map; 25848c2ecf20Sopenharmony_ci struct file *swap_file, *victim; 25858c2ecf20Sopenharmony_ci struct address_space *mapping; 25868c2ecf20Sopenharmony_ci struct inode *inode; 
25878c2ecf20Sopenharmony_ci struct filename *pathname; 25888c2ecf20Sopenharmony_ci int err, found = 0; 25898c2ecf20Sopenharmony_ci unsigned int old_block_size; 25908c2ecf20Sopenharmony_ci 25918c2ecf20Sopenharmony_ci if (!capable(CAP_SYS_ADMIN)) 25928c2ecf20Sopenharmony_ci return -EPERM; 25938c2ecf20Sopenharmony_ci 25948c2ecf20Sopenharmony_ci BUG_ON(!current->mm); 25958c2ecf20Sopenharmony_ci 25968c2ecf20Sopenharmony_ci pathname = getname(specialfile); 25978c2ecf20Sopenharmony_ci if (IS_ERR(pathname)) 25988c2ecf20Sopenharmony_ci return PTR_ERR(pathname); 25998c2ecf20Sopenharmony_ci 26008c2ecf20Sopenharmony_ci victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); 26018c2ecf20Sopenharmony_ci err = PTR_ERR(victim); 26028c2ecf20Sopenharmony_ci if (IS_ERR(victim)) 26038c2ecf20Sopenharmony_ci goto out; 26048c2ecf20Sopenharmony_ci 26058c2ecf20Sopenharmony_ci mapping = victim->f_mapping; 26068c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 26078c2ecf20Sopenharmony_ci plist_for_each_entry(p, &swap_active_head, list) { 26088c2ecf20Sopenharmony_ci if (p->flags & SWP_WRITEOK) { 26098c2ecf20Sopenharmony_ci if (p->swap_file->f_mapping == mapping) { 26108c2ecf20Sopenharmony_ci found = 1; 26118c2ecf20Sopenharmony_ci break; 26128c2ecf20Sopenharmony_ci } 26138c2ecf20Sopenharmony_ci } 26148c2ecf20Sopenharmony_ci } 26158c2ecf20Sopenharmony_ci if (!found) { 26168c2ecf20Sopenharmony_ci err = -EINVAL; 26178c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 26188c2ecf20Sopenharmony_ci goto out_dput; 26198c2ecf20Sopenharmony_ci } 26208c2ecf20Sopenharmony_ci if (!security_vm_enough_memory_mm(current->mm, p->pages)) 26218c2ecf20Sopenharmony_ci vm_unacct_memory(p->pages); 26228c2ecf20Sopenharmony_ci else { 26238c2ecf20Sopenharmony_ci err = -ENOMEM; 26248c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 26258c2ecf20Sopenharmony_ci goto out_dput; 26268c2ecf20Sopenharmony_ci } 26278c2ecf20Sopenharmony_ci spin_lock(&p->lock); 26288c2ecf20Sopenharmony_ci del_from_avail_list(p); 
26298c2ecf20Sopenharmony_ci if (p->prio < 0) { 26308c2ecf20Sopenharmony_ci struct swap_info_struct *si = p; 26318c2ecf20Sopenharmony_ci int nid; 26328c2ecf20Sopenharmony_ci 26338c2ecf20Sopenharmony_ci plist_for_each_entry_continue(si, &swap_active_head, list) { 26348c2ecf20Sopenharmony_ci si->prio++; 26358c2ecf20Sopenharmony_ci si->list.prio--; 26368c2ecf20Sopenharmony_ci for_each_node(nid) { 26378c2ecf20Sopenharmony_ci if (si->avail_lists[nid].prio != 1) 26388c2ecf20Sopenharmony_ci si->avail_lists[nid].prio--; 26398c2ecf20Sopenharmony_ci } 26408c2ecf20Sopenharmony_ci } 26418c2ecf20Sopenharmony_ci least_priority++; 26428c2ecf20Sopenharmony_ci } 26438c2ecf20Sopenharmony_ci plist_del(&p->list, &swap_active_head); 26448c2ecf20Sopenharmony_ci atomic_long_sub(p->pages, &nr_swap_pages); 26458c2ecf20Sopenharmony_ci total_swap_pages -= p->pages; 26468c2ecf20Sopenharmony_ci p->flags &= ~SWP_WRITEOK; 26478c2ecf20Sopenharmony_ci spin_unlock(&p->lock); 26488c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 26498c2ecf20Sopenharmony_ci 26508c2ecf20Sopenharmony_ci disable_swap_slots_cache_lock(); 26518c2ecf20Sopenharmony_ci 26528c2ecf20Sopenharmony_ci set_current_oom_origin(); 26538c2ecf20Sopenharmony_ci err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ 26548c2ecf20Sopenharmony_ci clear_current_oom_origin(); 26558c2ecf20Sopenharmony_ci 26568c2ecf20Sopenharmony_ci if (err) { 26578c2ecf20Sopenharmony_ci /* re-insert swap space back into swap_list */ 26588c2ecf20Sopenharmony_ci reinsert_swap_info(p); 26598c2ecf20Sopenharmony_ci reenable_swap_slots_cache_unlock(); 26608c2ecf20Sopenharmony_ci goto out_dput; 26618c2ecf20Sopenharmony_ci } 26628c2ecf20Sopenharmony_ci 26638c2ecf20Sopenharmony_ci reenable_swap_slots_cache_unlock(); 26648c2ecf20Sopenharmony_ci 26658c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 26668c2ecf20Sopenharmony_ci spin_lock(&p->lock); 26678c2ecf20Sopenharmony_ci p->flags &= ~SWP_VALID; /* mark swap device as invalid */ 26688c2ecf20Sopenharmony_ci 
spin_unlock(&p->lock); 26698c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 26708c2ecf20Sopenharmony_ci /* 26718c2ecf20Sopenharmony_ci * wait for swap operations protected by get/put_swap_device() 26728c2ecf20Sopenharmony_ci * to complete 26738c2ecf20Sopenharmony_ci */ 26748c2ecf20Sopenharmony_ci synchronize_rcu(); 26758c2ecf20Sopenharmony_ci 26768c2ecf20Sopenharmony_ci flush_work(&p->discard_work); 26778c2ecf20Sopenharmony_ci 26788c2ecf20Sopenharmony_ci destroy_swap_extents(p); 26798c2ecf20Sopenharmony_ci if (p->flags & SWP_CONTINUED) 26808c2ecf20Sopenharmony_ci free_swap_count_continuations(p); 26818c2ecf20Sopenharmony_ci 26828c2ecf20Sopenharmony_ci if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev))) 26838c2ecf20Sopenharmony_ci atomic_dec(&nr_rotate_swap); 26848c2ecf20Sopenharmony_ci 26858c2ecf20Sopenharmony_ci mutex_lock(&swapon_mutex); 26868c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 26878c2ecf20Sopenharmony_ci spin_lock(&p->lock); 26888c2ecf20Sopenharmony_ci drain_mmlist(); 26898c2ecf20Sopenharmony_ci 26908c2ecf20Sopenharmony_ci /* wait for anyone still in scan_swap_map */ 26918c2ecf20Sopenharmony_ci p->highest_bit = 0; /* cuts scans short */ 26928c2ecf20Sopenharmony_ci while (p->flags >= SWP_SCANNING) { 26938c2ecf20Sopenharmony_ci spin_unlock(&p->lock); 26948c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 26958c2ecf20Sopenharmony_ci schedule_timeout_uninterruptible(1); 26968c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 26978c2ecf20Sopenharmony_ci spin_lock(&p->lock); 26988c2ecf20Sopenharmony_ci } 26998c2ecf20Sopenharmony_ci 27008c2ecf20Sopenharmony_ci swap_file = p->swap_file; 27018c2ecf20Sopenharmony_ci old_block_size = p->old_block_size; 27028c2ecf20Sopenharmony_ci p->swap_file = NULL; 27038c2ecf20Sopenharmony_ci p->max = 0; 27048c2ecf20Sopenharmony_ci swap_map = p->swap_map; 27058c2ecf20Sopenharmony_ci p->swap_map = NULL; 27068c2ecf20Sopenharmony_ci cluster_info = p->cluster_info; 27078c2ecf20Sopenharmony_ci p->cluster_info = NULL; 
27088c2ecf20Sopenharmony_ci frontswap_map = frontswap_map_get(p); 27098c2ecf20Sopenharmony_ci spin_unlock(&p->lock); 27108c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 27118c2ecf20Sopenharmony_ci arch_swap_invalidate_area(p->type); 27128c2ecf20Sopenharmony_ci frontswap_invalidate_area(p->type); 27138c2ecf20Sopenharmony_ci frontswap_map_set(p, NULL); 27148c2ecf20Sopenharmony_ci mutex_unlock(&swapon_mutex); 27158c2ecf20Sopenharmony_ci free_percpu(p->percpu_cluster); 27168c2ecf20Sopenharmony_ci p->percpu_cluster = NULL; 27178c2ecf20Sopenharmony_ci free_percpu(p->cluster_next_cpu); 27188c2ecf20Sopenharmony_ci p->cluster_next_cpu = NULL; 27198c2ecf20Sopenharmony_ci vfree(swap_map); 27208c2ecf20Sopenharmony_ci kvfree(cluster_info); 27218c2ecf20Sopenharmony_ci kvfree(frontswap_map); 27228c2ecf20Sopenharmony_ci /* Destroy swap account information */ 27238c2ecf20Sopenharmony_ci swap_cgroup_swapoff(p->type); 27248c2ecf20Sopenharmony_ci exit_swap_address_space(p->type); 27258c2ecf20Sopenharmony_ci 27268c2ecf20Sopenharmony_ci inode = mapping->host; 27278c2ecf20Sopenharmony_ci if (S_ISBLK(inode->i_mode)) { 27288c2ecf20Sopenharmony_ci struct block_device *bdev = I_BDEV(inode); 27298c2ecf20Sopenharmony_ci 27308c2ecf20Sopenharmony_ci set_blocksize(bdev, old_block_size); 27318c2ecf20Sopenharmony_ci blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 27328c2ecf20Sopenharmony_ci } 27338c2ecf20Sopenharmony_ci 27348c2ecf20Sopenharmony_ci inode_lock(inode); 27358c2ecf20Sopenharmony_ci inode->i_flags &= ~S_SWAPFILE; 27368c2ecf20Sopenharmony_ci inode_unlock(inode); 27378c2ecf20Sopenharmony_ci filp_close(swap_file, NULL); 27388c2ecf20Sopenharmony_ci 27398c2ecf20Sopenharmony_ci /* 27408c2ecf20Sopenharmony_ci * Clear the SWP_USED flag after all resources are freed so that swapon 27418c2ecf20Sopenharmony_ci * can reuse this swap_info in alloc_swap_info() safely. It is ok to 27428c2ecf20Sopenharmony_ci * not hold p->lock after we cleared its SWP_WRITEOK. 
27438c2ecf20Sopenharmony_ci */ 27448c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 27458c2ecf20Sopenharmony_ci p->flags = 0; 27468c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 27478c2ecf20Sopenharmony_ci 27488c2ecf20Sopenharmony_ci err = 0; 27498c2ecf20Sopenharmony_ci atomic_inc(&proc_poll_event); 27508c2ecf20Sopenharmony_ci wake_up_interruptible(&proc_poll_wait); 27518c2ecf20Sopenharmony_ci 27528c2ecf20Sopenharmony_ciout_dput: 27538c2ecf20Sopenharmony_ci filp_close(victim, NULL); 27548c2ecf20Sopenharmony_ciout: 27558c2ecf20Sopenharmony_ci putname(pathname); 27568c2ecf20Sopenharmony_ci return err; 27578c2ecf20Sopenharmony_ci} 27588c2ecf20Sopenharmony_ci 27598c2ecf20Sopenharmony_ci#ifdef CONFIG_PROC_FS 27608c2ecf20Sopenharmony_cistatic __poll_t swaps_poll(struct file *file, poll_table *wait) 27618c2ecf20Sopenharmony_ci{ 27628c2ecf20Sopenharmony_ci struct seq_file *seq = file->private_data; 27638c2ecf20Sopenharmony_ci 27648c2ecf20Sopenharmony_ci poll_wait(file, &proc_poll_wait, wait); 27658c2ecf20Sopenharmony_ci 27668c2ecf20Sopenharmony_ci if (seq->poll_event != atomic_read(&proc_poll_event)) { 27678c2ecf20Sopenharmony_ci seq->poll_event = atomic_read(&proc_poll_event); 27688c2ecf20Sopenharmony_ci return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; 27698c2ecf20Sopenharmony_ci } 27708c2ecf20Sopenharmony_ci 27718c2ecf20Sopenharmony_ci return EPOLLIN | EPOLLRDNORM; 27728c2ecf20Sopenharmony_ci} 27738c2ecf20Sopenharmony_ci 27748c2ecf20Sopenharmony_ci/* iterator */ 27758c2ecf20Sopenharmony_cistatic void *swap_start(struct seq_file *swap, loff_t *pos) 27768c2ecf20Sopenharmony_ci{ 27778c2ecf20Sopenharmony_ci struct swap_info_struct *si; 27788c2ecf20Sopenharmony_ci int type; 27798c2ecf20Sopenharmony_ci loff_t l = *pos; 27808c2ecf20Sopenharmony_ci 27818c2ecf20Sopenharmony_ci mutex_lock(&swapon_mutex); 27828c2ecf20Sopenharmony_ci 27838c2ecf20Sopenharmony_ci if (!l) 27848c2ecf20Sopenharmony_ci return SEQ_START_TOKEN; 27858c2ecf20Sopenharmony_ci 27868c2ecf20Sopenharmony_ci for 
(type = 0; (si = swap_type_to_swap_info(type)); type++) { 27878c2ecf20Sopenharmony_ci if (!(si->flags & SWP_USED) || !si->swap_map) 27888c2ecf20Sopenharmony_ci continue; 27898c2ecf20Sopenharmony_ci if (!--l) 27908c2ecf20Sopenharmony_ci return si; 27918c2ecf20Sopenharmony_ci } 27928c2ecf20Sopenharmony_ci 27938c2ecf20Sopenharmony_ci return NULL; 27948c2ecf20Sopenharmony_ci} 27958c2ecf20Sopenharmony_ci 27968c2ecf20Sopenharmony_cistatic void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 27978c2ecf20Sopenharmony_ci{ 27988c2ecf20Sopenharmony_ci struct swap_info_struct *si = v; 27998c2ecf20Sopenharmony_ci int type; 28008c2ecf20Sopenharmony_ci 28018c2ecf20Sopenharmony_ci if (v == SEQ_START_TOKEN) 28028c2ecf20Sopenharmony_ci type = 0; 28038c2ecf20Sopenharmony_ci else 28048c2ecf20Sopenharmony_ci type = si->type + 1; 28058c2ecf20Sopenharmony_ci 28068c2ecf20Sopenharmony_ci ++(*pos); 28078c2ecf20Sopenharmony_ci for (; (si = swap_type_to_swap_info(type)); type++) { 28088c2ecf20Sopenharmony_ci if (!(si->flags & SWP_USED) || !si->swap_map) 28098c2ecf20Sopenharmony_ci continue; 28108c2ecf20Sopenharmony_ci return si; 28118c2ecf20Sopenharmony_ci } 28128c2ecf20Sopenharmony_ci 28138c2ecf20Sopenharmony_ci return NULL; 28148c2ecf20Sopenharmony_ci} 28158c2ecf20Sopenharmony_ci 28168c2ecf20Sopenharmony_cistatic void swap_stop(struct seq_file *swap, void *v) 28178c2ecf20Sopenharmony_ci{ 28188c2ecf20Sopenharmony_ci mutex_unlock(&swapon_mutex); 28198c2ecf20Sopenharmony_ci} 28208c2ecf20Sopenharmony_ci 28218c2ecf20Sopenharmony_cistatic int swap_show(struct seq_file *swap, void *v) 28228c2ecf20Sopenharmony_ci{ 28238c2ecf20Sopenharmony_ci struct swap_info_struct *si = v; 28248c2ecf20Sopenharmony_ci struct file *file; 28258c2ecf20Sopenharmony_ci int len; 28268c2ecf20Sopenharmony_ci unsigned int bytes, inuse; 28278c2ecf20Sopenharmony_ci 28288c2ecf20Sopenharmony_ci if (si == SEQ_START_TOKEN) { 28298c2ecf20Sopenharmony_ci seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); 
28308c2ecf20Sopenharmony_ci return 0; 28318c2ecf20Sopenharmony_ci } 28328c2ecf20Sopenharmony_ci 28338c2ecf20Sopenharmony_ci bytes = si->pages << (PAGE_SHIFT - 10); 28348c2ecf20Sopenharmony_ci inuse = si->inuse_pages << (PAGE_SHIFT - 10); 28358c2ecf20Sopenharmony_ci 28368c2ecf20Sopenharmony_ci file = si->swap_file; 28378c2ecf20Sopenharmony_ci len = seq_file_path(swap, file, " \t\n\\"); 28388c2ecf20Sopenharmony_ci seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n", 28398c2ecf20Sopenharmony_ci len < 40 ? 40 - len : 1, " ", 28408c2ecf20Sopenharmony_ci S_ISBLK(file_inode(file)->i_mode) ? 28418c2ecf20Sopenharmony_ci "partition" : "file\t", 28428c2ecf20Sopenharmony_ci bytes, bytes < 10000000 ? "\t" : "", 28438c2ecf20Sopenharmony_ci inuse, inuse < 10000000 ? "\t" : "", 28448c2ecf20Sopenharmony_ci si->prio); 28458c2ecf20Sopenharmony_ci return 0; 28468c2ecf20Sopenharmony_ci} 28478c2ecf20Sopenharmony_ci 28488c2ecf20Sopenharmony_cistatic const struct seq_operations swaps_op = { 28498c2ecf20Sopenharmony_ci .start = swap_start, 28508c2ecf20Sopenharmony_ci .next = swap_next, 28518c2ecf20Sopenharmony_ci .stop = swap_stop, 28528c2ecf20Sopenharmony_ci .show = swap_show 28538c2ecf20Sopenharmony_ci}; 28548c2ecf20Sopenharmony_ci 28558c2ecf20Sopenharmony_cistatic int swaps_open(struct inode *inode, struct file *file) 28568c2ecf20Sopenharmony_ci{ 28578c2ecf20Sopenharmony_ci struct seq_file *seq; 28588c2ecf20Sopenharmony_ci int ret; 28598c2ecf20Sopenharmony_ci 28608c2ecf20Sopenharmony_ci ret = seq_open(file, &swaps_op); 28618c2ecf20Sopenharmony_ci if (ret) 28628c2ecf20Sopenharmony_ci return ret; 28638c2ecf20Sopenharmony_ci 28648c2ecf20Sopenharmony_ci seq = file->private_data; 28658c2ecf20Sopenharmony_ci seq->poll_event = atomic_read(&proc_poll_event); 28668c2ecf20Sopenharmony_ci return 0; 28678c2ecf20Sopenharmony_ci} 28688c2ecf20Sopenharmony_ci 28698c2ecf20Sopenharmony_cistatic const struct proc_ops swaps_proc_ops = { 28708c2ecf20Sopenharmony_ci .proc_flags = PROC_ENTRY_PERMANENT, 
28718c2ecf20Sopenharmony_ci .proc_open = swaps_open, 28728c2ecf20Sopenharmony_ci .proc_read = seq_read, 28738c2ecf20Sopenharmony_ci .proc_lseek = seq_lseek, 28748c2ecf20Sopenharmony_ci .proc_release = seq_release, 28758c2ecf20Sopenharmony_ci .proc_poll = swaps_poll, 28768c2ecf20Sopenharmony_ci}; 28778c2ecf20Sopenharmony_ci 28788c2ecf20Sopenharmony_cistatic int __init procswaps_init(void) 28798c2ecf20Sopenharmony_ci{ 28808c2ecf20Sopenharmony_ci proc_create("swaps", 0, NULL, &swaps_proc_ops); 28818c2ecf20Sopenharmony_ci return 0; 28828c2ecf20Sopenharmony_ci} 28838c2ecf20Sopenharmony_ci__initcall(procswaps_init); 28848c2ecf20Sopenharmony_ci#endif /* CONFIG_PROC_FS */ 28858c2ecf20Sopenharmony_ci 28868c2ecf20Sopenharmony_ci#ifdef MAX_SWAPFILES_CHECK 28878c2ecf20Sopenharmony_cistatic int __init max_swapfiles_check(void) 28888c2ecf20Sopenharmony_ci{ 28898c2ecf20Sopenharmony_ci MAX_SWAPFILES_CHECK(); 28908c2ecf20Sopenharmony_ci return 0; 28918c2ecf20Sopenharmony_ci} 28928c2ecf20Sopenharmony_cilate_initcall(max_swapfiles_check); 28938c2ecf20Sopenharmony_ci#endif 28948c2ecf20Sopenharmony_ci 28958c2ecf20Sopenharmony_cistatic struct swap_info_struct *alloc_swap_info(void) 28968c2ecf20Sopenharmony_ci{ 28978c2ecf20Sopenharmony_ci struct swap_info_struct *p; 28988c2ecf20Sopenharmony_ci struct swap_info_struct *defer = NULL; 28998c2ecf20Sopenharmony_ci unsigned int type; 29008c2ecf20Sopenharmony_ci int i; 29018c2ecf20Sopenharmony_ci 29028c2ecf20Sopenharmony_ci p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); 29038c2ecf20Sopenharmony_ci if (!p) 29048c2ecf20Sopenharmony_ci return ERR_PTR(-ENOMEM); 29058c2ecf20Sopenharmony_ci 29068c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 29078c2ecf20Sopenharmony_ci for (type = 0; type < nr_swapfiles; type++) { 29088c2ecf20Sopenharmony_ci if (!(swap_info[type]->flags & SWP_USED)) 29098c2ecf20Sopenharmony_ci break; 29108c2ecf20Sopenharmony_ci } 29118c2ecf20Sopenharmony_ci if (type >= MAX_SWAPFILES) { 29128c2ecf20Sopenharmony_ci 
spin_unlock(&swap_lock); 29138c2ecf20Sopenharmony_ci kvfree(p); 29148c2ecf20Sopenharmony_ci return ERR_PTR(-EPERM); 29158c2ecf20Sopenharmony_ci } 29168c2ecf20Sopenharmony_ci if (type >= nr_swapfiles) { 29178c2ecf20Sopenharmony_ci p->type = type; 29188c2ecf20Sopenharmony_ci WRITE_ONCE(swap_info[type], p); 29198c2ecf20Sopenharmony_ci /* 29208c2ecf20Sopenharmony_ci * Write swap_info[type] before nr_swapfiles, in case a 29218c2ecf20Sopenharmony_ci * racing procfs swap_start() or swap_next() is reading them. 29228c2ecf20Sopenharmony_ci * (We never shrink nr_swapfiles, we never free this entry.) 29238c2ecf20Sopenharmony_ci */ 29248c2ecf20Sopenharmony_ci smp_wmb(); 29258c2ecf20Sopenharmony_ci WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1); 29268c2ecf20Sopenharmony_ci } else { 29278c2ecf20Sopenharmony_ci defer = p; 29288c2ecf20Sopenharmony_ci p = swap_info[type]; 29298c2ecf20Sopenharmony_ci /* 29308c2ecf20Sopenharmony_ci * Do not memset this entry: a racing procfs swap_next() 29318c2ecf20Sopenharmony_ci * would be relying on p->type to remain valid. 
29328c2ecf20Sopenharmony_ci */ 29338c2ecf20Sopenharmony_ci } 29348c2ecf20Sopenharmony_ci p->swap_extent_root = RB_ROOT; 29358c2ecf20Sopenharmony_ci plist_node_init(&p->list, 0); 29368c2ecf20Sopenharmony_ci for_each_node(i) 29378c2ecf20Sopenharmony_ci plist_node_init(&p->avail_lists[i], 0); 29388c2ecf20Sopenharmony_ci p->flags = SWP_USED; 29398c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 29408c2ecf20Sopenharmony_ci kvfree(defer); 29418c2ecf20Sopenharmony_ci spin_lock_init(&p->lock); 29428c2ecf20Sopenharmony_ci spin_lock_init(&p->cont_lock); 29438c2ecf20Sopenharmony_ci 29448c2ecf20Sopenharmony_ci return p; 29458c2ecf20Sopenharmony_ci} 29468c2ecf20Sopenharmony_ci 29478c2ecf20Sopenharmony_cistatic int claim_swapfile(struct swap_info_struct *p, struct inode *inode) 29488c2ecf20Sopenharmony_ci{ 29498c2ecf20Sopenharmony_ci int error; 29508c2ecf20Sopenharmony_ci 29518c2ecf20Sopenharmony_ci if (S_ISBLK(inode->i_mode)) { 29528c2ecf20Sopenharmony_ci p->bdev = blkdev_get_by_dev(inode->i_rdev, 29538c2ecf20Sopenharmony_ci FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); 29548c2ecf20Sopenharmony_ci if (IS_ERR(p->bdev)) { 29558c2ecf20Sopenharmony_ci error = PTR_ERR(p->bdev); 29568c2ecf20Sopenharmony_ci p->bdev = NULL; 29578c2ecf20Sopenharmony_ci return error; 29588c2ecf20Sopenharmony_ci } 29598c2ecf20Sopenharmony_ci p->old_block_size = block_size(p->bdev); 29608c2ecf20Sopenharmony_ci error = set_blocksize(p->bdev, PAGE_SIZE); 29618c2ecf20Sopenharmony_ci if (error < 0) 29628c2ecf20Sopenharmony_ci return error; 29638c2ecf20Sopenharmony_ci /* 29648c2ecf20Sopenharmony_ci * Zoned block devices contain zones that have a sequential 29658c2ecf20Sopenharmony_ci * write only restriction. Hence zoned block devices are not 29668c2ecf20Sopenharmony_ci * suitable for swapping. Disallow them here. 
29678c2ecf20Sopenharmony_ci */ 29688c2ecf20Sopenharmony_ci if (blk_queue_is_zoned(p->bdev->bd_disk->queue)) 29698c2ecf20Sopenharmony_ci return -EINVAL; 29708c2ecf20Sopenharmony_ci p->flags |= SWP_BLKDEV; 29718c2ecf20Sopenharmony_ci } else if (S_ISREG(inode->i_mode)) { 29728c2ecf20Sopenharmony_ci p->bdev = inode->i_sb->s_bdev; 29738c2ecf20Sopenharmony_ci } 29748c2ecf20Sopenharmony_ci 29758c2ecf20Sopenharmony_ci return 0; 29768c2ecf20Sopenharmony_ci} 29778c2ecf20Sopenharmony_ci 29788c2ecf20Sopenharmony_ci 29798c2ecf20Sopenharmony_ci/* 29808c2ecf20Sopenharmony_ci * Find out how many pages are allowed for a single swap device. There 29818c2ecf20Sopenharmony_ci * are two limiting factors: 29828c2ecf20Sopenharmony_ci * 1) the number of bits for the swap offset in the swp_entry_t type, and 29838c2ecf20Sopenharmony_ci * 2) the number of bits in the swap pte, as defined by the different 29848c2ecf20Sopenharmony_ci * architectures. 29858c2ecf20Sopenharmony_ci * 29868c2ecf20Sopenharmony_ci * In order to find the largest possible bit mask, a swap entry with 29878c2ecf20Sopenharmony_ci * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, 29888c2ecf20Sopenharmony_ci * decoded to a swp_entry_t again, and finally the swap offset is 29898c2ecf20Sopenharmony_ci * extracted. 29908c2ecf20Sopenharmony_ci * 29918c2ecf20Sopenharmony_ci * This will mask all the bits from the initial ~0UL mask that can't 29928c2ecf20Sopenharmony_ci * be encoded in either the swp_entry_t or the architecture definition 29938c2ecf20Sopenharmony_ci * of a swap pte. 29948c2ecf20Sopenharmony_ci */ 29958c2ecf20Sopenharmony_ciunsigned long generic_max_swapfile_size(void) 29968c2ecf20Sopenharmony_ci{ 29978c2ecf20Sopenharmony_ci return swp_offset(pte_to_swp_entry( 29988c2ecf20Sopenharmony_ci swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 29998c2ecf20Sopenharmony_ci} 30008c2ecf20Sopenharmony_ci 30018c2ecf20Sopenharmony_ci/* Can be overridden by an architecture for additional checks. 
*/ 30028c2ecf20Sopenharmony_ci__weak unsigned long max_swapfile_size(void) 30038c2ecf20Sopenharmony_ci{ 30048c2ecf20Sopenharmony_ci return generic_max_swapfile_size(); 30058c2ecf20Sopenharmony_ci} 30068c2ecf20Sopenharmony_ci 30078c2ecf20Sopenharmony_cistatic unsigned long read_swap_header(struct swap_info_struct *p, 30088c2ecf20Sopenharmony_ci union swap_header *swap_header, 30098c2ecf20Sopenharmony_ci struct inode *inode) 30108c2ecf20Sopenharmony_ci{ 30118c2ecf20Sopenharmony_ci int i; 30128c2ecf20Sopenharmony_ci unsigned long maxpages; 30138c2ecf20Sopenharmony_ci unsigned long swapfilepages; 30148c2ecf20Sopenharmony_ci unsigned long last_page; 30158c2ecf20Sopenharmony_ci 30168c2ecf20Sopenharmony_ci if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 30178c2ecf20Sopenharmony_ci pr_err("Unable to find swap-space signature\n"); 30188c2ecf20Sopenharmony_ci return 0; 30198c2ecf20Sopenharmony_ci } 30208c2ecf20Sopenharmony_ci 30218c2ecf20Sopenharmony_ci /* swap partition endianess hack... 
*/ 30228c2ecf20Sopenharmony_ci if (swab32(swap_header->info.version) == 1) { 30238c2ecf20Sopenharmony_ci swab32s(&swap_header->info.version); 30248c2ecf20Sopenharmony_ci swab32s(&swap_header->info.last_page); 30258c2ecf20Sopenharmony_ci swab32s(&swap_header->info.nr_badpages); 30268c2ecf20Sopenharmony_ci if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 30278c2ecf20Sopenharmony_ci return 0; 30288c2ecf20Sopenharmony_ci for (i = 0; i < swap_header->info.nr_badpages; i++) 30298c2ecf20Sopenharmony_ci swab32s(&swap_header->info.badpages[i]); 30308c2ecf20Sopenharmony_ci } 30318c2ecf20Sopenharmony_ci /* Check the swap header's sub-version */ 30328c2ecf20Sopenharmony_ci if (swap_header->info.version != 1) { 30338c2ecf20Sopenharmony_ci pr_warn("Unable to handle swap header version %d\n", 30348c2ecf20Sopenharmony_ci swap_header->info.version); 30358c2ecf20Sopenharmony_ci return 0; 30368c2ecf20Sopenharmony_ci } 30378c2ecf20Sopenharmony_ci 30388c2ecf20Sopenharmony_ci p->lowest_bit = 1; 30398c2ecf20Sopenharmony_ci p->cluster_next = 1; 30408c2ecf20Sopenharmony_ci p->cluster_nr = 0; 30418c2ecf20Sopenharmony_ci 30428c2ecf20Sopenharmony_ci maxpages = max_swapfile_size(); 30438c2ecf20Sopenharmony_ci last_page = swap_header->info.last_page; 30448c2ecf20Sopenharmony_ci if (!last_page) { 30458c2ecf20Sopenharmony_ci pr_warn("Empty swap-file\n"); 30468c2ecf20Sopenharmony_ci return 0; 30478c2ecf20Sopenharmony_ci } 30488c2ecf20Sopenharmony_ci if (last_page > maxpages) { 30498c2ecf20Sopenharmony_ci pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", 30508c2ecf20Sopenharmony_ci maxpages << (PAGE_SHIFT - 10), 30518c2ecf20Sopenharmony_ci last_page << (PAGE_SHIFT - 10)); 30528c2ecf20Sopenharmony_ci } 30538c2ecf20Sopenharmony_ci if (maxpages > last_page) { 30548c2ecf20Sopenharmony_ci maxpages = last_page + 1; 30558c2ecf20Sopenharmony_ci /* p->max is an unsigned int: don't overflow it */ 30568c2ecf20Sopenharmony_ci if ((unsigned int)maxpages == 0) 
30578c2ecf20Sopenharmony_ci maxpages = UINT_MAX; 30588c2ecf20Sopenharmony_ci } 30598c2ecf20Sopenharmony_ci p->highest_bit = maxpages - 1; 30608c2ecf20Sopenharmony_ci 30618c2ecf20Sopenharmony_ci if (!maxpages) 30628c2ecf20Sopenharmony_ci return 0; 30638c2ecf20Sopenharmony_ci swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 30648c2ecf20Sopenharmony_ci if (swapfilepages && maxpages > swapfilepages) { 30658c2ecf20Sopenharmony_ci pr_warn("Swap area shorter than signature indicates\n"); 30668c2ecf20Sopenharmony_ci return 0; 30678c2ecf20Sopenharmony_ci } 30688c2ecf20Sopenharmony_ci if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 30698c2ecf20Sopenharmony_ci return 0; 30708c2ecf20Sopenharmony_ci if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 30718c2ecf20Sopenharmony_ci return 0; 30728c2ecf20Sopenharmony_ci 30738c2ecf20Sopenharmony_ci return maxpages; 30748c2ecf20Sopenharmony_ci} 30758c2ecf20Sopenharmony_ci 30768c2ecf20Sopenharmony_ci#define SWAP_CLUSTER_INFO_COLS \ 30778c2ecf20Sopenharmony_ci DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) 30788c2ecf20Sopenharmony_ci#define SWAP_CLUSTER_SPACE_COLS \ 30798c2ecf20Sopenharmony_ci DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) 30808c2ecf20Sopenharmony_ci#define SWAP_CLUSTER_COLS \ 30818c2ecf20Sopenharmony_ci max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) 30828c2ecf20Sopenharmony_ci 30838c2ecf20Sopenharmony_cistatic int setup_swap_map_and_extents(struct swap_info_struct *p, 30848c2ecf20Sopenharmony_ci union swap_header *swap_header, 30858c2ecf20Sopenharmony_ci unsigned char *swap_map, 30868c2ecf20Sopenharmony_ci struct swap_cluster_info *cluster_info, 30878c2ecf20Sopenharmony_ci unsigned long maxpages, 30888c2ecf20Sopenharmony_ci sector_t *span) 30898c2ecf20Sopenharmony_ci{ 30908c2ecf20Sopenharmony_ci unsigned int j, k; 30918c2ecf20Sopenharmony_ci unsigned int nr_good_pages; 30928c2ecf20Sopenharmony_ci int nr_extents; 30938c2ecf20Sopenharmony_ci unsigned long 
nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); 30948c2ecf20Sopenharmony_ci unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; 30958c2ecf20Sopenharmony_ci unsigned long i, idx; 30968c2ecf20Sopenharmony_ci 30978c2ecf20Sopenharmony_ci nr_good_pages = maxpages - 1; /* omit header page */ 30988c2ecf20Sopenharmony_ci 30998c2ecf20Sopenharmony_ci cluster_list_init(&p->free_clusters); 31008c2ecf20Sopenharmony_ci cluster_list_init(&p->discard_clusters); 31018c2ecf20Sopenharmony_ci 31028c2ecf20Sopenharmony_ci for (i = 0; i < swap_header->info.nr_badpages; i++) { 31038c2ecf20Sopenharmony_ci unsigned int page_nr = swap_header->info.badpages[i]; 31048c2ecf20Sopenharmony_ci if (page_nr == 0 || page_nr > swap_header->info.last_page) 31058c2ecf20Sopenharmony_ci return -EINVAL; 31068c2ecf20Sopenharmony_ci if (page_nr < maxpages) { 31078c2ecf20Sopenharmony_ci swap_map[page_nr] = SWAP_MAP_BAD; 31088c2ecf20Sopenharmony_ci nr_good_pages--; 31098c2ecf20Sopenharmony_ci /* 31108c2ecf20Sopenharmony_ci * Haven't marked the cluster free yet, no list 31118c2ecf20Sopenharmony_ci * operation involved 31128c2ecf20Sopenharmony_ci */ 31138c2ecf20Sopenharmony_ci inc_cluster_info_page(p, cluster_info, page_nr); 31148c2ecf20Sopenharmony_ci } 31158c2ecf20Sopenharmony_ci } 31168c2ecf20Sopenharmony_ci 31178c2ecf20Sopenharmony_ci /* Haven't marked the cluster free yet, no list operation involved */ 31188c2ecf20Sopenharmony_ci for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) 31198c2ecf20Sopenharmony_ci inc_cluster_info_page(p, cluster_info, i); 31208c2ecf20Sopenharmony_ci 31218c2ecf20Sopenharmony_ci if (nr_good_pages) { 31228c2ecf20Sopenharmony_ci swap_map[0] = SWAP_MAP_BAD; 31238c2ecf20Sopenharmony_ci /* 31248c2ecf20Sopenharmony_ci * Not mark the cluster free yet, no list 31258c2ecf20Sopenharmony_ci * operation involved 31268c2ecf20Sopenharmony_ci */ 31278c2ecf20Sopenharmony_ci inc_cluster_info_page(p, cluster_info, 0); 31288c2ecf20Sopenharmony_ci p->max 
= maxpages; 31298c2ecf20Sopenharmony_ci p->pages = nr_good_pages; 31308c2ecf20Sopenharmony_ci nr_extents = setup_swap_extents(p, span); 31318c2ecf20Sopenharmony_ci if (nr_extents < 0) 31328c2ecf20Sopenharmony_ci return nr_extents; 31338c2ecf20Sopenharmony_ci nr_good_pages = p->pages; 31348c2ecf20Sopenharmony_ci } 31358c2ecf20Sopenharmony_ci if (!nr_good_pages) { 31368c2ecf20Sopenharmony_ci pr_warn("Empty swap-file\n"); 31378c2ecf20Sopenharmony_ci return -EINVAL; 31388c2ecf20Sopenharmony_ci } 31398c2ecf20Sopenharmony_ci 31408c2ecf20Sopenharmony_ci if (!cluster_info) 31418c2ecf20Sopenharmony_ci return nr_extents; 31428c2ecf20Sopenharmony_ci 31438c2ecf20Sopenharmony_ci 31448c2ecf20Sopenharmony_ci /* 31458c2ecf20Sopenharmony_ci * Reduce false cache line sharing between cluster_info and 31468c2ecf20Sopenharmony_ci * sharing same address space. 31478c2ecf20Sopenharmony_ci */ 31488c2ecf20Sopenharmony_ci for (k = 0; k < SWAP_CLUSTER_COLS; k++) { 31498c2ecf20Sopenharmony_ci j = (k + col) % SWAP_CLUSTER_COLS; 31508c2ecf20Sopenharmony_ci for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { 31518c2ecf20Sopenharmony_ci idx = i * SWAP_CLUSTER_COLS + j; 31528c2ecf20Sopenharmony_ci if (idx >= nr_clusters) 31538c2ecf20Sopenharmony_ci continue; 31548c2ecf20Sopenharmony_ci if (cluster_count(&cluster_info[idx])) 31558c2ecf20Sopenharmony_ci continue; 31568c2ecf20Sopenharmony_ci cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); 31578c2ecf20Sopenharmony_ci cluster_list_add_tail(&p->free_clusters, cluster_info, 31588c2ecf20Sopenharmony_ci idx); 31598c2ecf20Sopenharmony_ci } 31608c2ecf20Sopenharmony_ci } 31618c2ecf20Sopenharmony_ci return nr_extents; 31628c2ecf20Sopenharmony_ci} 31638c2ecf20Sopenharmony_ci 31648c2ecf20Sopenharmony_ci/* 31658c2ecf20Sopenharmony_ci * Helper to sys_swapon determining if a given swap 31668c2ecf20Sopenharmony_ci * backing device queue supports DISCARD operations. 
31678c2ecf20Sopenharmony_ci */ 31688c2ecf20Sopenharmony_cistatic bool swap_discardable(struct swap_info_struct *si) 31698c2ecf20Sopenharmony_ci{ 31708c2ecf20Sopenharmony_ci struct request_queue *q = bdev_get_queue(si->bdev); 31718c2ecf20Sopenharmony_ci 31728c2ecf20Sopenharmony_ci if (!q || !blk_queue_discard(q)) 31738c2ecf20Sopenharmony_ci return false; 31748c2ecf20Sopenharmony_ci 31758c2ecf20Sopenharmony_ci return true; 31768c2ecf20Sopenharmony_ci} 31778c2ecf20Sopenharmony_ci 31788c2ecf20Sopenharmony_ciSYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 31798c2ecf20Sopenharmony_ci{ 31808c2ecf20Sopenharmony_ci struct swap_info_struct *p; 31818c2ecf20Sopenharmony_ci struct filename *name; 31828c2ecf20Sopenharmony_ci struct file *swap_file = NULL; 31838c2ecf20Sopenharmony_ci struct address_space *mapping; 31848c2ecf20Sopenharmony_ci int prio; 31858c2ecf20Sopenharmony_ci int error; 31868c2ecf20Sopenharmony_ci union swap_header *swap_header; 31878c2ecf20Sopenharmony_ci int nr_extents; 31888c2ecf20Sopenharmony_ci sector_t span; 31898c2ecf20Sopenharmony_ci unsigned long maxpages; 31908c2ecf20Sopenharmony_ci unsigned char *swap_map = NULL; 31918c2ecf20Sopenharmony_ci struct swap_cluster_info *cluster_info = NULL; 31928c2ecf20Sopenharmony_ci unsigned long *frontswap_map = NULL; 31938c2ecf20Sopenharmony_ci struct page *page = NULL; 31948c2ecf20Sopenharmony_ci struct inode *inode = NULL; 31958c2ecf20Sopenharmony_ci bool inced_nr_rotate_swap = false; 31968c2ecf20Sopenharmony_ci 31978c2ecf20Sopenharmony_ci if (swap_flags & ~SWAP_FLAGS_VALID) 31988c2ecf20Sopenharmony_ci return -EINVAL; 31998c2ecf20Sopenharmony_ci 32008c2ecf20Sopenharmony_ci if (!capable(CAP_SYS_ADMIN)) 32018c2ecf20Sopenharmony_ci return -EPERM; 32028c2ecf20Sopenharmony_ci 32038c2ecf20Sopenharmony_ci if (!swap_avail_heads) 32048c2ecf20Sopenharmony_ci return -ENOMEM; 32058c2ecf20Sopenharmony_ci 32068c2ecf20Sopenharmony_ci p = alloc_swap_info(); 32078c2ecf20Sopenharmony_ci if (IS_ERR(p)) 
32088c2ecf20Sopenharmony_ci return PTR_ERR(p); 32098c2ecf20Sopenharmony_ci 32108c2ecf20Sopenharmony_ci INIT_WORK(&p->discard_work, swap_discard_work); 32118c2ecf20Sopenharmony_ci 32128c2ecf20Sopenharmony_ci name = getname(specialfile); 32138c2ecf20Sopenharmony_ci if (IS_ERR(name)) { 32148c2ecf20Sopenharmony_ci error = PTR_ERR(name); 32158c2ecf20Sopenharmony_ci name = NULL; 32168c2ecf20Sopenharmony_ci goto bad_swap; 32178c2ecf20Sopenharmony_ci } 32188c2ecf20Sopenharmony_ci swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); 32198c2ecf20Sopenharmony_ci if (IS_ERR(swap_file)) { 32208c2ecf20Sopenharmony_ci error = PTR_ERR(swap_file); 32218c2ecf20Sopenharmony_ci swap_file = NULL; 32228c2ecf20Sopenharmony_ci goto bad_swap; 32238c2ecf20Sopenharmony_ci } 32248c2ecf20Sopenharmony_ci 32258c2ecf20Sopenharmony_ci p->swap_file = swap_file; 32268c2ecf20Sopenharmony_ci mapping = swap_file->f_mapping; 32278c2ecf20Sopenharmony_ci inode = mapping->host; 32288c2ecf20Sopenharmony_ci 32298c2ecf20Sopenharmony_ci error = claim_swapfile(p, inode); 32308c2ecf20Sopenharmony_ci if (unlikely(error)) 32318c2ecf20Sopenharmony_ci goto bad_swap; 32328c2ecf20Sopenharmony_ci 32338c2ecf20Sopenharmony_ci inode_lock(inode); 32348c2ecf20Sopenharmony_ci if (IS_SWAPFILE(inode)) { 32358c2ecf20Sopenharmony_ci error = -EBUSY; 32368c2ecf20Sopenharmony_ci goto bad_swap_unlock_inode; 32378c2ecf20Sopenharmony_ci } 32388c2ecf20Sopenharmony_ci 32398c2ecf20Sopenharmony_ci /* 32408c2ecf20Sopenharmony_ci * Read the swap header. 
32418c2ecf20Sopenharmony_ci */ 32428c2ecf20Sopenharmony_ci if (!mapping->a_ops->readpage) { 32438c2ecf20Sopenharmony_ci error = -EINVAL; 32448c2ecf20Sopenharmony_ci goto bad_swap_unlock_inode; 32458c2ecf20Sopenharmony_ci } 32468c2ecf20Sopenharmony_ci page = read_mapping_page(mapping, 0, swap_file); 32478c2ecf20Sopenharmony_ci if (IS_ERR(page)) { 32488c2ecf20Sopenharmony_ci error = PTR_ERR(page); 32498c2ecf20Sopenharmony_ci goto bad_swap_unlock_inode; 32508c2ecf20Sopenharmony_ci } 32518c2ecf20Sopenharmony_ci swap_header = kmap(page); 32528c2ecf20Sopenharmony_ci 32538c2ecf20Sopenharmony_ci maxpages = read_swap_header(p, swap_header, inode); 32548c2ecf20Sopenharmony_ci if (unlikely(!maxpages)) { 32558c2ecf20Sopenharmony_ci error = -EINVAL; 32568c2ecf20Sopenharmony_ci goto bad_swap_unlock_inode; 32578c2ecf20Sopenharmony_ci } 32588c2ecf20Sopenharmony_ci 32598c2ecf20Sopenharmony_ci /* OK, set up the swap map and apply the bad block list */ 32608c2ecf20Sopenharmony_ci swap_map = vzalloc(maxpages); 32618c2ecf20Sopenharmony_ci if (!swap_map) { 32628c2ecf20Sopenharmony_ci error = -ENOMEM; 32638c2ecf20Sopenharmony_ci goto bad_swap_unlock_inode; 32648c2ecf20Sopenharmony_ci } 32658c2ecf20Sopenharmony_ci 32668c2ecf20Sopenharmony_ci if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue)) 32678c2ecf20Sopenharmony_ci p->flags |= SWP_STABLE_WRITES; 32688c2ecf20Sopenharmony_ci 32698c2ecf20Sopenharmony_ci if (p->bdev && p->bdev->bd_disk->fops->rw_page) 32708c2ecf20Sopenharmony_ci p->flags |= SWP_SYNCHRONOUS_IO; 32718c2ecf20Sopenharmony_ci 32728c2ecf20Sopenharmony_ci if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { 32738c2ecf20Sopenharmony_ci int cpu; 32748c2ecf20Sopenharmony_ci unsigned long ci, nr_cluster; 32758c2ecf20Sopenharmony_ci 32768c2ecf20Sopenharmony_ci p->flags |= SWP_SOLIDSTATE; 32778c2ecf20Sopenharmony_ci p->cluster_next_cpu = alloc_percpu(unsigned int); 32788c2ecf20Sopenharmony_ci if (!p->cluster_next_cpu) { 32798c2ecf20Sopenharmony_ci error = -ENOMEM; 
32808c2ecf20Sopenharmony_ci goto bad_swap_unlock_inode; 32818c2ecf20Sopenharmony_ci } 32828c2ecf20Sopenharmony_ci /* 32838c2ecf20Sopenharmony_ci * select a random position to start with to help wear leveling 32848c2ecf20Sopenharmony_ci * SSD 32858c2ecf20Sopenharmony_ci */ 32868c2ecf20Sopenharmony_ci for_each_possible_cpu(cpu) { 32878c2ecf20Sopenharmony_ci per_cpu(*p->cluster_next_cpu, cpu) = 32888c2ecf20Sopenharmony_ci 1 + prandom_u32_max(p->highest_bit); 32898c2ecf20Sopenharmony_ci } 32908c2ecf20Sopenharmony_ci nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); 32918c2ecf20Sopenharmony_ci 32928c2ecf20Sopenharmony_ci cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), 32938c2ecf20Sopenharmony_ci GFP_KERNEL); 32948c2ecf20Sopenharmony_ci if (!cluster_info) { 32958c2ecf20Sopenharmony_ci error = -ENOMEM; 32968c2ecf20Sopenharmony_ci goto bad_swap_unlock_inode; 32978c2ecf20Sopenharmony_ci } 32988c2ecf20Sopenharmony_ci 32998c2ecf20Sopenharmony_ci for (ci = 0; ci < nr_cluster; ci++) 33008c2ecf20Sopenharmony_ci spin_lock_init(&((cluster_info + ci)->lock)); 33018c2ecf20Sopenharmony_ci 33028c2ecf20Sopenharmony_ci p->percpu_cluster = alloc_percpu(struct percpu_cluster); 33038c2ecf20Sopenharmony_ci if (!p->percpu_cluster) { 33048c2ecf20Sopenharmony_ci error = -ENOMEM; 33058c2ecf20Sopenharmony_ci goto bad_swap_unlock_inode; 33068c2ecf20Sopenharmony_ci } 33078c2ecf20Sopenharmony_ci for_each_possible_cpu(cpu) { 33088c2ecf20Sopenharmony_ci struct percpu_cluster *cluster; 33098c2ecf20Sopenharmony_ci cluster = per_cpu_ptr(p->percpu_cluster, cpu); 33108c2ecf20Sopenharmony_ci cluster_set_null(&cluster->index); 33118c2ecf20Sopenharmony_ci } 33128c2ecf20Sopenharmony_ci } else { 33138c2ecf20Sopenharmony_ci atomic_inc(&nr_rotate_swap); 33148c2ecf20Sopenharmony_ci inced_nr_rotate_swap = true; 33158c2ecf20Sopenharmony_ci } 33168c2ecf20Sopenharmony_ci 33178c2ecf20Sopenharmony_ci error = swap_cgroup_swapon(p->type, maxpages); 33188c2ecf20Sopenharmony_ci if (error) 
33198c2ecf20Sopenharmony_ci goto bad_swap_unlock_inode; 33208c2ecf20Sopenharmony_ci 33218c2ecf20Sopenharmony_ci nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, 33228c2ecf20Sopenharmony_ci cluster_info, maxpages, &span); 33238c2ecf20Sopenharmony_ci if (unlikely(nr_extents < 0)) { 33248c2ecf20Sopenharmony_ci error = nr_extents; 33258c2ecf20Sopenharmony_ci goto bad_swap_unlock_inode; 33268c2ecf20Sopenharmony_ci } 33278c2ecf20Sopenharmony_ci /* frontswap enabled? set up bit-per-page map for frontswap */ 33288c2ecf20Sopenharmony_ci if (IS_ENABLED(CONFIG_FRONTSWAP)) 33298c2ecf20Sopenharmony_ci frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages), 33308c2ecf20Sopenharmony_ci sizeof(long), 33318c2ecf20Sopenharmony_ci GFP_KERNEL); 33328c2ecf20Sopenharmony_ci 33338c2ecf20Sopenharmony_ci if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { 33348c2ecf20Sopenharmony_ci /* 33358c2ecf20Sopenharmony_ci * When discard is enabled for swap with no particular 33368c2ecf20Sopenharmony_ci * policy flagged, we set all swap discard flags here in 33378c2ecf20Sopenharmony_ci * order to sustain backward compatibility with older 33388c2ecf20Sopenharmony_ci * swapon(8) releases. 33398c2ecf20Sopenharmony_ci */ 33408c2ecf20Sopenharmony_ci p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | 33418c2ecf20Sopenharmony_ci SWP_PAGE_DISCARD); 33428c2ecf20Sopenharmony_ci 33438c2ecf20Sopenharmony_ci /* 33448c2ecf20Sopenharmony_ci * By flagging sys_swapon, a sysadmin can tell us to 33458c2ecf20Sopenharmony_ci * either do single-time area discards only, or to just 33468c2ecf20Sopenharmony_ci * perform discards for released swap page-clusters. 33478c2ecf20Sopenharmony_ci * Now it's time to adjust the p->flags accordingly. 
33488c2ecf20Sopenharmony_ci */ 33498c2ecf20Sopenharmony_ci if (swap_flags & SWAP_FLAG_DISCARD_ONCE) 33508c2ecf20Sopenharmony_ci p->flags &= ~SWP_PAGE_DISCARD; 33518c2ecf20Sopenharmony_ci else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) 33528c2ecf20Sopenharmony_ci p->flags &= ~SWP_AREA_DISCARD; 33538c2ecf20Sopenharmony_ci 33548c2ecf20Sopenharmony_ci /* issue a swapon-time discard if it's still required */ 33558c2ecf20Sopenharmony_ci if (p->flags & SWP_AREA_DISCARD) { 33568c2ecf20Sopenharmony_ci int err = discard_swap(p); 33578c2ecf20Sopenharmony_ci if (unlikely(err)) 33588c2ecf20Sopenharmony_ci pr_err("swapon: discard_swap(%p): %d\n", 33598c2ecf20Sopenharmony_ci p, err); 33608c2ecf20Sopenharmony_ci } 33618c2ecf20Sopenharmony_ci } 33628c2ecf20Sopenharmony_ci 33638c2ecf20Sopenharmony_ci error = init_swap_address_space(p->type, maxpages); 33648c2ecf20Sopenharmony_ci if (error) 33658c2ecf20Sopenharmony_ci goto bad_swap_unlock_inode; 33668c2ecf20Sopenharmony_ci 33678c2ecf20Sopenharmony_ci /* 33688c2ecf20Sopenharmony_ci * Flush any pending IO and dirty mappings before we start using this 33698c2ecf20Sopenharmony_ci * swap device. 33708c2ecf20Sopenharmony_ci */ 33718c2ecf20Sopenharmony_ci inode->i_flags |= S_SWAPFILE; 33728c2ecf20Sopenharmony_ci error = inode_drain_writes(inode); 33738c2ecf20Sopenharmony_ci if (error) { 33748c2ecf20Sopenharmony_ci inode->i_flags &= ~S_SWAPFILE; 33758c2ecf20Sopenharmony_ci goto free_swap_address_space; 33768c2ecf20Sopenharmony_ci } 33778c2ecf20Sopenharmony_ci 33788c2ecf20Sopenharmony_ci mutex_lock(&swapon_mutex); 33798c2ecf20Sopenharmony_ci prio = -1; 33808c2ecf20Sopenharmony_ci if (swap_flags & SWAP_FLAG_PREFER) 33818c2ecf20Sopenharmony_ci prio = 33828c2ecf20Sopenharmony_ci (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 33838c2ecf20Sopenharmony_ci enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); 33848c2ecf20Sopenharmony_ci 33858c2ecf20Sopenharmony_ci pr_info("Adding %uk swap on %s. 
Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", 33868c2ecf20Sopenharmony_ci p->pages<<(PAGE_SHIFT-10), name->name, p->prio, 33878c2ecf20Sopenharmony_ci nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 33888c2ecf20Sopenharmony_ci (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 33898c2ecf20Sopenharmony_ci (p->flags & SWP_DISCARDABLE) ? "D" : "", 33908c2ecf20Sopenharmony_ci (p->flags & SWP_AREA_DISCARD) ? "s" : "", 33918c2ecf20Sopenharmony_ci (p->flags & SWP_PAGE_DISCARD) ? "c" : "", 33928c2ecf20Sopenharmony_ci (frontswap_map) ? "FS" : ""); 33938c2ecf20Sopenharmony_ci 33948c2ecf20Sopenharmony_ci mutex_unlock(&swapon_mutex); 33958c2ecf20Sopenharmony_ci atomic_inc(&proc_poll_event); 33968c2ecf20Sopenharmony_ci wake_up_interruptible(&proc_poll_wait); 33978c2ecf20Sopenharmony_ci 33988c2ecf20Sopenharmony_ci error = 0; 33998c2ecf20Sopenharmony_ci goto out; 34008c2ecf20Sopenharmony_cifree_swap_address_space: 34018c2ecf20Sopenharmony_ci exit_swap_address_space(p->type); 34028c2ecf20Sopenharmony_cibad_swap_unlock_inode: 34038c2ecf20Sopenharmony_ci inode_unlock(inode); 34048c2ecf20Sopenharmony_cibad_swap: 34058c2ecf20Sopenharmony_ci free_percpu(p->percpu_cluster); 34068c2ecf20Sopenharmony_ci p->percpu_cluster = NULL; 34078c2ecf20Sopenharmony_ci free_percpu(p->cluster_next_cpu); 34088c2ecf20Sopenharmony_ci p->cluster_next_cpu = NULL; 34098c2ecf20Sopenharmony_ci if (inode && S_ISBLK(inode->i_mode) && p->bdev) { 34108c2ecf20Sopenharmony_ci set_blocksize(p->bdev, p->old_block_size); 34118c2ecf20Sopenharmony_ci blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 34128c2ecf20Sopenharmony_ci } 34138c2ecf20Sopenharmony_ci inode = NULL; 34148c2ecf20Sopenharmony_ci destroy_swap_extents(p); 34158c2ecf20Sopenharmony_ci swap_cgroup_swapoff(p->type); 34168c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 34178c2ecf20Sopenharmony_ci p->swap_file = NULL; 34188c2ecf20Sopenharmony_ci p->flags = 0; 34198c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 34208c2ecf20Sopenharmony_ci 
vfree(swap_map); 34218c2ecf20Sopenharmony_ci kvfree(cluster_info); 34228c2ecf20Sopenharmony_ci kvfree(frontswap_map); 34238c2ecf20Sopenharmony_ci if (inced_nr_rotate_swap) 34248c2ecf20Sopenharmony_ci atomic_dec(&nr_rotate_swap); 34258c2ecf20Sopenharmony_ci if (swap_file) 34268c2ecf20Sopenharmony_ci filp_close(swap_file, NULL); 34278c2ecf20Sopenharmony_ciout: 34288c2ecf20Sopenharmony_ci if (page && !IS_ERR(page)) { 34298c2ecf20Sopenharmony_ci kunmap(page); 34308c2ecf20Sopenharmony_ci put_page(page); 34318c2ecf20Sopenharmony_ci } 34328c2ecf20Sopenharmony_ci if (name) 34338c2ecf20Sopenharmony_ci putname(name); 34348c2ecf20Sopenharmony_ci if (inode) 34358c2ecf20Sopenharmony_ci inode_unlock(inode); 34368c2ecf20Sopenharmony_ci if (!error) 34378c2ecf20Sopenharmony_ci enable_swap_slots_cache(); 34388c2ecf20Sopenharmony_ci return error; 34398c2ecf20Sopenharmony_ci} 34408c2ecf20Sopenharmony_ci 34418c2ecf20Sopenharmony_civoid si_swapinfo(struct sysinfo *val) 34428c2ecf20Sopenharmony_ci{ 34438c2ecf20Sopenharmony_ci unsigned int type; 34448c2ecf20Sopenharmony_ci unsigned long nr_to_be_unused = 0; 34458c2ecf20Sopenharmony_ci 34468c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 34478c2ecf20Sopenharmony_ci for (type = 0; type < nr_swapfiles; type++) { 34488c2ecf20Sopenharmony_ci struct swap_info_struct *si = swap_info[type]; 34498c2ecf20Sopenharmony_ci 34508c2ecf20Sopenharmony_ci if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) 34518c2ecf20Sopenharmony_ci nr_to_be_unused += si->inuse_pages; 34528c2ecf20Sopenharmony_ci } 34538c2ecf20Sopenharmony_ci val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; 34548c2ecf20Sopenharmony_ci val->totalswap = total_swap_pages + nr_to_be_unused; 34558c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 34568c2ecf20Sopenharmony_ci} 34578c2ecf20Sopenharmony_ci 34588c2ecf20Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_ZSWAPD 34598c2ecf20Sopenharmony_cibool free_swap_is_low(void) 34608c2ecf20Sopenharmony_ci{ 34618c2ecf20Sopenharmony_ci 
unsigned int type; 34628c2ecf20Sopenharmony_ci unsigned long long freeswap = 0; 34638c2ecf20Sopenharmony_ci unsigned long nr_to_be_unused = 0; 34648c2ecf20Sopenharmony_ci 34658c2ecf20Sopenharmony_ci spin_lock(&swap_lock); 34668c2ecf20Sopenharmony_ci for (type = 0; type < nr_swapfiles; type++) { 34678c2ecf20Sopenharmony_ci struct swap_info_struct *si = swap_info[type]; 34688c2ecf20Sopenharmony_ci 34698c2ecf20Sopenharmony_ci if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) 34708c2ecf20Sopenharmony_ci nr_to_be_unused += si->inuse_pages; 34718c2ecf20Sopenharmony_ci } 34728c2ecf20Sopenharmony_ci freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; 34738c2ecf20Sopenharmony_ci spin_unlock(&swap_lock); 34748c2ecf20Sopenharmony_ci 34758c2ecf20Sopenharmony_ci return (freeswap < get_free_swap_threshold()); 34768c2ecf20Sopenharmony_ci} 34778c2ecf20Sopenharmony_ciEXPORT_SYMBOL(free_swap_is_low); 34788c2ecf20Sopenharmony_ci#endif 34798c2ecf20Sopenharmony_ci 34808c2ecf20Sopenharmony_ci/* 34818c2ecf20Sopenharmony_ci * Verify that a swap entry is valid and increment its swap map count. 34828c2ecf20Sopenharmony_ci * 34838c2ecf20Sopenharmony_ci * Returns error code in following case. 34848c2ecf20Sopenharmony_ci * - success -> 0 34858c2ecf20Sopenharmony_ci * - swp_entry is invalid -> EINVAL 34868c2ecf20Sopenharmony_ci * - swp_entry is migration entry -> EINVAL 34878c2ecf20Sopenharmony_ci * - swap-cache reference is requested but there is already one. -> EEXIST 34888c2ecf20Sopenharmony_ci * - swap-cache reference is requested but the entry is not used. -> ENOENT 34898c2ecf20Sopenharmony_ci * - swap-mapped reference requested but needs continued swap count. 
-> ENOMEM 34908c2ecf20Sopenharmony_ci */ 34918c2ecf20Sopenharmony_cistatic int __swap_duplicate(swp_entry_t entry, unsigned char usage) 34928c2ecf20Sopenharmony_ci{ 34938c2ecf20Sopenharmony_ci struct swap_info_struct *p; 34948c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 34958c2ecf20Sopenharmony_ci unsigned long offset; 34968c2ecf20Sopenharmony_ci unsigned char count; 34978c2ecf20Sopenharmony_ci unsigned char has_cache; 34988c2ecf20Sopenharmony_ci int err = -EINVAL; 34998c2ecf20Sopenharmony_ci 35008c2ecf20Sopenharmony_ci p = get_swap_device(entry); 35018c2ecf20Sopenharmony_ci if (!p) 35028c2ecf20Sopenharmony_ci goto out; 35038c2ecf20Sopenharmony_ci 35048c2ecf20Sopenharmony_ci offset = swp_offset(entry); 35058c2ecf20Sopenharmony_ci ci = lock_cluster_or_swap_info(p, offset); 35068c2ecf20Sopenharmony_ci 35078c2ecf20Sopenharmony_ci count = p->swap_map[offset]; 35088c2ecf20Sopenharmony_ci 35098c2ecf20Sopenharmony_ci /* 35108c2ecf20Sopenharmony_ci * swapin_readahead() doesn't check if a swap entry is valid, so the 35118c2ecf20Sopenharmony_ci * swap entry could be SWAP_MAP_BAD. Check here with lock held. 
35128c2ecf20Sopenharmony_ci */ 35138c2ecf20Sopenharmony_ci if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { 35148c2ecf20Sopenharmony_ci err = -ENOENT; 35158c2ecf20Sopenharmony_ci goto unlock_out; 35168c2ecf20Sopenharmony_ci } 35178c2ecf20Sopenharmony_ci 35188c2ecf20Sopenharmony_ci has_cache = count & SWAP_HAS_CACHE; 35198c2ecf20Sopenharmony_ci count &= ~SWAP_HAS_CACHE; 35208c2ecf20Sopenharmony_ci err = 0; 35218c2ecf20Sopenharmony_ci 35228c2ecf20Sopenharmony_ci if (usage == SWAP_HAS_CACHE) { 35238c2ecf20Sopenharmony_ci 35248c2ecf20Sopenharmony_ci /* set SWAP_HAS_CACHE if there is no cache and entry is used */ 35258c2ecf20Sopenharmony_ci if (!has_cache && count) 35268c2ecf20Sopenharmony_ci has_cache = SWAP_HAS_CACHE; 35278c2ecf20Sopenharmony_ci else if (has_cache) /* someone else added cache */ 35288c2ecf20Sopenharmony_ci err = -EEXIST; 35298c2ecf20Sopenharmony_ci else /* no users remaining */ 35308c2ecf20Sopenharmony_ci err = -ENOENT; 35318c2ecf20Sopenharmony_ci 35328c2ecf20Sopenharmony_ci } else if (count || has_cache) { 35338c2ecf20Sopenharmony_ci 35348c2ecf20Sopenharmony_ci if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) 35358c2ecf20Sopenharmony_ci count += usage; 35368c2ecf20Sopenharmony_ci else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) 35378c2ecf20Sopenharmony_ci err = -EINVAL; 35388c2ecf20Sopenharmony_ci else if (swap_count_continued(p, offset, count)) 35398c2ecf20Sopenharmony_ci count = COUNT_CONTINUED; 35408c2ecf20Sopenharmony_ci else 35418c2ecf20Sopenharmony_ci err = -ENOMEM; 35428c2ecf20Sopenharmony_ci } else 35438c2ecf20Sopenharmony_ci err = -ENOENT; /* unused swap entry */ 35448c2ecf20Sopenharmony_ci 35458c2ecf20Sopenharmony_ci WRITE_ONCE(p->swap_map[offset], count | has_cache); 35468c2ecf20Sopenharmony_ci 35478c2ecf20Sopenharmony_ciunlock_out: 35488c2ecf20Sopenharmony_ci unlock_cluster_or_swap_info(p, ci); 35498c2ecf20Sopenharmony_ciout: 35508c2ecf20Sopenharmony_ci if (p) 35518c2ecf20Sopenharmony_ci put_swap_device(p); 35528c2ecf20Sopenharmony_ci 
return err; 35538c2ecf20Sopenharmony_ci} 35548c2ecf20Sopenharmony_ci 35558c2ecf20Sopenharmony_ci/* 35568c2ecf20Sopenharmony_ci * Help swapoff by noting that swap entry belongs to shmem/tmpfs 35578c2ecf20Sopenharmony_ci * (in which case its reference count is never incremented). 35588c2ecf20Sopenharmony_ci */ 35598c2ecf20Sopenharmony_civoid swap_shmem_alloc(swp_entry_t entry) 35608c2ecf20Sopenharmony_ci{ 35618c2ecf20Sopenharmony_ci __swap_duplicate(entry, SWAP_MAP_SHMEM); 35628c2ecf20Sopenharmony_ci} 35638c2ecf20Sopenharmony_ci 35648c2ecf20Sopenharmony_ci/* 35658c2ecf20Sopenharmony_ci * Increase reference count of swap entry by 1. 35668c2ecf20Sopenharmony_ci * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required 35678c2ecf20Sopenharmony_ci * but could not be atomically allocated. Returns 0, just as if it succeeded, 35688c2ecf20Sopenharmony_ci * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which 35698c2ecf20Sopenharmony_ci * might occur if a page table entry has got corrupted. 35708c2ecf20Sopenharmony_ci */ 35718c2ecf20Sopenharmony_ciint swap_duplicate(swp_entry_t entry) 35728c2ecf20Sopenharmony_ci{ 35738c2ecf20Sopenharmony_ci int err = 0; 35748c2ecf20Sopenharmony_ci 35758c2ecf20Sopenharmony_ci while (!err && __swap_duplicate(entry, 1) == -ENOMEM) 35768c2ecf20Sopenharmony_ci err = add_swap_count_continuation(entry, GFP_ATOMIC); 35778c2ecf20Sopenharmony_ci return err; 35788c2ecf20Sopenharmony_ci} 35798c2ecf20Sopenharmony_ci 35808c2ecf20Sopenharmony_ci/* 35818c2ecf20Sopenharmony_ci * @entry: swap entry for which we allocate swap cache. 35828c2ecf20Sopenharmony_ci * 35838c2ecf20Sopenharmony_ci * Called when allocating swap cache for existing swap entry, 35848c2ecf20Sopenharmony_ci * This can return error codes. Returns 0 at success. 35858c2ecf20Sopenharmony_ci * -EEXIST means there is a swap cache. 35868c2ecf20Sopenharmony_ci * Note: return code is different from swap_duplicate(). 
35878c2ecf20Sopenharmony_ci */ 35888c2ecf20Sopenharmony_ciint swapcache_prepare(swp_entry_t entry) 35898c2ecf20Sopenharmony_ci{ 35908c2ecf20Sopenharmony_ci return __swap_duplicate(entry, SWAP_HAS_CACHE); 35918c2ecf20Sopenharmony_ci} 35928c2ecf20Sopenharmony_ci 35938c2ecf20Sopenharmony_cistruct swap_info_struct *swp_swap_info(swp_entry_t entry) 35948c2ecf20Sopenharmony_ci{ 35958c2ecf20Sopenharmony_ci return swap_type_to_swap_info(swp_type(entry)); 35968c2ecf20Sopenharmony_ci} 35978c2ecf20Sopenharmony_ci 35988c2ecf20Sopenharmony_cistruct swap_info_struct *page_swap_info(struct page *page) 35998c2ecf20Sopenharmony_ci{ 36008c2ecf20Sopenharmony_ci swp_entry_t entry = { .val = page_private(page) }; 36018c2ecf20Sopenharmony_ci return swp_swap_info(entry); 36028c2ecf20Sopenharmony_ci} 36038c2ecf20Sopenharmony_ci 36048c2ecf20Sopenharmony_ci/* 36058c2ecf20Sopenharmony_ci * out-of-line __page_file_ methods to avoid include hell. 36068c2ecf20Sopenharmony_ci */ 36078c2ecf20Sopenharmony_cistruct address_space *__page_file_mapping(struct page *page) 36088c2ecf20Sopenharmony_ci{ 36098c2ecf20Sopenharmony_ci return page_swap_info(page)->swap_file->f_mapping; 36108c2ecf20Sopenharmony_ci} 36118c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(__page_file_mapping); 36128c2ecf20Sopenharmony_ci 36138c2ecf20Sopenharmony_cipgoff_t __page_file_index(struct page *page) 36148c2ecf20Sopenharmony_ci{ 36158c2ecf20Sopenharmony_ci swp_entry_t swap = { .val = page_private(page) }; 36168c2ecf20Sopenharmony_ci return swp_offset(swap); 36178c2ecf20Sopenharmony_ci} 36188c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(__page_file_index); 36198c2ecf20Sopenharmony_ci 36208c2ecf20Sopenharmony_ci/* 36218c2ecf20Sopenharmony_ci * add_swap_count_continuation - called when a swap count is duplicated 36228c2ecf20Sopenharmony_ci * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 36238c2ecf20Sopenharmony_ci * page of the original vmalloc'ed swap_map, to hold the continuation count 
36248c2ecf20Sopenharmony_ci * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called 36258c2ecf20Sopenharmony_ci * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. 36268c2ecf20Sopenharmony_ci * 36278c2ecf20Sopenharmony_ci * These continuation pages are seldom referenced: the common paths all work 36288c2ecf20Sopenharmony_ci * on the original swap_map, only referring to a continuation page when the 36298c2ecf20Sopenharmony_ci * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. 36308c2ecf20Sopenharmony_ci * 36318c2ecf20Sopenharmony_ci * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding 36328c2ecf20Sopenharmony_ci * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) 36338c2ecf20Sopenharmony_ci * can be called after dropping locks. 36348c2ecf20Sopenharmony_ci */ 36358c2ecf20Sopenharmony_ciint add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) 36368c2ecf20Sopenharmony_ci{ 36378c2ecf20Sopenharmony_ci struct swap_info_struct *si; 36388c2ecf20Sopenharmony_ci struct swap_cluster_info *ci; 36398c2ecf20Sopenharmony_ci struct page *head; 36408c2ecf20Sopenharmony_ci struct page *page; 36418c2ecf20Sopenharmony_ci struct page *list_page; 36428c2ecf20Sopenharmony_ci pgoff_t offset; 36438c2ecf20Sopenharmony_ci unsigned char count; 36448c2ecf20Sopenharmony_ci int ret = 0; 36458c2ecf20Sopenharmony_ci 36468c2ecf20Sopenharmony_ci /* 36478c2ecf20Sopenharmony_ci * When debugging, it's easier to use __GFP_ZERO here; but it's better 36488c2ecf20Sopenharmony_ci * for latency not to zero a page while GFP_ATOMIC and holding locks. 
36498c2ecf20Sopenharmony_ci */ 36508c2ecf20Sopenharmony_ci page = alloc_page(gfp_mask | __GFP_HIGHMEM); 36518c2ecf20Sopenharmony_ci 36528c2ecf20Sopenharmony_ci si = get_swap_device(entry); 36538c2ecf20Sopenharmony_ci if (!si) { 36548c2ecf20Sopenharmony_ci /* 36558c2ecf20Sopenharmony_ci * An acceptable race has occurred since the failing 36568c2ecf20Sopenharmony_ci * __swap_duplicate(): the swap device may be swapoff 36578c2ecf20Sopenharmony_ci */ 36588c2ecf20Sopenharmony_ci goto outer; 36598c2ecf20Sopenharmony_ci } 36608c2ecf20Sopenharmony_ci spin_lock(&si->lock); 36618c2ecf20Sopenharmony_ci 36628c2ecf20Sopenharmony_ci offset = swp_offset(entry); 36638c2ecf20Sopenharmony_ci 36648c2ecf20Sopenharmony_ci ci = lock_cluster(si, offset); 36658c2ecf20Sopenharmony_ci 36668c2ecf20Sopenharmony_ci count = si->swap_map[offset] & ~SWAP_HAS_CACHE; 36678c2ecf20Sopenharmony_ci 36688c2ecf20Sopenharmony_ci if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { 36698c2ecf20Sopenharmony_ci /* 36708c2ecf20Sopenharmony_ci * The higher the swap count, the more likely it is that tasks 36718c2ecf20Sopenharmony_ci * will race to add swap count continuation: we need to avoid 36728c2ecf20Sopenharmony_ci * over-provisioning. 36738c2ecf20Sopenharmony_ci */ 36748c2ecf20Sopenharmony_ci goto out; 36758c2ecf20Sopenharmony_ci } 36768c2ecf20Sopenharmony_ci 36778c2ecf20Sopenharmony_ci if (!page) { 36788c2ecf20Sopenharmony_ci ret = -ENOMEM; 36798c2ecf20Sopenharmony_ci goto out; 36808c2ecf20Sopenharmony_ci } 36818c2ecf20Sopenharmony_ci 36828c2ecf20Sopenharmony_ci /* 36838c2ecf20Sopenharmony_ci * We are fortunate that although vmalloc_to_page uses pte_offset_map, 36848c2ecf20Sopenharmony_ci * no architecture is using highmem pages for kernel page tables: so it 36858c2ecf20Sopenharmony_ci * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. 
36868c2ecf20Sopenharmony_ci */ 36878c2ecf20Sopenharmony_ci head = vmalloc_to_page(si->swap_map + offset); 36888c2ecf20Sopenharmony_ci offset &= ~PAGE_MASK; 36898c2ecf20Sopenharmony_ci 36908c2ecf20Sopenharmony_ci spin_lock(&si->cont_lock); 36918c2ecf20Sopenharmony_ci /* 36928c2ecf20Sopenharmony_ci * Page allocation does not initialize the page's lru field, 36938c2ecf20Sopenharmony_ci * but it does always reset its private field. 36948c2ecf20Sopenharmony_ci */ 36958c2ecf20Sopenharmony_ci if (!page_private(head)) { 36968c2ecf20Sopenharmony_ci BUG_ON(count & COUNT_CONTINUED); 36978c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&head->lru); 36988c2ecf20Sopenharmony_ci set_page_private(head, SWP_CONTINUED); 36998c2ecf20Sopenharmony_ci si->flags |= SWP_CONTINUED; 37008c2ecf20Sopenharmony_ci } 37018c2ecf20Sopenharmony_ci 37028c2ecf20Sopenharmony_ci list_for_each_entry(list_page, &head->lru, lru) { 37038c2ecf20Sopenharmony_ci unsigned char *map; 37048c2ecf20Sopenharmony_ci 37058c2ecf20Sopenharmony_ci /* 37068c2ecf20Sopenharmony_ci * If the previous map said no continuation, but we've found 37078c2ecf20Sopenharmony_ci * a continuation page, free our allocation and use this one. 37088c2ecf20Sopenharmony_ci */ 37098c2ecf20Sopenharmony_ci if (!(count & COUNT_CONTINUED)) 37108c2ecf20Sopenharmony_ci goto out_unlock_cont; 37118c2ecf20Sopenharmony_ci 37128c2ecf20Sopenharmony_ci map = kmap_atomic(list_page) + offset; 37138c2ecf20Sopenharmony_ci count = *map; 37148c2ecf20Sopenharmony_ci kunmap_atomic(map); 37158c2ecf20Sopenharmony_ci 37168c2ecf20Sopenharmony_ci /* 37178c2ecf20Sopenharmony_ci * If this continuation count now has some space in it, 37188c2ecf20Sopenharmony_ci * free our allocation and use this one. 
37198c2ecf20Sopenharmony_ci */ 37208c2ecf20Sopenharmony_ci if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) 37218c2ecf20Sopenharmony_ci goto out_unlock_cont; 37228c2ecf20Sopenharmony_ci } 37238c2ecf20Sopenharmony_ci 37248c2ecf20Sopenharmony_ci list_add_tail(&page->lru, &head->lru); 37258c2ecf20Sopenharmony_ci page = NULL; /* now it's attached, don't free it */ 37268c2ecf20Sopenharmony_ciout_unlock_cont: 37278c2ecf20Sopenharmony_ci spin_unlock(&si->cont_lock); 37288c2ecf20Sopenharmony_ciout: 37298c2ecf20Sopenharmony_ci unlock_cluster(ci); 37308c2ecf20Sopenharmony_ci spin_unlock(&si->lock); 37318c2ecf20Sopenharmony_ci put_swap_device(si); 37328c2ecf20Sopenharmony_ciouter: 37338c2ecf20Sopenharmony_ci if (page) 37348c2ecf20Sopenharmony_ci __free_page(page); 37358c2ecf20Sopenharmony_ci return ret; 37368c2ecf20Sopenharmony_ci} 37378c2ecf20Sopenharmony_ci 37388c2ecf20Sopenharmony_ci/* 37398c2ecf20Sopenharmony_ci * swap_count_continued - when the original swap_map count is incremented 37408c2ecf20Sopenharmony_ci * from SWAP_MAP_MAX, check if there is already a continuation page to carry 37418c2ecf20Sopenharmony_ci * into, carry if so, or else fail until a new continuation page is allocated; 37428c2ecf20Sopenharmony_ci * when the original swap_map count is decremented from 0 with continuation, 37438c2ecf20Sopenharmony_ci * borrow from the continuation and report whether it still holds more. 37448c2ecf20Sopenharmony_ci * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster 37458c2ecf20Sopenharmony_ci * lock. 
37468c2ecf20Sopenharmony_ci */ 37478c2ecf20Sopenharmony_cistatic bool swap_count_continued(struct swap_info_struct *si, 37488c2ecf20Sopenharmony_ci pgoff_t offset, unsigned char count) 37498c2ecf20Sopenharmony_ci{ 37508c2ecf20Sopenharmony_ci struct page *head; 37518c2ecf20Sopenharmony_ci struct page *page; 37528c2ecf20Sopenharmony_ci unsigned char *map; 37538c2ecf20Sopenharmony_ci bool ret; 37548c2ecf20Sopenharmony_ci 37558c2ecf20Sopenharmony_ci head = vmalloc_to_page(si->swap_map + offset); 37568c2ecf20Sopenharmony_ci if (page_private(head) != SWP_CONTINUED) { 37578c2ecf20Sopenharmony_ci BUG_ON(count & COUNT_CONTINUED); 37588c2ecf20Sopenharmony_ci return false; /* need to add count continuation */ 37598c2ecf20Sopenharmony_ci } 37608c2ecf20Sopenharmony_ci 37618c2ecf20Sopenharmony_ci spin_lock(&si->cont_lock); 37628c2ecf20Sopenharmony_ci offset &= ~PAGE_MASK; 37638c2ecf20Sopenharmony_ci page = list_next_entry(head, lru); 37648c2ecf20Sopenharmony_ci map = kmap_atomic(page) + offset; 37658c2ecf20Sopenharmony_ci 37668c2ecf20Sopenharmony_ci if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ 37678c2ecf20Sopenharmony_ci goto init_map; /* jump over SWAP_CONT_MAX checks */ 37688c2ecf20Sopenharmony_ci 37698c2ecf20Sopenharmony_ci if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ 37708c2ecf20Sopenharmony_ci /* 37718c2ecf20Sopenharmony_ci * Think of how you add 1 to 999 37728c2ecf20Sopenharmony_ci */ 37738c2ecf20Sopenharmony_ci while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { 37748c2ecf20Sopenharmony_ci kunmap_atomic(map); 37758c2ecf20Sopenharmony_ci page = list_next_entry(page, lru); 37768c2ecf20Sopenharmony_ci BUG_ON(page == head); 37778c2ecf20Sopenharmony_ci map = kmap_atomic(page) + offset; 37788c2ecf20Sopenharmony_ci } 37798c2ecf20Sopenharmony_ci if (*map == SWAP_CONT_MAX) { 37808c2ecf20Sopenharmony_ci kunmap_atomic(map); 37818c2ecf20Sopenharmony_ci page = list_next_entry(page, lru); 37828c2ecf20Sopenharmony_ci if (page == head) { 
37838c2ecf20Sopenharmony_ci ret = false; /* add count continuation */ 37848c2ecf20Sopenharmony_ci goto out; 37858c2ecf20Sopenharmony_ci } 37868c2ecf20Sopenharmony_ci map = kmap_atomic(page) + offset; 37878c2ecf20Sopenharmony_ciinit_map: *map = 0; /* we didn't zero the page */ 37888c2ecf20Sopenharmony_ci } 37898c2ecf20Sopenharmony_ci *map += 1; 37908c2ecf20Sopenharmony_ci kunmap_atomic(map); 37918c2ecf20Sopenharmony_ci while ((page = list_prev_entry(page, lru)) != head) { 37928c2ecf20Sopenharmony_ci map = kmap_atomic(page) + offset; 37938c2ecf20Sopenharmony_ci *map = COUNT_CONTINUED; 37948c2ecf20Sopenharmony_ci kunmap_atomic(map); 37958c2ecf20Sopenharmony_ci } 37968c2ecf20Sopenharmony_ci ret = true; /* incremented */ 37978c2ecf20Sopenharmony_ci 37988c2ecf20Sopenharmony_ci } else { /* decrementing */ 37998c2ecf20Sopenharmony_ci /* 38008c2ecf20Sopenharmony_ci * Think of how you subtract 1 from 1000 38018c2ecf20Sopenharmony_ci */ 38028c2ecf20Sopenharmony_ci BUG_ON(count != COUNT_CONTINUED); 38038c2ecf20Sopenharmony_ci while (*map == COUNT_CONTINUED) { 38048c2ecf20Sopenharmony_ci kunmap_atomic(map); 38058c2ecf20Sopenharmony_ci page = list_next_entry(page, lru); 38068c2ecf20Sopenharmony_ci BUG_ON(page == head); 38078c2ecf20Sopenharmony_ci map = kmap_atomic(page) + offset; 38088c2ecf20Sopenharmony_ci } 38098c2ecf20Sopenharmony_ci BUG_ON(*map == 0); 38108c2ecf20Sopenharmony_ci *map -= 1; 38118c2ecf20Sopenharmony_ci if (*map == 0) 38128c2ecf20Sopenharmony_ci count = 0; 38138c2ecf20Sopenharmony_ci kunmap_atomic(map); 38148c2ecf20Sopenharmony_ci while ((page = list_prev_entry(page, lru)) != head) { 38158c2ecf20Sopenharmony_ci map = kmap_atomic(page) + offset; 38168c2ecf20Sopenharmony_ci *map = SWAP_CONT_MAX | count; 38178c2ecf20Sopenharmony_ci count = COUNT_CONTINUED; 38188c2ecf20Sopenharmony_ci kunmap_atomic(map); 38198c2ecf20Sopenharmony_ci } 38208c2ecf20Sopenharmony_ci ret = count == COUNT_CONTINUED; 38218c2ecf20Sopenharmony_ci } 38228c2ecf20Sopenharmony_ciout: 
38238c2ecf20Sopenharmony_ci spin_unlock(&si->cont_lock); 38248c2ecf20Sopenharmony_ci return ret; 38258c2ecf20Sopenharmony_ci} 38268c2ecf20Sopenharmony_ci 38278c2ecf20Sopenharmony_ci/* 38288c2ecf20Sopenharmony_ci * free_swap_count_continuations - swapoff free all the continuation pages 38298c2ecf20Sopenharmony_ci * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. 38308c2ecf20Sopenharmony_ci */ 38318c2ecf20Sopenharmony_cistatic void free_swap_count_continuations(struct swap_info_struct *si) 38328c2ecf20Sopenharmony_ci{ 38338c2ecf20Sopenharmony_ci pgoff_t offset; 38348c2ecf20Sopenharmony_ci 38358c2ecf20Sopenharmony_ci for (offset = 0; offset < si->max; offset += PAGE_SIZE) { 38368c2ecf20Sopenharmony_ci struct page *head; 38378c2ecf20Sopenharmony_ci head = vmalloc_to_page(si->swap_map + offset); 38388c2ecf20Sopenharmony_ci if (page_private(head)) { 38398c2ecf20Sopenharmony_ci struct page *page, *next; 38408c2ecf20Sopenharmony_ci 38418c2ecf20Sopenharmony_ci list_for_each_entry_safe(page, next, &head->lru, lru) { 38428c2ecf20Sopenharmony_ci list_del(&page->lru); 38438c2ecf20Sopenharmony_ci __free_page(page); 38448c2ecf20Sopenharmony_ci } 38458c2ecf20Sopenharmony_ci } 38468c2ecf20Sopenharmony_ci } 38478c2ecf20Sopenharmony_ci} 38488c2ecf20Sopenharmony_ci 38498c2ecf20Sopenharmony_ci#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) 38508c2ecf20Sopenharmony_civoid cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) 38518c2ecf20Sopenharmony_ci{ 38528c2ecf20Sopenharmony_ci struct swap_info_struct *si, *next; 38538c2ecf20Sopenharmony_ci int nid = page_to_nid(page); 38548c2ecf20Sopenharmony_ci 38558c2ecf20Sopenharmony_ci if (!(gfp_mask & __GFP_IO)) 38568c2ecf20Sopenharmony_ci return; 38578c2ecf20Sopenharmony_ci 38588c2ecf20Sopenharmony_ci if (!blk_cgroup_congested()) 38598c2ecf20Sopenharmony_ci return; 38608c2ecf20Sopenharmony_ci 38618c2ecf20Sopenharmony_ci /* 38628c2ecf20Sopenharmony_ci * We've already scheduled a throttle, avoid taking 
the global swap 38638c2ecf20Sopenharmony_ci * lock. 38648c2ecf20Sopenharmony_ci */ 38658c2ecf20Sopenharmony_ci if (current->throttle_queue) 38668c2ecf20Sopenharmony_ci return; 38678c2ecf20Sopenharmony_ci 38688c2ecf20Sopenharmony_ci spin_lock(&swap_avail_lock); 38698c2ecf20Sopenharmony_ci plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], 38708c2ecf20Sopenharmony_ci avail_lists[nid]) { 38718c2ecf20Sopenharmony_ci if (si->bdev) { 38728c2ecf20Sopenharmony_ci blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); 38738c2ecf20Sopenharmony_ci break; 38748c2ecf20Sopenharmony_ci } 38758c2ecf20Sopenharmony_ci } 38768c2ecf20Sopenharmony_ci spin_unlock(&swap_avail_lock); 38778c2ecf20Sopenharmony_ci} 38788c2ecf20Sopenharmony_ci#endif 38798c2ecf20Sopenharmony_ci 38808c2ecf20Sopenharmony_cistatic int __init swapfile_init(void) 38818c2ecf20Sopenharmony_ci{ 38828c2ecf20Sopenharmony_ci int nid; 38838c2ecf20Sopenharmony_ci 38848c2ecf20Sopenharmony_ci swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), 38858c2ecf20Sopenharmony_ci GFP_KERNEL); 38868c2ecf20Sopenharmony_ci if (!swap_avail_heads) { 38878c2ecf20Sopenharmony_ci pr_emerg("Not enough memory for swap heads, swap is disabled\n"); 38888c2ecf20Sopenharmony_ci return -ENOMEM; 38898c2ecf20Sopenharmony_ci } 38908c2ecf20Sopenharmony_ci 38918c2ecf20Sopenharmony_ci for_each_node(nid) 38928c2ecf20Sopenharmony_ci plist_head_init(&swap_avail_heads[nid]); 38938c2ecf20Sopenharmony_ci 38948c2ecf20Sopenharmony_ci return 0; 38958c2ecf20Sopenharmony_ci} 38968c2ecf20Sopenharmony_cisubsys_initcall(swapfile_init); 3897